In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from spacy.displacy import render

In [2]:
stop_word = list(STOP_WORDS)
print(stop_word[:10])

['and', 'next', 'your', 'below', 'just', 'wherever', 'otherwise', '‘ll', 'always', 'should']


In [3]:
# load English pre train small model
nlp = spacy.load("en_core_web_sm")

In [4]:
df1 = pd.read_csv("sentiment labelled sentences/amazon_cells_labelled.txt",sep="\t", header=None)
df2 = pd.read_csv("sentiment labelled sentences/imdb_labelled.txt",sep="\t",header=None)
df3 = pd.read_csv("sentiment labelled sentences/yelp_labelled.txt", sep= "\t",header= None)

In [5]:
df1.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
df2.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [7]:
df3.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
# Shape of all three columns

print(df1.shape,df2.shape, df3.shape)

(1000, 2) (748, 2) (1000, 2)


In [9]:
df = pd.concat([df1,df2,df3],axis=0)

In [10]:
df.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


# create final dataset to merge all deta
df = df1.append(df2)
df = df.append(df3)

print("Final shape of dataset",df.shape)

In [11]:
# Rename the column name
df.rename({0:"Review",1:"Sentiment"},axis=1,inplace=True)

In [12]:
# Check Null Values

df.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [13]:
# Now see distribution of classes

df["Sentiment"].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [14]:
# Now Clean the data

def clean_data(text):
    doc = nlp(text)
    list_tokens = []
    clean_tokens = []
    
    for token in doc:
        if token.lemma_ != "-PRON-":
            tem_token = token.lemma_.lower().strip()
        else:
            tem_token = token.lower()
        list_tokens.append(tem_token)
        
    for token in list_tokens:
        if token not in stop_word and token not in punctuation:
            clean_tokens.append(token)
    
    return clean_tokens

In [15]:
clean_data("    Hello how are you. Like this video")

['hello', 'like', 'video']

In [16]:
df["clean_Reviews"] = df["Review"].apply(clean_data)

In [17]:
# now we are doing vectorization

tfidf = TfidfVectorizer(tokenizer= clean_data)

In [18]:
# machine learning model
l_svm = LinearSVC()

In [19]:
# Split data into x and y 
x = df["Review"]
y = df["Sentiment"]

In [20]:
# Now split data into train test split 

X_train,X_test, y_train,y_test = train_test_split(x,y, test_size=0.2,random_state=2023)

In [21]:
X_train.shape,X_test.shape, y_train.shape,y_test.shape

((2198,), (550,), (2198,), (550,))

In [22]:
pipe_line = Pipeline([("tfidf",tfidf),("classifier",l_svm)])

In [23]:
pipe_line.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function clean_data at 0x00000185098E5310>)),
                ('classifier', LinearSVC())])

In [24]:
Pipeline(memory= None,
        steps = [("tfidf",TfidfVectorizer(analyzer="word", binary=False, decode_error="strict",
                                          encoding = "utf-8", input = "content",
                                         lowercase = True, max_df = 1.0, max_features = None, min_df = 1, ngram_range = (1,1),
                                         norm = "l2", preprocessor = None, smooth_idf = True, stop_words = None, strip_accents = None,
                                         sublinear_tf = False, token_pattern = "(?u)\\u\\w\\w+\\b", 
                                         use_idf = True, vocabulary = None)),
                ("clf",LinearSVC(C = 1.0, class_weight=None, dual= True, fit_intercept=True, intercept_scaling=1,
                                loss = "squared_hinge", max_iter=1000, multi_class="ovr", penalty = 'l2',random_state=None,
                                tol=0.0001, verbose=0))],verbose=False)

Pipeline(steps=[('tfidf', TfidfVectorizer(token_pattern='(?u)\\u\\w\\w+\\b')),
                ('clf', LinearSVC())])

In [25]:
y_pred = pipe_line.predict(X_test)

In [26]:
print(accuracy_score(y_test,y_pred))

0.8


In [27]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       262
           1       0.82      0.79      0.80       288

    accuracy                           0.80       550
   macro avg       0.80      0.80      0.80       550
weighted avg       0.80      0.80      0.80       550



In [28]:
pipe_line.predict(["wow, This is amazing not lesson"])

array([1], dtype=int64)

In [29]:
inputs = input("enter sentence")
print(pipe_line.predict([inputs]))

enter sentenceg
[0]


In [30]:
# save model 

In [31]:
import pickle

In [32]:
model = open("final_model.sav","wb")

In [33]:
pickle.dump(pipe_line,model)

In [34]:
model = pickle.load(open("final_model.sav","rb"))

EOFError: Ran out of input

In [None]:
model.predict(["my name is khan"])