# SENTIMENT ANALYSIS 



In [38]:
import pandas as pd 
import nltk# as nlkt
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer


In [39]:
df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [40]:
print(len(df))

50000


In [41]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [42]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [43]:
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

In [44]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [45]:
#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [46]:
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))



In [10]:
print(df['Processed_Reviews'][0])
print(df.head())

one reviewer ha mention watch 1 oz episode hook right exactly happen first thing strike oz wa brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy deal shady agreement never far away would say main appeal show due fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess around first episode ever saw strike nasty wa surreal say wa ready watch develop taste oz get accustom high level graphic violence violence injustice crook guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison experience watch oz may become comfort

## SVM Model

In [47]:
x = df['Processed_Reviews']
y = df['sentiment']

#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)



In [48]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))


#instantiate the model (using the default parameters)
SVM = SVC()



In [49]:
# fit the model with pre-processed data
SVM.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5035
           1       0.86      0.89      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:

joblib.dump(SVM, 'saved_model.pkl')



['saved_model.pkl']

In [None]:


#Creating a Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('SVM', SVC())
])

#Defining hyperparameters
parameters = {
    'vect__max_df':[0.1,0.2,0.3,0.4,0.5,0.6],
    'vect__ngram_range':  [(1,1), (1,2), (1,3)],
    'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    'SVM__C': [50, 10, 1.0, 0.1]}

# define grid search
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit = True, verbose = 3, cv=5)
grid_result = grid_search.fit(df.loc[:700, 'Processed_Reviews'].values.astype('U'), df.loc[:700, 'sentiment'].values.astype('U'))


[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.636 total time=   0.8s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.536 total time=   0.9s
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.557 total time=   0.8s
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.636 total time=   0.9s
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.496 total time=   1.3s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.543 total time=   1.3s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.486 total time=   1.3s
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.514 total time=   1.3s
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.801763 using {'SVM__C': 10, 'SVM__kernel': 'sigmoid', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 3)}
0.553435 (0.028901) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 1)}
0.496454 (0.030794) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 2)}
0.486454 (0.007270) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 3)}
0.591884 (0.047667) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 1)}
0.520729 (0.024766) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 2)}
0.486454 (0.009678) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 3)}
0.629129 (0.044924) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 1)}
0.537842 (0.036823) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect

In [None]:
count_vect = CountVectorizer(ngram_range=(1,2), max_df=0.5).fit(df['Processed_Reviews'].values.astype('U'))

bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))
SVM_optimized = SVC(C= 50,kernel='rbf')

In [None]:

# fit the model with pre-processed data
SVM_optimized.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM2 = SVM_optimized.predict(bow_test)
print(classification_report(y_test, predicted_SVM2))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      5035
           1       0.87      0.90      0.89      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [None]:
joblib.dump(SVM_optimized, 'saved_model_SVMOpt.pkl')



['saved_model_SVMOpt.pkl']

In [None]:
count_vect = CountVectorizer(ngram_range=(1,2), max_df=0.5).fit(df['Processed_Reviews'].values.astype('U'))

bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))
SVM_optimized2 = SVC(C= 50,kernel='sigmoid')

In [None]:
# fit the model with pre-processed data
SVM_optimized2.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM2 = SVM_optimized2.predict(bow_test)
print(classification_report(y_test, predicted_SVM2))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77      5035
           1       0.76      0.77      0.77      4965

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



In [None]:
joblib.dump(SVM_optimized2, 'saved_model_SVMOpt2.pkl')

['saved_model_SVMOpt2.pkl']

In [12]:
count_vect = CountVectorizer(ngram_range=(1,3), max_df=0.2).fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

SVM_optimized3 = SVC(C= 50,kernel='rbf')

In [13]:
SVM_optimized3.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM3 = SVM_optimized3.predict(bow_test)
print(classification_report(y_test, predicted_SVM3))

              precision    recall  f1-score   support

           0       0.90      0.85      0.88      5035
           1       0.86      0.91      0.88      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [14]:
print(classification_report(y_test, predicted_SVM3))

              precision    recall  f1-score   support

           0       0.90      0.85      0.88      5035
           1       0.86      0.91      0.88      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [15]:
joblib.dump(SVM_optimized3, 'saved_model_SVMOpt3.pkl')


['saved_model_SVMOpt3.pkl']

In [16]:
count_vect = CountVectorizer(ngram_range=(1,3), max_df=0.4).fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

SVM_optimized4 = SVC(C= 1,kernel='sigmoid')

In [17]:
SVM_optimized4.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM4 = SVM_optimized4.predict(bow_test)
print(classification_report(y_test, predicted_SVM4))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      5035
           1       0.87      0.88      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [18]:
joblib.dump(SVM_optimized4, 'saved_model_SVMOpt4.pkl')

['saved_model_SVMOpt4.pkl']

## Load models

In [32]:
svm = joblib.load('saved_model.pkl')
#rna = joblib.load('LSTM.pkl')
svmMorOpt = joblib.load('saved_model_SVMOpt.pkl')
svmOpt = joblib.load('saved_model_SVMOpt2.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [50]:
test_1 = ['I would highly recommend this movie to anyone. Its a must-watch!']
test_1 = count_vect.transform(test_1).toarray()


#Printing prediction

print(SVM.predict(test_1))
#print(SVM.predict(test_1))



[1]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7a5b1e8f-5fb8-49ad-8f36-77068147d699' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>