In [66]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [67]:
import numbers
import pandas as pd

In [68]:
# Read the labelled movie reviews from the CSV file into a DataFrame and
# display it as the cell result.
reviews_path = 'moviereviews.csv'
data = pd.read_csv(reviews_path)
data

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [69]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

label      0
review    35
dtype: int64

In [70]:
# Discard every row that has a missing value in any column. Rebinding the
# result (instead of mutating with inplace=True) keeps the step explicit and
# safe to re-run.
data = data.dropna()

In [71]:
# Re-count missing values per column to confirm none remain after the drop.
data.isna().sum()

label     0
review    0
dtype: int64

In [72]:
# Flag reviews that consist only of whitespace, then count how many there are.
blank_reviews = data['review'].str.isspace()
blank_reviews.sum()

np.int64(27)

In [73]:
# Same whitespace-only check, applied to the label column.
blank_labels = data['label'].str.isspace()
blank_labels.sum()

np.int64(0)

In [74]:
# Boolean mask: True for every review made up solely of whitespace characters.
index = data['review'].str.isspace()
# Preview the rows that remain once those whitespace-only reviews are excluded.
data.loc[~index]

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [75]:
# Keep only the rows whose review is not whitespace-only. The mask is computed
# here from the current frame rather than reused from an earlier cell, so the
# cell gives the same result when re-run and never applies a stale mask.
data = data[~data['review'].str.isspace()]

In [76]:
# Sanity check: count the whitespace-only reviews still present after filtering.
remaining_blank_reviews = data['review'].str.isspace()
remaining_blank_reviews.sum()

np.int64(0)

In [77]:
# Number of reviews per label, to see how balanced the classes are.
labels = data['label']
labels.value_counts()

label
neg    969
pos    969
Name: count, dtype: int64

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
# Bag-of-words counter that uses scikit-learn's built-in English stop-word list.
cv = CountVectorizer(
    stop_words='english',
)

In [80]:
# Display the vectorizer so its configured parameters can be inspected.
cv

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [81]:
# Select the review text of every row labelled 'neg' in a single .loc lookup.
is_negative = data['label'] == 'neg'
data.loc[is_negative, 'review']

0       how do films like mouse hunt get into theatres...
1       some talented actresses are blessed with a dem...
4       my first press screening of 1998 and already i...
5       to put it bluntly , ed wood would have been pr...
6       synopsis : melissa , a mentally-disturbed woma...
                              ...                        
1985    the real blonde ( r ) a woman's face , an arm ...
1986     * * * the following review contains spoilers ...
1987     " book " should have remained in shadows \r\n...
1991    all right , all right , we get the point : des...
1992    say , tell me if you've seen this before : a c...
Name: review, Length: 969, dtype: object

In [82]:
# Fit the vectorizer on the negative reviews only and build their
# document-term count matrix.
negative_reviews = data.loc[data['label'] == 'neg', 'review']
matrix = cv.fit_transform(negative_reviews)
matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 223948 stored elements and shape (969, 27473)>

In [83]:
# Toy example: zip() pairs up the items of two lists position by position.
i = [100, 200, 300]
j = [111, 222, 333]
for pair in zip(i, j):
    # pair is (item_from_i, item_from_j); print only the second member
    print(pair[1])

111
222
333


In [84]:
# Summing the count matrix over axis 0 gives one total per vocabulary term;
# pair each total with its term name.
term_totals = matrix.sum(axis=0).tolist()[0]
freqs = zip(cv.get_feature_names_out(), term_totals)

print("Top 20 words used for Negative reviews.")
# Order the (term, total) pairs from largest to smallest total and keep the first 20.
print(sorted(freqs, key=lambda pair: pair[1], reverse=True)[:20])

Top 20 words used for Negative reviews.
[('film', 4063), ('movie', 3131), ('like', 1808), ('just', 1480), ('time', 1127), ('good', 1117), ('bad', 997), ('character', 926), ('story', 908), ('plot', 888), ('characters', 838), ('make', 813), ('really', 743), ('way', 734), ('little', 696), ('don', 683), ('does', 666), ('doesn', 648), ('action', 635), ('scene', 634)]


In [85]:
# Refit the same vectorizer on the positive reviews only; `cv` and `matrix`
# now describe the positive subset instead of the negative one.
positive_reviews = data.loc[data['label'] == 'pos', 'review']
matrix = cv.fit_transform(positive_reviews)

# Pair each vocabulary term with its total count (column sums of the matrix).
freqs = zip(cv.get_feature_names_out(), matrix.sum(axis=0).tolist()[0])

# Heading spelling corrected ("Positive").
print("Top 20 words used for Positive reviews.")
# Sort from largest to smallest count and keep the first 20 pairs.
print(sorted(freqs, key=lambda x: -x[1])[:20])

Top 20 words used for Poseteve reviews.
[('film', 5002), ('movie', 2389), ('like', 1721), ('just', 1273), ('story', 1199), ('good', 1193), ('time', 1175), ('character', 1037), ('life', 1032), ('characters', 957), ('way', 864), ('films', 851), ('does', 828), ('best', 788), ('people', 769), ('make', 764), ('little', 751), ('really', 731), ('man', 728), ('new', 702)]


In [86]:
from sklearn.model_selection import train_test_split

In [87]:
# Model input is the raw review text; the target is its 'neg'/'pos' label.
X, y = data['review'], data['label']


In [88]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=101)
X_train, X_test, y_train, y_test = split_parts
X_train

1259    i have a soft spot in my heart for pure , amor...
1366    that thing you do ! \r\n ( r ) tom hanks's scr...
1172    the uncompromising nudity bared throughout pet...
379     in 1989 , director edward zwick began his care...
200     when jim henson passed away , he left behind d...
                              ...                        
1654    its a stupid little movie that trys to be clev...
1923    i can already feel the hate letters pouring in...
1404    there was probably a good reason that the warn...
1602    martin scorsese's triumphant adaptation of edi...
893     it has been three long years since quentin tar...
Name: review, Length: 1550, dtype: object

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from sklearn.naive_bayes import MultinomialNB

In [90]:
# Two-stage text classifier: TF-IDF features feeding a linear SVM,
# both with their default settings.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC()),
]
pipe = Pipeline(pipeline_steps)
pipe

0,1,2
,steps,"[('tfidf', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [92]:
# Train the whole pipeline on the training split; the fitted pipeline is the
# cell's displayed result.
pipe.fit(X_train, y=y_train)

0,1,2
,steps,"[('tfidf', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [101]:
# Classify one hand-written review; predict() expects an iterable of documents,
# so the single sentence is wrapped in a list.
sample_reviews = ['The film was average bot the ending was good']
pipe.predict(sample_reviews)

array(['pos'], dtype=object)