In [None]:
import pandas as pd

# Read the dataset CSV (resolved relative to the working directory) into a DataFrame.
df_review = pd.read_csv(filepath_or_buffer='IMDBDataset.csv')
# Bare trailing expression: the notebook renders the frame as this cell's output.
df_review

: 

In [None]:

# Build a deliberately imbalanced 10,000-row subset (smaller, so later steps run faster).
# First 9000 rows labelled 'positive'.
df_positive = df_review.loc[df_review['sentiment'] == 'positive'].iloc[:9000]
# First 1000 rows labelled 'negative'.
df_negative = df_review.loc[df_review['sentiment'] == 'negative'].iloc[:1000]

# Stack the two subsets and show how many rows each sentiment contributes.
df_review_imb = pd.concat(objs=[df_positive, df_negative])
df_review_imb.value_counts(subset=['sentiment'])

: 

In [None]:

# Option 2: balance the classes by randomly under-sampling the positive reviews.
# Number of positive rows to draw = number of rows labelled 'negative'.
length_negative = len(df_review_imb[df_review_imb['sentiment']=='negative'])
# random_state pins the draw so Restart & Run All reproduces the same balanced set
# (same seed value as the train/test split further down).
df_review_positive = df_review_imb[df_review_imb['sentiment']=='positive'].sample(
    n=length_negative, random_state=42
)
# Every row that is not 'positive' is kept as-is.
df_review_non_positive = df_review_imb[~(df_review_imb['sentiment']=='positive')]

# Concatenate and renumber rows 0..n-1; assigning the result (instead of
# inplace=True) keeps this cell idempotent when re-run.
df_review_bal = pd.concat([
    df_review_positive, df_review_non_positive
]).reset_index(drop=True)
df_review_bal['sentiment'].value_counts()

: 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the balanced frame for evaluation; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df_review_bal,
    test_size=0.33,
    random_state=42,
)

: 

In [None]:
# Separate each partition into its text column (model input) and sentiment column (target).
train_x = train['review']
train_y = train['sentiment']
test_x = test['review']
test_y = test['sentiment']

: 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary/IDF weights from the training reviews and vectorize them in one step.
train_x_vector = tfidf.fit_transform(raw_documents=train_x)
# Show the resulting sparse matrix summary as the cell output.
train_x_vector

: 

In [None]:
# View the TF-IDF matrix as a sparse DataFrame: one row per training review
# (original index preserved), one column per vocabulary term.
# get_feature_names_out() is used because the older get_feature_names() has been
# removed from scikit-learn.
pd.DataFrame.sparse.from_spmatrix(train_x_vector,
                                index=train_x.index,
                                columns=tfidf.get_feature_names_out())

: 

In [None]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit).
test_x_vector = tfidf.transform(raw_documents=test_x)

: 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-create the stop-word-filtering TF-IDF vectorizer and fit it on the training reviews.
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(raw_documents=train_x)
# Trailing expression renders the sparse matrix summary.
train_x_vector

: 

In [None]:
# Sparse DataFrame view of the TF-IDF matrix: rows keep the training index,
# columns are the vocabulary terms reported by the fitted vectorizer.
pd.DataFrame.sparse.from_spmatrix(
    data=train_x_vector,
    index=train_x.index,
    columns=tfidf.get_feature_names_out(),
)

: 

In [None]:
# Project the test reviews onto the fitted vocabulary (transform only, no refit).
test_x_vector = tfidf.transform(raw_documents=test_x)


: 

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, trained on the TF-IDF features.
svc = SVC(kernel="linear")
svc.fit(X=train_x_vector, y=train_y)

: 

In [None]:
# Spot-check the fitted SVC on a few hand-written reviews; each prediction is printed on its own line.
for sample_review in (
    'A good movie',
    'An excellent movie',
    'I did not like this movie at all',
):
    print(svc.predict(tfidf.transform([sample_review])))

: 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree classifier on the TF-IDF features.
# random_state is fixed because scikit-learn permutes features randomly at each
# split; without it the fitted tree (and its score) can change between runs.
# The seed matches the one used for the train/test split.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)


: 

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes; the sparse TF-IDF matrix is converted to a dense array before fitting.
gnb = GaussianNB()
gnb.fit(X=train_x_vector.toarray(), y=train_y)

: 

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, trained on the TF-IDF features.
log_reg = LogisticRegression()
log_reg.fit(X=train_x_vector, y=train_y)

: 

In [None]:
# estimator.score(test samples, true labels) for each fitted classifier on the
# held-out test set.
# The scores are gathered into one Series because a notebook cell only renders its
# last bare expression; as four separate expressions, three results were discarded.
pd.Series(
    {
        'svc': svc.score(test_x_vector, test_y),
        'dec_tree': dec_tree.score(test_x_vector, test_y),
        # GaussianNB is scored on a dense array, matching how it was fitted.
        'gnb': gnb.score(test_x_vector.toarray(), test_y),
        'log_reg': log_reg.score(test_x_vector, test_y),
    },
    name='score',
)

: 

In [None]:
from sklearn.metrics import f1_score
# Per-class F1 for the SVC on the test set; average=None returns one score per
# label, in the order given by `labels`.
f1_score(
    y_true=test_y,
    y_pred=svc.predict(test_x_vector),
    labels=['positive', 'negative'],
    average=None,
)

: 

In [None]:
from sklearn.metrics import classification_report
# Text report of the SVC's test-set metrics for the two sentiment labels.
print(
    classification_report(
        y_true=test_y,
        y_pred=svc.predict(test_x_vector),
        labels=['positive', 'negative'],
    )
)

: 

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the SVC on the test set, with rows/columns ordered as
# ['positive', 'negative'].
conf_labels = ['positive', 'negative']
conf_mat = confusion_matrix(test_y,
                            svc.predict(test_x_vector),
                            labels=conf_labels)
# Display the matrix (it was previously computed but never shown).
# scikit-learn's convention: rows are true labels, columns are predicted labels.
pd.DataFrame(conf_mat,
             index=pd.Index(conf_labels, name='true'),
             columns=pd.Index(conf_labels, name='predicted'))

: 

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid to search: regularisation strength C and kernel type.
parameters = {"C": [1,4,8,16,32] ,"kernel":["linear", "rbf"]}
# Hand GridSearchCV a fresh SVC directly instead of rebinding `svc`: rebinding
# replaced the linear SVC fitted earlier with an unfitted estimator, so re-running
# any earlier cell that calls svc.predict / svc.score would fail.
# cv=5 -> 5-fold cross-validation for every parameter combination.
svc_grid = GridSearchCV(SVC(), parameters, cv=5)

svc_grid.fit(train_x_vector, train_y)

: 

In [None]:
# Report the winning parameter combination and the estimator built with it, one per line.
print(svc_grid.best_params_, svc_grid.best_estimator_, sep='\n')

: 