In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
# Read the base review dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Reviews.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Read the supplementary critic-review dataset. The bare name on the last line
# makes the notebook render pandas' truncated preview of the whole frame.
data = pd.read_csv(filepath_or_buffer="Additional reviews.csv")
data

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
2,city_hunter_shinjuku_private_eyes,2590987,2019-05-28,Reuben Baron,False,,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,https://www.cbr.com/city-hunter-shinjuku-priva...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
4,dangerous_men_2015,2504681,2018-08-29,Pat Padua,False,,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,http://dcist.com/2015/11/out_of_frame_dangerou...
...,...,...,...,...,...,...,...,...,...,...,...
1444958,thor_love_and_thunder,102706151,2022-07-05,Christie Cronan,False,7/10,fresh,Raising Whasians,Solid but not totally sold&#44; Thor&#58; Ragn...,POSITIVE,https://raisingwhasians.com/thor-love-and-thun...
1444959,thor_love_and_thunder,102706150,2022-07-05,Ian Sandwell,False,4/5,fresh,Digital Spy,Thor&#58; Love and Thunder is the most enterta...,POSITIVE,https://www.digitalspy.com/movies/a40496050/th...
1444960,thor_love_and_thunder,102706149,2022-07-05,Lauren LaMagna,False,8/10,fresh,Next Best Picture,&quot;Thor&#58; Love and Thunder&quot; is a st...,POSITIVE,https://www.nextbestpicture.com/thor-love-and-...
1444961,thor_love_and_thunder,102706148,2022-07-05,Jake Cole,True,1/4,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,https://www.slantmagazine.com/film/thor-love-a...


In [None]:
# Discard the metadata columns that are not used for sentiment modelling.
#
# The frame is re-bound rather than mutated with inplace=True, and
# errors="ignore" makes the cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the columns were already gone.
# NOTE: "publicatioName" is spelled exactly as it appears in the file's header.
metadata_columns = [
    "id", "reviewId", "creationDate", "criticName", "isTopCritic",
    "originalScore", "reviewState", "publicatioName", "reviewUrl",
]
data = data.drop(columns=metadata_columns, errors="ignore")
data.head()

Unnamed: 0,reviewText,scoreSentiment
0,Timed to be just long enough for most youngste...,POSITIVE
1,It doesn't matter if a movie costs 300 million...,NEGATIVE
2,The choreography is so precise and lifelike at...,POSITIVE
3,The film's out-of-touch attempts at humor may ...,NEGATIVE
4,Its clumsy determination is endearing and some...,POSITIVE


In [None]:
# Show (rows, columns) for the supplementary frame, then for the base frame.
tuple(frame.shape for frame in (data, df))

((1444963, 2), (50000, 2))

In [None]:
# Bring the supplementary frame onto the base dataset's column names:
# lower-cased labels go under `sentiment`, the review text under `review`,
# and the two source columns are removed afterwards.
data = (
    data
    .assign(
        sentiment=lambda frame: frame['scoreSentiment'].str.lower(),
        review=lambda frame: frame['reviewText'],
    )
    .drop(columns=['scoreSentiment', 'reviewText'])
)
data.head()

Unnamed: 0,sentiment,review
0,positive,Timed to be just long enough for most youngste...
1,negative,It doesn't matter if a movie costs 300 million...
2,positive,The choreography is so precise and lifelike at...
3,negative,The film's out-of-touch attempts at humor may ...
4,positive,Its clumsy determination is endearing and some...


In [None]:
# Append the supplementary reviews below the base reviews with a fresh 0..n-1
# index, then lower-case every review text.
# NOTE: `df` is re-bound to the combined frame, so executing this cell again
# appends `data` a second time.
df = (
    pd.concat(objs=[df, data], ignore_index=True)
    .assign(review=lambda frame: frame['review'].str.lower())
)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
1494958,solid but not totally sold&#44; thor&#58; ragn...,positive
1494959,thor&#58; love and thunder is the most enterta...,positive
1494960,&quot;thor&#58; love and thunder&quot; is a st...,positive
1494961,across taika waititi&#8217;s film&#44; a war a...,negative


In [None]:
# TF-IDF features over the review text: English stop words removed, vocabulary
# capped at the 1000 highest-ranked terms.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Rows whose review is missing are skipped, so the matrix has one row per
# non-null review (fewer rows than `df`).
tfidf_matrix = vectorizer.fit_transform(df['review'].dropna())

# Keep the tabular view sparse. Calling .toarray() here materialised every cell
# of a ~1.4M x 1000 float64 matrix (roughly 11 GB) just to inspect it.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix, columns=vectorizer.get_feature_names_out()
)

# Only the last expression of a cell is rendered, so the former mid-cell
# `tfidf_df.head()` produced no output and has been dropped.
tfidf_df.shape

(1425738, 1000)

In [None]:
# Re-encode the non-null reviews with the already-fitted vectorizer and report
# the resulting (documents, features) shape.
tfidf_matrix = vectorizer.transform(raw_documents=df['review'].dropna())
tfidf_matrix.shape

(1425738, 1000)

In [None]:
# Labels for exactly the rows that were vectorised.
#
# The TF-IDF matrix is built from df['review'].dropna(), i.e. rows with a
# missing review are removed wherever they occur. Taking the first N labels
# with iloc[:N] therefore paired every document after the first missing review
# with the label of a different row. Selecting with the same not-null mask
# keeps features and labels row-aligned.
has_review = df['review'].notna()
sentiment_resized = df.loc[has_review, 'sentiment'].reset_index(drop=True)

# Guard against the features and labels drifting apart again.
assert len(sentiment_resized) == tfidf_matrix.shape[0], \
    "sentiment labels and TF-IDF rows are misaligned"
sentiment_resized.shape

(1425738,)

In [None]:
def _cross_validate_and_score(model, label, X_train, y_train, X_val, y_val):
    """Cross-validate, fit and validate one classifier, printing both accuracies.

    Parameters
    ----------
    model : scikit-learn classifier
        Unfitted estimator; it is fitted in place on the training split.
    label : str
        Human-readable model name used as the prefix of the printed lines.
    X_train, y_train :
        Training features and labels (also used for 5-fold cross-validation).
    X_val, y_val :
        Held-out features and labels used for the validation accuracy.

    Returns
    -------
    tuple
        (cross-validation scores, validation predictions, validation accuracy)
    """
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    print(f"{label} Cross-Validation Accuracy: {cv_scores.mean()}")
    print(f"{label} Validation Accuracy: {accuracy}")
    return cv_scores, predictions, accuracy


# 80/20 train/validation split with a fixed seed for reproducibility.
# NOTE(review): the vectorizer was fitted on all reviews before this split, so
# the IDF weights have already seen the validation documents.
X_train, X_val, y_train, y_val = train_test_split(tfidf_matrix, sentiment_resized, test_size=0.2, random_state=42)

nb_model = MultinomialNB()
nb_cv_scores, nb_predictions, nb_accuracy = _cross_validate_and_score(
    nb_model, "Naive Bayes", X_train, y_train, X_val, y_val
)

# n_jobs=-1 builds the trees on all available cores; the forest was previously
# trained single-threaded (six fits in total: 5 CV folds + the final fit).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_cv_scores, rf_predictions, rf_accuracy = _cross_validate_and_score(
    rf_model, "Random Forest", X_train, y_train, X_val, y_val
)

# The winner is chosen on mean cross-validation accuracy; ties go to Random Forest.
best_model = 'Naive Bayes' if nb_cv_scores.mean() > rf_cv_scores.mean() else 'Random Forest'
print(f"Best Model: {best_model}")

Naive Bayes Cross-Validation Accuracy: 0.6640045941135728
Naive Bayes Validation Accuracy: 0.6649318950159215


In [None]:
# Per-class metrics for the Random Forest predictions on the validation split.
# The report text gets its own name: assigning it to `classification_report`
# replaced the imported sklearn function with a string, so running the cell a
# second time failed with "'str' object is not callable".
rf_report = classification_report(y_val, rf_predictions)
print(f"Classification Report: {rf_report}")

In [None]:
# Recreate the 80/20 split (same seed) so this cell can run independently.
X_train, X_val, y_train, y_val = train_test_split(
    tfidf_matrix,
    sentiment_resized,
    test_size=0.2,
    random_state=42,
)

# Fit a single decision tree and score it on the held-out validation rows.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_predictions = dt_model.predict(X_val)

dt_accuracy = accuracy_score(y_val, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")