In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Data Loading

In [None]:
# Read the three competition tables (movie metadata, labelled reviews,
# unlabelled reviews) from the Kaggle input directory.
movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv",
        "/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv",
        "/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv",
    )
)

EDA

In [None]:
# Schema overview of the movies table: column dtypes and non-null counts.
movies.info()

In [None]:
# Number of distinct values in each movies column.
movies.nunique()

In [None]:
# True for columns in which fewer than half of the rows are missing.
# The mean of the boolean null mask is the missing fraction of the frame as it
# currently is, so no row count has to be hard-coded.
movies.isnull().mean() < 0.5

In [None]:
# Schema overview of the train table: column dtypes and non-null counts.
train.info()

In [None]:
# Number of distinct values in each train column.
train.nunique()

In [None]:
# Count of missing values per train column.
train.isnull().sum()

In [None]:
# Class distribution of the target column.
train["sentiment"].value_counts()

the train dataset is imbalanced

In [None]:
# Schema overview of the test table: column dtypes and non-null counts.
test.info()

In [None]:
# Count of missing values per test column.
test.isnull().sum()

*   movies has duplicate rows with same movieid
*   More than half of the values in these columns of the movies dataset are empty: [**'rating', 'ratingContents', 'releaseDateTheaters', 'boxOffice', 'distributor', 'soundType'**]
* test has a column named **'isTopCritic'**, which is equivalent to the **'isFrequentReviewer'** column in the train dataset


*   **'reviewText'** column has **NAN** values in both train and test datasets
*   **'audienceScore'**, **'runtimeMinutes'**, **'genre'** and **'originalLanguage'**
also have null values
* **Train** dataset has class imbalance
* **Train** and **Test** have to be merged with **Movies**


---





**Assumptions**


*   The movies dataset is a superset of the movies referenced in the train and test datasets, i.e. every movieid in train and test is expected to appear in movies









In [None]:
# Keep only the first row for each movieid so the later join cannot fan out reviews.
movies = movies.drop_duplicates(subset=["movieid"], keep="first")


In [None]:
# Align the test column name with the one used in train.
test = test.rename(columns={"isTopCritic": "isFrequentReviewer"})

In [None]:
# Join movie metadata onto each review (left join keeps every review row).
# The key is stated explicitly rather than relying on "all shared column names",
# and validate="many_to_one" raises if movieid is not unique in `movies`, which
# would otherwise silently duplicate review rows.
trainmovies = pd.merge(train, movies, how="left", on="movieid", validate="many_to_one")
testmovies = pd.merge(test, movies, how="left", on="movieid", validate="many_to_one")

In [None]:
# Pairwise correlations between the numeric columns of the merged training frame.
numeric_corr = trainmovies.corr(numeric_only=True)
sns.heatmap(numeric_corr);

In [None]:
# Share of each sentiment class in the merged training data.
# value_counts() orders classes by frequency, so the slice labels are taken from
# its index instead of being hard-coded in an assumed order.
sentiment_counts = trainmovies["sentiment"].value_counts()

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')
ax.set_title('sentiment in train dataset')
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.2f%%');

In [None]:
# Distribution of audienceScore over the movies table in four 25-point bins;
# rows without a score are left out.
audience_scores = movies["audienceScore"].dropna()
score_edges = [0, 25, 50, 75, 100]

fig, ax = plt.subplots()
ax.hist(audience_scores, bins=score_edges)
ax.set_xticks(score_edges)
ax.set(
    title="histogram of audienceScore",
    xlabel='audienceScore',
    ylabel='no.of movies from the movies dataset',
)
plt.show()

In [None]:
# Drop the columns that are more than 50% empty from both merged frames.
mostly_empty_cols = [
    'rating',
    'ratingContents',
    'releaseDateTheaters',
    'boxOffice',
    'distributor',
    'soundType',
]
trainmovies = trainmovies.drop(columns=mostly_empty_cols)
testmovies = testmovies.drop(columns=mostly_empty_cols)

In [None]:
# Columns remaining in the merged training frame.
trainmovies.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder # categorical to numerical
from sklearn.preprocessing import LabelEncoder # for y only
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile,chi2




In [None]:
# Replace missing text/categorical values with the empty string so the
# vectorizer and the encoder receive strings only.
# The fill is applied at DataFrame level and the result is rebound: calling
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# update the frame when pandas Copy-on-Write is active.
text_fill_values = {"reviewText": "", "genre": "", "originalLanguage": ""}

trainmovies = trainmovies.fillna(value=text_fill_values)
testmovies = testmovies.fillna(value=text_fill_values)





In [None]:
# Separate the target column from the feature columns.
y = trainmovies["sentiment"]
X = trainmovies.drop(columns=["sentiment"])

In [None]:
# Feature columns left after separating out the target.
X.columns

In [None]:
# Building blocks for preprocessing, one pipeline per column type.
# Numeric: fill gaps with the column median, then scale by the maximum absolute value.
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scaler", MaxAbsScaler()),
    ]
)
# Categorical: one-hot encode.
cat_pipeline = Pipeline(steps=[("encoder", OneHotEncoder())])
# Free text: TF-IDF bag of words.
text_pipeline = Pipeline(steps=[("vectorizer", TfidfVectorizer())])





In [None]:
# Per-column preprocessing; columns that are not listed are dropped (remainder="drop").
#  - numeric: median imputation + max-abs scaling (num_pipeline)
#  - categorical: one-hot encoding; handle_unknown="ignore" encodes a category that
#    was not present at fit time as all zeros instead of raising at transform time
#  - text: TF-IDF on the review text; the column is given as a plain string (not a
#    list) so the vectorizer receives a 1-D sequence of documents
transformation_pipeline = ColumnTransformer(
    [
        ("num_transform", num_pipeline, ["audienceScore", "runtimeMinutes"]),
        (
            "cat_tranform",
            OneHotEncoder(handle_unknown="ignore"),
            ["isFrequentReviewer", "originalLanguage"],
        ),
        ("text_transform", TfidfVectorizer(), "reviewText"),
    ],
    remainder="drop",
    verbose_feature_names_out=True,
)


In [None]:
# Fit the ColumnTransformer on the training features, then apply the same fitted
# transform to the test frame.
# NOTE(review): both names are rebound from DataFrames to the transformer output,
# so re-running this cell on its own will presumably fail; re-run from the cells
# that build X and testmovies instead.
X=transformation_pipeline.fit_transform(X)
testmovies=transformation_pipeline.transform(testmovies)

In [None]:
# Encode the sentiment labels as integers. The fitted encoder is kept so that
# predictions can be mapped back to the original labels for the submission.
y_encoder=LabelEncoder()
y=y_encoder.fit_transform(y)


In [None]:
#feature selection
# Keep the top 70% of features ranked by chi-squared score against the target,
# then apply the same fitted selector to the test features.
# NOTE(review): the selector is fitted on all of X and y before the train/validation
# split that follows, so the held-out rows influence which features are kept.
sel=SelectPercentile(chi2,percentile=70)
X=sel.fit_transform(X,y)
testmovies=sel.transform(testmovies)

In [None]:
# Hold out 20% of the labelled rows for evaluation.
# stratify=y keeps the class ratio of the imbalanced target the same in both
# parts; random_state makes the split (and the scores below) reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

# MODELS

## logisticregression

In [None]:
from sklearn.linear_model import LogisticRegression
# Base estimator for the grid search below; the grid also varies `solver`.
log_regressor=LogisticRegression(solver="liblinear")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [None]:
# Solvers to compare in the logistic-regression grid search.
parameters = {'solver':('liblinear', 'newton-cg')}

In [None]:
# Cross-validated grid search over the solver choices.
# scoring="f1" makes the search optimise the metric this notebook reports;
# without it GridSearchCV falls back to the estimator's default score (accuracy).
log_regressorclf = GridSearchCV(log_regressor, parameters, scoring="f1")
log_regressorclf.fit(X_train,y_train)

In [None]:
# Estimator selected by the grid search.
log_regressorclf.best_estimator_

In [None]:
# Predictions of the tuned logistic regression on the held-out split.
y_pred1=log_regressorclf.predict(X_test)

In [None]:
# F1 score of the tuned logistic regression on the held-out split.
f1_score(y_test,y_pred1)

## complement naive bayes

In [None]:
from sklearn.naive_bayes import ComplementNB

In [None]:
# Complement naive Bayes with default settings, fitted on the training split.
c_nb=ComplementNB()
c_nb.fit(X_train,y_train)

In [None]:
# Predictions of complement naive Bayes on the held-out split.
y_pred4=c_nb.predict(X_test)

In [None]:
# F1 score of complement naive Bayes on the held-out split.
f1_score(y_test,y_pred4)

## decisiontree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Untuned decision tree used as a baseline before the randomized search below.
tree_clf=DecisionTreeClassifier()

In [None]:
# Fit the default decision tree on the training split.
tree_clf.fit(X_train,y_train)

In [None]:
# Predictions of the default decision tree on the held-out split.
y_pred3=tree_clf.predict(X_test)

In [None]:
# F1 score of the default decision tree on the held-out split.
f1_score(y_test,y_pred3)

## Hyper Parameter Tuning on decision tree

In [None]:
from scipy.stats import uniform, poisson
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the decision tree: the split criterion, plus max_depth drawn
# from a Poisson(mu=2) distribution shifted by loc=2.
# NOTE(review): this rebinds `parameters`, which earlier held the
# logistic-regression grid; re-running the logistic grid-search cell after this
# one would pick up the tree search space.
parameters = {'criterion':['gini','entropy'],
              'max_depth':poisson(mu=2,loc=2)}

In [None]:
# Randomized search over the decision-tree search space.
# scoring="f1" selects candidates on the metric this notebook reports (the
# default would be accuracy); random_state makes the sampled candidates repeatable.
tree_cv = RandomizedSearchCV(DecisionTreeClassifier(), parameters, scoring="f1", random_state=42)

In [None]:
# Run the randomized search on the training split.
tree_cv.fit(X_train,y_train)

In [None]:
# F1 score of the tuned decision tree on the held-out split.
y_pred5=tree_cv.predict(X_test)
f1_score(y_test,y_pred5)

## multinomialNB


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings.
M_nb=MultinomialNB()

In [None]:
# Fit multinomial naive Bayes on the training split.
M_nb.fit(X_train,y_train)

In [None]:
# Predictions of multinomial naive Bayes on the held-out split.
y_pred2=M_nb.predict(X_test)

In [None]:
# F1 score of multinomial naive Bayes on the held-out split.
f1_score(y_test,y_pred2)

## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees (gbtree booster), otherwise default settings.
xgb_clf=XGBClassifier(booster='gbtree')

xgb_clf.fit(X_train,y_train)

# F1 score of the XGBoost model on the held-out split.
pred=xgb_clf.predict(X_test)
f1_score(y_test,pred)

# Addressing class imbalance

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate minority-class rows until both classes are the same size.
# Only the TRAINING split is resampled. Resampling the full data before splitting
# puts copies of the same row on both sides of the split (leakage) and evaluates
# on an artificially balanced set, so scores are not comparable to the models above.
sm = RandomOverSampler(random_state=42)

X_res, y_res = sm.fit_resample(X_train, y_train)

# Train on the balanced data; evaluate on the untouched held-out split.
X_res_train, y_res_train = X_res, y_res
X_res_test, y_res_test = X_test, y_test

logisticregression after resampling

In [None]:
# Logistic regression (liblinear solver) trained on the resampled training data.
log_reg=LogisticRegression(solver="liblinear")
log_reg.fit(X_res_train,y_res_train)


# F1 score on the evaluation split paired with the resampled training data.
pred_res=log_reg.predict(X_res_test)
f1_score(y_res_test,pred_res)

complement Naive Bayes after resampling

In [None]:
# Complement naive Bayes trained on the resampled training data.
c_nb_res=ComplementNB()
c_nb_res.fit(X_res_train,y_res_train)

# F1 score on the evaluation split paired with the resampled training data.
pred_res1=c_nb_res.predict(X_res_test)
f1_score(y_res_test,pred_res1)

**f1_score**

---


*  LogisticRegression after HPT -> 0.8669866526398843

*  ComplementNaiveBayes -> 0.859759168536239
*  DecisionTree after HPT -> 0.805924122540069
* MultinomialNaiveBayes -> 0.84990769724697
* XGBClassifier -> 0.8422699594046007

**f1_score of top models after resampling**

---
* LogisticRegression -> 0.81677112000185

* ComplementNaiveBayes -> 0.8031896290424119







# submission


In [None]:

# Predict the transformed test set with the tuned logistic regression and map the
# integer classes back to the original sentiment labels.
test_pred_encoded = log_regressorclf.predict(testmovies)
y_predict = pd.DataFrame(y_encoder.inverse_transform(test_pred_encoded))
# Alternative submission using complement naive Bayes:
# y_predict=pd.DataFrame(y_encoder.inverse_transform(c_nb.predict(testmovies)))


In [None]:
# Name the single prediction column and label the row index for the submission file.
y_predict = y_predict.set_axis(["sentiment"], axis="columns").rename_axis("id")


In [None]:
# Preview the submission frame.
y_predict

In [None]:

# Write the predictions to submission.csv; the index is written as the first column.
y_predict.to_csv("submission.csv")