In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('IMDB Dataset.csv')
print(data.columns)

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def preprocess_text(text):

    tokens = word_tokenize(text.lower())
    
    filtered_tokens = [ps.stem(token) for token in tokens if token.isalpha() and token not in stop_words]
    
    return ' '.join(filtered_tokens)


data['review'].fillna('', inplace=True)

data['processed_text'] = data['review'].apply(preprocess_text)

data.head()


Index(['review', 'sentiment'], dtype='object')
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      processed_text  
0  one review mention watch oz episod hook right ...  
1  wonder littl product br br film techniqu fashi...  
2  thought wonder way spend time hot summer weeke...  
3  basic famili littl boy jake think zombi closet...  
4  petter mattei love time money visual stun film...  


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['processed_text'])

#I tried also Countvectorizer but the best f1 score was calculated on tfidvectorizer.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

#split the data
X_train, X_test, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [4]:
# predictions

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#F1 Score
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


Accuracy: 0.8875
Precision: 0.8878457054387958
Recall: 0.8875
F1-score: 0.8874593485557376
