In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
# Optional display settings (disabled): uncomment to always display the whole dataframe
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False) 
# pd.set_option('display.max_colwidth', None)

In [3]:
# Load the training split from the CSV file in the working directory.
training_set = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Bare last expression: renders the loaded training DataFrame as the cell output.
training_set

Unnamed: 0,score,text
0,0,"overgeneralized, not helpful to anyone serious..."
1,1,Great sound and service.
2,1,love this book!!!: this book is a fast read ab...
3,1,A hugely enjoyable screen version of Rona Jaff...
4,0,What an uninteresting hodge-podge. It could ha...
...,...,...
7495,1,"@USAirways YOU ARE AMAZING!!! FOLLOW ME BACK, ..."
7496,0,"@JetBlue we're home, you guys recovered, now w..."
7497,1,pays for itself in 0 months: i was paying for ...
7498,1,@AmericanAir continues to win: I've never miss...


In [5]:
# Count how many training rows fall into each value of the `score` label.
training_set.loc[:, 'score'].value_counts()

1    3752
0    3748
Name: score, dtype: int64

In [6]:
# Per-column count of missing values in the training frame.
training_set.isna().sum(axis=0)

score    0
text     0
dtype: int64

In [7]:
# data cleaning
def clean_text(text):
    """Strip markup noise from a raw text sample.

    Each of the following is replaced by a single space, in this order:
    HTML tags, URLs, @mentions, and finally every remaining character that
    is not a word character, whitespace, a digit, or in the code point range
    U+1F000-U+1F9FF (presumably kept so that emoji survive cleaning).

    URLs are removed *before* mentions so that a URL containing ``@``
    (e.g. ``https://example.com/@user/page``) is dropped as a whole. If
    mentions were stripped first, the URL would be split at the ``@`` and its
    trailing path would leak into the result as a spurious token.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = re.sub(r'<.*?>', ' ', text)  # substitute HTML tags with spaces
    # URLs must go before mentions; see the docstring for why.
    text = re.sub(r'http[s]?://\S+|www\.\S+', ' ', text)  # substitute URLs with spaces
    text = re.sub(r'@[\w_]+', ' ', text)  # substitute mentions with spaces
    text = re.sub(r'[^\w\s\d\U0001F000-\U0001F9FF]', ' ', text)  # substitute punctuations with spaces
    return text

In [8]:
# Overwrite the raw training text with its cleaned version, row by row.
training_set['text'] = training_set['text'].apply(func=clean_text)

In [9]:
# Spot-check the row labelled 0 after cleaning.
training_set.loc[0, :]

score                                                    0
text     overgeneralized  not helpful to anyone serious...
Name: 0, dtype: object

In [10]:
# TF-IDF features over unigrams and bigrams, with English stop words removed
# and the vocabulary capped at 6000 terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=6000,
)

In [11]:
# Labels come straight from the `score` column.
y_train = training_set['score']
# Fit the vectorizer on the training text and build the training feature matrix.
X_train = tfidf_vectorizer.fit_transform(raw_documents=training_set['text'])

In [12]:
# Tune the Naive Bayes smoothing parameter `alpha` with 5-fold cross-validation.
# NOTE(review): the recorded output of the following cell shows alpha=2.0, the
# largest candidate listed here; consider widening the grid and confirming.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
}
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep only the winning alpha; the final model is built from it separately.
best_alpha = grid_search.best_params_['alpha']

In [13]:
# Train the final classifier on the full training matrix using the selected alpha.
# `fit` returns the estimator itself, so construction and fitting can be chained.
nb_classifier = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)
nb_classifier

MultinomialNB(alpha=2.0)

In [14]:
# Load the test split from the CSV file in the working directory.
test_set = pd.read_csv(filepath_or_buffer='test.csv')

In [15]:
# Count how many test rows fall into each value of the `score` label.
test_set.loc[:, 'score'].value_counts()

0    1252
1    1248
Name: score, dtype: int64

In [16]:
# Per-column count of missing values in the test frame.
test_set.isna().sum(axis=0)

score    0
text     0
dtype: int64

In [17]:
# Overwrite the raw test text with its cleaned version, using the same cleaner as for training.
test_set['text'] = test_set['text'].apply(func=clean_text)

In [18]:
# Labels come straight from the `score` column.
y_test = test_set['score']
# Reuse the already-fitted vectorizer (transform only, no refit) on the test text.
X_test = tfidf_vectorizer.transform(raw_documents=test_set['text'])

In [19]:
# Predict a label for every row of the test feature matrix.
y_pred = nb_classifier.predict(X=X_test)

In [20]:
# Overall accuracy first, then the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred))

0.8432
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      1252
           1       0.85      0.83      0.84      1248

    accuracy                           0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.84      0.84      0.84      2500



In [21]:
# Load the evaluation split from the CSV file in the working directory.
evaluation_set = pd.read_csv(filepath_or_buffer='evaluation.csv')

In [22]:
# Count how many evaluation rows fall into each value of the `score` label.
evaluation_set.loc[:, 'score'].value_counts()

1    2518
0    2482
Name: score, dtype: int64

In [23]:
# Per-column count of missing values in the evaluation frame.
evaluation_set.isna().sum(axis=0)

score    0
text     0
dtype: int64

In [24]:
# Overwrite the raw evaluation text with its cleaned version, using the same cleaner as for training.
evaluation_set['text'] = evaluation_set['text'].apply(func=clean_text)

In [25]:
# Labels come straight from the `score` column.
y_evaluation = evaluation_set['score']
# Reuse the already-fitted vectorizer (transform only, no refit) on the evaluation text.
X_evaluation = tfidf_vectorizer.transform(raw_documents=evaluation_set['text'])

In [26]:
# Predict a label for every row of the evaluation feature matrix.
y_pred_evaluation = nb_classifier.predict(X=X_evaluation)

In [27]:
# Overall accuracy first, then the per-class precision / recall / F1 report.
accuracy_evaluation = accuracy_score(y_true=y_evaluation, y_pred=y_pred_evaluation)
print(accuracy_evaluation)
print(classification_report(y_true=y_evaluation, y_pred=y_pred_evaluation))

0.839
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      2482
           1       0.86      0.81      0.84      2518

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000

