In [18]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import text

In [2]:
# Optional: uncomment the settings below to always display the whole dataframe
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False) 
# pd.set_option('display.max_colwidth', None)

In [3]:
# Read train.csv into a DataFrame.
training_set = pd.read_csv('train.csv')

In [4]:
# Preview only the first rows instead of echoing the whole frame: with
# display.max_rows set to None (see the optional settings cell) a bare
# `training_set` would render every row.
training_set.head(10)

Unnamed: 0,score,text
0,0,"overgeneralized, not helpful to anyone serious..."
1,1,Great sound and service.
2,1,love this book!!!: this book is a fast read ab...
3,1,A hugely enjoyable screen version of Rona Jaff...
4,0,What an uninteresting hodge-podge. It could ha...
5,1,@USAirways customer service at its best! Rache...
6,0,@VirginAmerica Is it normal to receive no repl...
7,0,Imagine the worst skits from Saturday Night Li...
8,0,This is one of the worst films ever. I like ch...
9,1,@JetBlue flight attendant Wendi on Flt 127 on ...


In [5]:
# Count how many rows carry each value of the 'score' label (class balance).
training_set['score'].value_counts()

1    3752
0    3748
Name: score, dtype: int64

In [6]:
# Count missing values per column.
training_set.isna().sum()

score    0
text     0
dtype: int64

In [6]:
# data cleaning
def clean_text(text):
    """Blank out markup, mentions, URLs and punctuation in a string.

    The patterns are applied in a fixed order, and every match is replaced
    by a single space. Whitespace is not collapsed afterwards, so runs of
    spaces can remain in the result.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Word characters, whitespace and code points in
        the range U+1F000-U+1F9FF are kept; everything else is a space.
    """
    # Order matters: tags, mentions and URLs must be removed before the
    # final pass strips the punctuation those patterns rely on.
    patterns_in_order = (
        r'<.*?>',                              # HTML tags
        r'@[\w_]+',                            # @mentions
        r'http[s]?://\S+|www\.\S+',            # URLs
        r'[^\w\s\d\U0001F000-\U0001F9FF]',     # remaining punctuation
    )
    for pattern in patterns_in_order:
        text = re.sub(pattern, ' ', text)
    return text

In [25]:
# Overwrite the 'text' column with its cleaned version (the raw text is not kept).
training_set['text'] = training_set['text'].apply(clean_text)

In [9]:
# Show the first 10 rows to check the effect of the cleaning step.
training_set.head(10)

Unnamed: 0,score,text
0,0,overgeneralized not helpful to anyone serious...
1,1,Great sound and service
2,1,love this book this book is a fast read ab...
3,1,A hugely enjoyable screen version of Rona Jaff...
4,0,What an uninteresting hodge podge It could ha...
5,1,customer service at its best Rachel S too...
6,0,Is it normal to receive no reply from Centra...
7,0,Imagine the worst skits from Saturday Night Li...
8,0,This is one of the worst films ever I like ch...
9,1,flight attendant Wendi on Flt 127 on 2 17 N...


In [8]:
# Inspect the single row with index label 0.
training_set.loc[0]

score                                                    0
text     overgeneralized  not helpful to anyone serious...
Name: 0, dtype: object

In [10]:
# TF-IDF over unigrams and bigrams, with sklearn's English stop-word list
# removed and the vocabulary capped at 6000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=6000, stop_words='english', ngram_range=(1, 2))

In [11]:
# Labels come straight from the 'score' column; the feature matrix comes from
# fitting the vectorizer on the cleaned training text and transforming it.
y_train = training_set['score']
X_train = tfidf_vectorizer.fit_transform(training_set['text'])

In [12]:
# apply grid search
# 5-fold cross-validated search over the Naive Bayes smoothing parameter alpha.
# NOTE(review): X_train was vectorized on the full training set before the CV
# split, so each validation fold has already influenced the TF-IDF vocabulary
# and IDF weights; putting the vectorizer in a Pipeline would avoid that.
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# NOTE(review): if the winner is the largest candidate (2.0), consider
# extending the grid upwards.
best_alpha = grid_search.best_params_['alpha']

In [13]:
# Train the final Naive Bayes model on all training data with the selected alpha.
nb_classifier = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)
nb_classifier  # last expression, so the fitted estimator is still displayed

MultinomialNB(alpha=2.0)

In [21]:
# Read test.csv into a DataFrame.
test_set = pd.read_csv('test.csv')

In [22]:
# Count how many rows carry each value of the 'score' label (class balance).
test_set['score'].value_counts()

0    1252
1    1248
Name: score, dtype: int64

In [23]:
# Count missing values per column.
test_set.isna().sum()

score    0
text     0
dtype: int64

In [24]:
# Apply the same cleaning function that was used on the training text.
test_set['text'] = test_set['text'].apply(clean_text)

In [18]:
# Only transform here (no fit), so the test text is projected onto the
# vocabulary that the vectorizer was fitted with.
y_test = test_set['score']
X_test = tfidf_vectorizer.transform(test_set['text'])

In [19]:
# Predict a label for every row of the test matrix.
y_pred = nb_classifier.predict(X_test)

In [20]:
# Overall accuracy first, then the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy, classification_report(y_test, y_pred), sep='\n')

0.8432
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      1252
           1       0.85      0.83      0.84      1248

    accuracy                           0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.84      0.84      0.84      2500



In [11]:
# Read evaluation.csv into a DataFrame.
evaluation_set = pd.read_csv('evaluation.csv')

In [14]:
# Show the first 10 rows of the evaluation data.
evaluation_set.head(10)

Unnamed: 0,score,text
0,0,Don't eat or drink here. Rooms okay but noisy....
1,1,"***********If you don't appreciate......\""The ..."
2,1,So so happy that I have it a try. Being dairy ...
3,1,"I just moved to the neighborhood, and I LOVE t..."
4,0,I came here yesterday and had great service. S...
5,1,"So this was my first time here, staff off with..."
6,0,Do not come here if you want fast service.\nA ...
7,1,"Pizza is mega rad. I'm a sucker for chewy, bub..."
8,0,I took my cat in to be boarded for a few days ...
9,0,Made reservation for 7:45 and finally sat down...


In [22]:
# Count how many rows carry each value of the 'score' label (class balance).
evaluation_set['score'].value_counts()

1    2518
0    2482
Name: score, dtype: int64

In [23]:
# Count missing values per column.
evaluation_set.isna().sum()

score    0
text     0
dtype: int64

In [24]:
# Apply the same cleaning function that was used on the training text.
evaluation_set['text'] = evaluation_set['text'].apply(clean_text)

In [25]:
# Only transform here (no fit), so the evaluation text is projected onto the
# vocabulary that the vectorizer was fitted with.
y_evaluation = evaluation_set['score']
X_evaluation = tfidf_vectorizer.transform(evaluation_set['text'])

In [26]:
# Predict a label for every row of the evaluation matrix.
y_pred_evaluation = nb_classifier.predict(X_evaluation)

In [27]:
# Overall accuracy first, then the per-class precision/recall/F1 report.
accuracy_evaluation = accuracy_score(y_evaluation, y_pred_evaluation)
print(accuracy_evaluation, classification_report(y_evaluation, y_pred_evaluation), sep='\n')

0.839
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      2482
           1       0.86      0.81      0.84      2518

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000



In [15]:
# Now we repeat the experiment with different parameters for the vectorizer.

In [19]:
# sklearn's built-in English stop-word list (the one stop_words='english' selects).
my_stop_words = text.ENGLISH_STOP_WORDS
my_stop_words
# We can see that it excludes a lot of words. What would happen if we don't use any stop words?

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [26]:
# Build a new classifier from the training set with a less restricted vectorizer.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))   # dropped BOTH max_features=6000 and stop_words='english'
X_train = tfidf_vectorizer.fit_transform(training_set['text'])
y_train = training_set['score']
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']
nb_classifier = MultinomialNB(alpha=best_alpha)
nb_classifier.fit(X_train, y_train)

# Test it on the held-out test set (test_set was already cleaned in an earlier cell).
X_test = tfidf_vectorizer.transform(test_set['text'])
y_test = test_set['score']
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(classification_report(y_test, y_pred))

# Result: accuracy is mostly unchanged, a tiny increase from 0.8432 to 0.8444.
# The per-class balance does move: recall for class 0 rises from 0.85 to 0.89,
# while recall for class 1 drops from 0.83 to 0.80.

0.8444
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1252
           1       0.88      0.80      0.84      1248

   micro avg       0.84      0.84      0.84      2500
   macro avg       0.85      0.84      0.84      2500
weighted avg       0.85      0.84      0.84      2500



In [27]:
# Now on to the evaluation set, to see whether anything changes there.

In [28]:
# Reload and re-clean the evaluation set, then score it with the retrained
# vectorizer and classifier.
evaluation_set = pd.read_csv('evaluation.csv')
evaluation_set['text'] = evaluation_set['text'].apply(clean_text)
X_evaluation = tfidf_vectorizer.transform(evaluation_set['text'])
y_evaluation = evaluation_set['score']
y_pred_evaluation = nb_classifier.predict(X_evaluation)
accuracy_evaluation = accuracy_score(y_evaluation, y_pred_evaluation)
print(accuracy_evaluation)
print(classification_report(y_evaluation, y_pred_evaluation))

# Accuracy again shows only a minor increase, from 0.839 to 0.84.
# Caveat: this run dropped both stop_words='english' and max_features=6000, so
# the flat accuracy cannot be attributed to stop words alone. The per-class
# metrics also shifted: recall for class 0 went from 0.87 to 0.92 and recall
# for class 1 went from 0.81 to 0.77, so the change is not without influence.

0.84
              precision    recall  f1-score   support

           0       0.79      0.92      0.85      2482
           1       0.90      0.77      0.83      2518

   micro avg       0.84      0.84      0.84      5000
   macro avg       0.85      0.84      0.84      5000
weighted avg       0.85      0.84      0.84      5000

