In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

<br/>
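Uncomment the following cell to always display DataFrames in full (all rows, all columns, untruncated cell contents). It is left disabled because rendering whole DataFrames makes the notebook noticeably slow.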

In [None]:
# Optional pandas display settings, intentionally left commented out.
# Uncommenting them lifts the row/column/width truncation so frames print in full.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.max_colwidth', None)

# Data cleaning
Read the training, test, and evaluation sets, then check each one for nulls or abnormal values. The given datasets appear to be clean: 'score' only takes the values 0 and 1 in roughly equal proportions, and no column contains nulls.

In [2]:
# Load the three splits from CSV files located in the working directory.
training_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')
evaluation_set = pd.read_csv('evaluation.csv')

# For every split, report the distribution of 'score' values followed by the
# number of nulls per column. A separator line is printed between consecutive
# splits (but not after the last one).
for position, split in enumerate((training_set, test_set, evaluation_set)):
    if position > 0:
        print(' ')
    label_counts = split['score'].value_counts()  # count of each distinct 'score' value
    null_counts = split.isna().sum()              # count of nulls in each column
    print(label_counts, '\n', null_counts)

1    3752
0    3748
Name: score, dtype: int64 
 score    0
text     0
dtype: int64
 
0    1252
1    1248
Name: score, dtype: int64 
 score    0
text     0
dtype: int64
 
1    2518
0    2482
Name: score, dtype: int64 
 score    0
text     0
dtype: int64



<br/>
Define a function for cleaning the 'text' feature.


In [3]:
def clean_text(text):
    """Strip markup and noise from a raw text string before vectorization.

    Every removed span is replaced by a single space instead of being deleted,
    so the words on either side of it are never glued together.

    Steps, in order:
      1. HTML tags (anything from ``<`` to the next ``>`` on the same line).
      2. URLs (``http://``, ``https://`` or ``www.`` up to the next whitespace).
      3. @-mentions.
      4. Every remaining character that is not a word character, whitespace,
         or a code point in the range U+1F000-U+1F9FF.

    URLs are removed before mentions on purpose: a URL such as
    ``https://medium.com/@user/post`` contains an ``@``, and stripping the
    mention first would cut the URL in half and leave its tail (``post``)
    behind as a bogus token. Tags are removed before URLs so that a URL inside
    a tag attribute does not swallow the text that follows the tag.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where content was removed.
    """
    text = re.sub(r'<.*?>', ' ', text)                       # substitute HTML tags with spaces
    text = re.sub(r'http[s]?://\S+|www\.\S+', ' ', text)     # substitute URLs with spaces
    text = re.sub(r'@\w+', ' ', text)                        # substitute mentions with spaces (\w already covers '_')
    text = re.sub(r'[^\w\s\U0001F000-\U0001F9FF]', ' ', text)  # substitute punctuation with spaces (\w already covers digits)
    return text

<br/>
Apply the `clean_text` function to the 'text' column of all three datasets.

In [4]:
# Overwrite the 'text' column of every split with its cleaned version.
for frame in (training_set, test_set, evaluation_set):
    frame['text'] = frame['text'].apply(clean_text)

# Feature engineering
Use TF-IDF to encode the 'text' feature, keeping at most 6000 features, removing scikit-learn's built-in 'english' stop words, and considering both unigrams and bigrams. The vectorizer is fit on the training set only and stored in `tfidf_vectorizer`. The same fitted `tfidf_vectorizer` (i.e. the same vocabulary and weights) is then used to encode the test and evaluation sets.

In [5]:
# A single vectorizer instance is shared by all three splits so that they are
# encoded against the same vocabulary and weights.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # unigrams and bigrams
    stop_words='english',
    max_features=6000,
)

# Labels come straight from the 'score' column of each split.
y_train = training_set['score']
y_test = test_set['score']
y_evaluation = evaluation_set['score']

# Fit on the training text only; the other splits reuse the fitted vectorizer.
X_train = tfidf_vectorizer.fit_transform(training_set['text'])
X_test = tfidf_vectorizer.transform(test_set['text'])
X_evaluation = tfidf_vectorizer.transform(evaluation_set['text'])

# Training
Use MultinomialNB as the naive Bayes classifier. Apply grid search with 5-fold cross-validation to find the best smoothing parameter alpha for MultinomialNB. It turns out that the best alpha is 2.9.

In [6]:
# Candidate values for the 'alpha' parameter of MultinomialNB.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 2.8, 2.9, 3.0, 4.0],
}

# Exhaustive search over the candidates with 5-fold cross-validation.
# NOTE(review): X_train comes from a vectorizer fit on the whole training set,
# so every validation fold has already contributed to the vocabulary/weights.
# Consider tuning a vectorizer + classifier Pipeline instead.
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the winning value for the final model and show it as the cell output.
best_alpha = grid_search.best_params_['alpha']
best_alpha

2.9

<br/>
Then train MultinomialNB with the best alpha 2.9, and save the model as nb_classifier.

In [7]:
# Train the final classifier on the full training matrix using the alpha
# selected by the grid search.
nb_classifier = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)
nb_classifier  # echo the fitted estimator as the cell output

MultinomialNB(alpha=2.9)

# Testing and evaluation
Use `nb_classifier` to predict on the test and evaluation sets, and check the accuracy on each.

In [8]:
# Predict labels for the test split and compare them with the true labels.
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(accuracy)

0.8432


In [9]:
# Predict labels for the evaluation split and compare them with the true labels.
y_pred_evaluation = nb_classifier.predict(X_evaluation)
accuracy_evaluation = accuracy_score(y_true=y_evaluation, y_pred=y_pred_evaluation)

print(accuracy_evaluation)

0.8388
