In [59]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from autocorrect import Speller
from wordcloud import WordCloud
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

### First things first, let's look a little bit at the data

In [60]:
test = pd.read_csv('test.txt', sep = ';',header = None )
test.head()
train = pd.read_csv('train.txt', sep = ';',header = None )
train.head()

Unnamed: 0,0,1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [61]:
train[1].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: 1, dtype: int64

In [62]:
test[1].value_counts()

joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: 1, dtype: int64

The data is already separated into a test file and a training file so we don't need to do a train test split. As we can see the data isn't really made equal.

In [67]:
train.rename(columns = {1:'labels', 0:'sentences'}, inplace = True)
test.rename(columns = {1:'labels', 0:'sentences'}, inplace = True)

# Preprocessing the data

- Remove characters that are not letters
- apply .lower() to every word
- tokenization
- remove the stop words
- stemming 
- Correct the spelling

Removing non common characters. 

In [68]:
train.sentences = train.sentences.apply(lambda x: re.sub('[^A-Za-z]',' ', x))
test.sentences = test.sentences.apply(lambda x: re.sub('[^A-Za-z]',' ', x))

Apply .lower()

In [69]:
train.sentences = train.sentences.str.lower()
test.sentences = test.sentences.str.lower()

Tokenization

In [70]:
train.sentences = train.sentences.apply(lambda x: word_tokenize(x))
test.sentences = test.sentences.apply(lambda x: word_tokenize(x))

Removing stop words, correct spelling and stemming

In [71]:
spell_correcter = Speller()
stemmer  = PorterStemmer()

def remover_correct_stemm(message_columns: pd.Series):
    list_of_emails = []
    for row in tqdm(message_columns):
        lst = []
        for word in row:
            #remove the words that are stopwords
            if word not in stopwords.words('english'):
                #corrects the spelling
                corrected_word = spell_correcter(word)
                #stems each word
                word_stemm = stemmer.stem(corrected_word)
                #makes a new sentence without the stop words, with the word stemmed and without spelling errors
                lst.append(word_stemm)
        #appends to the big list
        lst = ' '.join(lst)
        list_of_emails.append(lst)
    return list_of_emails

In [72]:
train.sentences =remover_correct_stemm(train.sentences)
test.sentences =remover_correct_stemm(test.sentences)

100%|██████████| 16000/16000 [16:26<00:00, 16.23it/s] 
100%|██████████| 2000/2000 [01:23<00:00, 23.81it/s]


In [73]:
train.sentences

0                                        didnt feel humili
1        go feel hopeless damn hope around someon care ...
2                     im grab minut post feel greedi wrong
3           ever feel nostalg fireplac know still properti
4                                             feel grouchi
                               ...                        
15995        brief time beanbag said anna feel like beaten
15996     turn feel pathet still wait tabl dub teach degre
15997                              feel strong good overal
15998                       feel like rude comment im glad
15999                         know lot feel stupid portray
Name: sentences, Length: 16000, dtype: object

In [75]:
from sklearn.naive_bayes import MultinomialNB
counter = CountVectorizer()
x_train = counter.fit_transform(train.sentences).toarray()
x_test = counter.transform(test.sentences).toarray()
model = MultinomialNB()
model.fit(x_train, train.labels)

MultinomialNB()

In [76]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test['labels'], model.predict(x_test)))
print('Accuracy:\n', accuracy_score(test['labels'], model.predict(x_test)))

              precision    recall  f1-score   support

       anger       0.84      0.66      0.74       275
        fear       0.79      0.59      0.68       224
         joy       0.76      0.93      0.84       695
        love       0.74      0.33      0.45       159
     sadness       0.75      0.89      0.82       581
    surprise       0.80      0.06      0.11        66

    accuracy                           0.77      2000
   macro avg       0.78      0.58      0.61      2000
weighted avg       0.77      0.77      0.75      2000

Accuracy:
 0.768
