In [21]:
# Importing packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import WordCloud    #library to visualize text data
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer   #Transform text to vectors.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,classification_report,precision_score,recall_score

In [2]:
# Load the training and validation splits: ';'-separated files, columns named 'text' and 'label'
df_train = pd.read_csv(
    '../input/emotions-dataset-for-nlp/train.txt',
    sep=';',
    names=['text', 'label'],
)
df_val = pd.read_csv(
    '../input/emotions-dataset-for-nlp/val.txt',
    sep=';',
    names=['text', 'label'],
)

# Report (rows, columns) for each split
print(df_train.shape)
print(df_val.shape)

In [3]:
# Stack the training and validation rows into one frame with a fresh 0..n-1 index
df = pd.concat([df_train, df_val], ignore_index=True)

print(df.shape)
df.sample(5)   # show five randomly chosen rows

In [4]:
# Class distribution of the raw emotion labels.
# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so `sns.countplot(df.label)` is misinterpreted there instead of counting the labels.
sns.countplot(data=df, x='label')

## <---> love, surprise, joy = positive statement
## <---> sadness, anger, fear = negative statement

In [5]:
# Manually encode the emotion labels as binary sentiment: 1 = positive, 0 = negative

def manual_encoder(df):
    """Binary-encode emotion labels in place and return the encoded object.

    ``surprise``, ``love`` and ``joy`` are replaced with 1; ``sadness``,
    ``anger`` and ``fear`` are replaced with 0. Any other value is left
    untouched.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Object holding the emotion labels. It is modified in place.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The same object that was passed in, so callers can also write
        ``col = manual_encoder(col)`` instead of relying on the side effect.
    """
    sentiment_by_emotion = {
        "surprise": 1,
        "love": 1,
        "joy": 1,
        "sadness": 0,
        "anger": 0,
        "fear": 0,
    }
    # A single mapping-based replace instead of six separate full passes over the data
    df.replace(to_replace=sentiment_by_emotion, inplace=True)
    return df

In [6]:
# Encode a detached copy of the label column and assign the result back explicitly.
# Calling manual_encoder(df['label']) directly relies on an in-place change to a column
# selection propagating to `df`, which does not happen under pandas Copy-on-Write.
labels = df['label'].copy()
manual_encoder(labels)   # mutates `labels` in place
# Force an integer dtype so the classifier gets numeric targets; this raises if an unmapped label is left
df['label'] = labels.astype('int64')

In [7]:
# Spot-check five random rows after encoding (the sample is unseeded, so the rows differ between runs)
df.sample(5)

In [8]:
# Class balance after the binary encoding.
# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so `sns.countplot(df.label)` is misinterpreted there instead of counting the labels.
sns.countplot(data=df, x='label')

# Data Preprocessing

In [9]:
# Shared WordNet lemmatizer instance; data_prep reads this module-level name
lm = WordNetLemmatizer()

In [10]:
def data_prep(df):
    """Normalise an iterable of raw texts into cleaned, lemmatised strings.

    Each item is converted with ``str``, every character outside ``A-Za-z``
    is replaced by a space, the text is lower-cased and split on whitespace,
    English stop words are dropped, and the remaining tokens are lemmatised
    with the module-level ``lm`` and re-joined with single spaces.

    Parameters
    ----------
    df : iterable
        Texts to clean (for example a pandas Series of strings).

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # Build the stop-word set and the regex once; the stop-word set used to be
    # rebuilt for every single token, which dominated the running time.
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile("[^A-Za-z]")

    corpus = []
    for i in df:
        tokens = non_letters.sub(" ", str(i)).lower().split()
        corpus.append(' '.join(lm.lemmatize(word) for word in tokens if word not in stop_words))
    return corpus
    
    

In [11]:
# Clean the text column of the combined frame into a list of normalised strings
corpus = data_prep(df['text'])
# Peek at the first ten cleaned documents
corpus[:10]

# Word cloud of the cleaned corpus
plt.figure(figsize=(20,8))
# Join whole documents with single spaces. The earlier nested loop walked the entire corpus
# once per row (quadratic work) and called " ".join() on each string, which inserts a space
# between every character and so destroyed the words before they reached WordCloud.
word_cloud = " ".join(corpus)
wordcloud = WordCloud().generate(word_cloud)
plt.imshow(wordcloud)

## Bag of words

In [13]:
# Bag-of-words features built from unigrams and bigrams
cv = CountVectorizer(ngram_range=(1,2))
# NOTE(review): the vocabulary is fitted on the full corpus before the train/test split below,
# so the hold-out rows influence the feature space — consider fitting on the training rows only.
traindata = cv.fit_transform(corpus)
# Feature matrix and target labels used by the split and the classifier
x = traindata
y = df['label']

In [22]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [23]:
# Multinomial Naive Bayes classifier constructed with no arguments (library defaults)
nv = MultinomialNB()

In [24]:
# Fit the classifier on the training portion of the split
nv.fit(x_train, y_train)

In [25]:
# Predict labels for the held-out rows
ypred = nv.predict(x_test)

In [36]:
# Evaluate the hold-out predictions: the two tabular reports first, then the scalar scores
for _banner, _report in (
    ('###### confusion matrix ######\n', confusion_matrix),
    ('\n###### classification report ######\n', classification_report),
):
    print(_banner)
    print(_report(y_test, ypred))

for _caption, _score in (
    ('\n accuracy score :', accuracy_score),
    ('\n precision score :', precision_score),
    ('\n recall score :', recall_score),
):
    print(_caption, _score(y_test, ypred))

In [39]:
# Load the separate test file: ';'-separated, columns named 'text' and 'label'
df_test = pd.read_csv(
    '../input/emotions-dataset-for-nlp/test.txt',
    sep=';',
    names=['text', 'label'],
)
df_test.sample(4)   # preview four random rows

In [41]:
# Encode a detached copy of the test labels and assign the result back explicitly.
# Calling manual_encoder(df_test['label']) directly relies on an in-place change to a column
# selection propagating to `df_test`, which does not happen under pandas Copy-on-Write.
test_labels = df_test['label'].copy()
manual_encoder(test_labels)   # mutates `test_labels` in place
# Force an integer dtype so the metrics compare numeric labels; this raises if an unmapped label is left
df_test['label'] = test_labels.astype('int64')

# Keep the cleaned test text under its own name so the training `corpus` is not overwritten
corpus_test = data_prep(df_test['text'])
# transform (not fit_transform): reuse the vocabulary already learned by `cv`
testdata = cv.transform(corpus_test)
pred = nv.predict(testdata)

In [42]:
# Evaluate the predictions for the test file: the two tabular reports first, then the scalar scores
_y_true = df_test['label']

for _banner, _report in (
    ('###### confusion matrix ######\n', confusion_matrix),
    ('\n###### classification report ######\n', classification_report),
):
    print(_banner)
    print(_report(_y_true, pred))

for _caption, _score in (
    ('\n accuracy score :', accuracy_score),
    ('\n precision score :', precision_score),
    ('\n recall score :', recall_score),
):
    print(_caption, _score(_y_true, pred))