<a href="https://colab.research.google.com/github/KareemKhaledd/text_emotion_detection/blob/main/Emotion_Detection_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text-Based emotion detection 

In [None]:
!pip install neattext
!pip install text_hammer 


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load EDA Pkgs
import pandas as pd
import numpy as np
import seaborn as sns
import neattext.functions as nfx
# Load ML Pkgs
 
# Estimators
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Transformers => vectorization 
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer 
 
# metrics 
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# to split the dataset to training and testing dataset 
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
import text_hammer as th

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [None]:
# Load Dataset
df = pd.read_csv("/content/drive/MyDrive/emotion_dataset_raw.csv")

In [None]:
df

In [None]:
df.shape

(34792, 2)

In [None]:
# Value Counts
df['Emotion'].value_counts()

In [None]:
df.drop(df[df.Emotion == "shame"].index, inplace=True)
df.drop(df[df.Emotion == "disgust"].index, inplace=True)
df.Emotion=df.Emotion.replace({"joy":0, 'sadness':1, "fear":2, "anger":3, "surprise":4, "neutral":5})

In [None]:
df['Emotion'].value_counts()

In [None]:
from tensorflow.keras.utils import to_categorical
df['Emotion'] = to_categorical(df['Emotion'])

In [None]:
# Plot
sns.countplot(x='Emotion',data=df)

In [None]:
df.isnull().sum()

## Sentiment Analysis 

In [None]:
! pip install textblob
from textblob import TextBlob
def get_sentiment(text):
    blob = TextBlob(text)
    sentiement = blob.sentiment.polarity
    result = ""
    if sentiement > 0 :
        result = "Positive"
    elif sentiement < 0 :
        result = "Negative"
    else :
        result = "Neutral"
    return result 

In [None]:
get_sentiment("i love programming")

In [None]:
df2=df 
df2['Sentiment'] = df2["Text"].apply(get_sentiment)

In [None]:
df2.head()

In [None]:
# let's compare between emotions and sentiments 
df2.groupby(['Emotion','Sentiment']).size()

In [None]:
df2.groupby(['Emotion','Sentiment']).size().plot(kind="bar")

In [None]:
# another way for plotting data 
sns.catplot(x="Emotion",hue="Sentiment",data=df2,kind="count",aspect=1.5)

## clean the Text 

In [None]:
# Data Cleaning
dir(nfx)

In [None]:
# User handles
df2['Clean_Text'] = df2['Text'].apply(nfx.remove_userhandles)

In [None]:
# Stopwords
df2['Clean_Text'] = df2['Clean_Text'].apply(nfx.remove_stopwords)

In [None]:

df2['Clean_Text'] = df2['Clean_Text'].apply(nfx.remove_hashtags)

In [None]:

df2['Clean_Text'] = df2['Clean_Text'].apply(nfx.remove_punctuations)

In [None]:
df2['Clean_Text'].str.replace(")","") 

In [None]:
df2['Clean_Text'].str.replace("(","") 

In [None]:
df2['Clean_Text'].str.replace(":","") 

In [None]:
df2['Clean_Text'].str.replace("'","") 

In [None]:
df2.head()

## Keyword extraction 

- exract the most common word in each class 

In [None]:
from collections import Counter 

In [None]:
def extraxt_keywords(text,num=50):
    tokens = [tok for tok in text.split()]
    most_common_tokens = Counter(tokens).most_common(num)
    return dict(most_common_tokens)

-let's check it , in a simple example on calculating the most common word in "joy" class 


In [None]:
emotion_list = df2['Emotion'].unique().tolist()
print(emotion_list)

[1.0, 0.0]


In [None]:
joy_list = df2[df2['Emotion']==0]['Clean_Text'].tolist()
for i in joy_list:
    print(i)

In [None]:
# make the list as a string so we can calculate the most common word 
joy_docx = ' '.join(joy_list)
print(joy_docx)



In [None]:
# extract the most common words in joy_docx 
key_dict = extraxt_keywords(joy_docx)
key_dict

In [None]:
import matplotlib.pylab as plt
# plot the most common word 
def plot_most_common_words(mydict,emotion_name):
    df_01 = pd.DataFrame(mydict.items(),columns=['taken','count'])
    plt.figure(figsize=(20,10))
    plt.title("Plot of {}".format(emotion_name))
    sns.barplot(x='taken',y='count',data=df_01)
    plt.xticks(rotation=45)
    plt.show()

In [None]:
plot_most_common_words(key_dict,"Joy")

In [None]:
sadness_list = df2[df2['Emotion']==1]['Clean_Text'].tolist() # get the list of emotion_category 
sadness_docx = ' '.join(sadness_list) # make it as one string 
key_dict_sadness = extraxt_keywords(sadness_docx) # count each word 
print(key_dict_sadness)
plot_most_common_words(key_dict_sadness,"sadness") # plot the results 

In [None]:
# word cloud
#!pip install wordcloud
from wordcloud import WordCloud 

In [None]:
def plot_wordcloud(docx):
    myWordCloud = WordCloud().generate(docx)
    plt.figure(figsize=(15,10))
    plt.imshow(myWordCloud,interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
plot_wordcloud(joy_docx)

In [None]:
plot_wordcloud(sadness_docx)

## Machine learning Text classification 

+ SVM 
+ naive bayes 
+ logestic regression 
+ KNN 
+ Descision tree 

+ compare with sparkNLP / NLU john snows lab 

In [None]:
# Features & Labels
Xfeatures = df2['Clean_Text']
ylabels = df2['Emotion']
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(Xfeatures.head())
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(ylabels.head())

machine learning model cann't understand the text so we have to transfer the text to numbers , this process called vectorization 

In [None]:
#vectorization 
cv = CountVectorizer()
x=cv.fit_transform(Xfeatures)

In [None]:
# get features by name 
cv.get_feature_names()

In [None]:
# to Dense Numpy array 
#x.toarray()

In [None]:
#  Split the Dataset
x_train,x_test,y_train,y_test = train_test_split(x,ylabels,test_size=0.3,random_state=42)

## Build our model 


In [None]:
# y_train = to_categorical(df.Sentiment.values)
# y_test = to_categorical(df_test.Sentiment.values)


## SVM

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

In [None]:
np.random.seed(610)

linearr = svm.SVC(kernel='linear', decision_function_shape='ovr').fit(x_train, y_train)
predictionslinn = linearr.predict(x_test)
print("linear: ",accuracy_score(y_test,predictionslinn))

rbff = svm.SVC(kernel='rbf', decision_function_shape='ovr').fit(x_train, y_train)
predictionslinnn = rbff.predict(x_test)
print("rbf: ",accuracy_score(y_test,predictionslinnn))


polyy = svm.SVC(kernel='poly', decision_function_shape='ovr').fit(x_train, y_train)
predictionslinp = polyy.predict(x_test)
print("poly: ",accuracy_score(y_test,predictionslinp))

sigg = svm.SVC(kernel='sigmoid', decision_function_shape='ovr').fit(x_train, y_train)
predictionslins = sigg.predict(x_test)
print("sig: ",accuracy_score(y_test,predictionslins))


linear:  0.7911610930255499
rbf:  0.7979678405839992
poly:  0.7138206569991121
sig:  0.7881029890500147


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier( criterion = 'entropy', random_state = 42)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=42)

In [None]:
y_pred = classifier.predict(x_test)

In [None]:

print("acc: ",accuracy_score(y_test,y_pred))


acc:  0.789286771234093


## Naive Bayes

In [None]:
nv_model = MultinomialNB()
nv_model.fit(x_train,y_train)

MultinomialNB()

In [None]:
# check the accuracy of the model 
# method 1 : 
nv_model.score(x_test,y_test)

0.7911610930255499

## logistic Regression

In [None]:

lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)


LogisticRegression()

In [None]:
# check the accuracy of the linear regression model 
lr_model.score(x_test , y_test)

0.8002367564368156

In [None]:
def predict_emotion(text,model):
    # vectorizing the text that will be an input to the model
    vectorized_text = cv.transform(text).toarray() 
    prediction = model.predict(vectorized_text)
    prediction_probability = model.predict_proba(vectorized_text)
    prediction_percentage_for_all = dict(zip(model.classes_ , prediction_probability[0])) 
    print("Prediction:{}, Prediction score :{}".format(prediction[0],np.max(prediction_probability)))
    #return prediction_percentage_for_all
                                         
sample_test = ["i love artificial intelligence so much "] # text  to test the model with 
predict_emotion(sample_test,nv_model)



Prediction:1.0, Prediction score :0.8645504024219197


In [None]:
# a single prediction using linear regression model 
sample_test2 = ["i love DEBI "] # text  to test the model with 
predict_emotion(sample_test2,lr_model) 

Prediction:1.0, Prediction score :0.6862010432607868


### Save the Model

In [None]:
import joblib 

In [None]:
model_file=open("Text_based_emotion_classifier_nv_model_26_april_2022.pkl","wb")
joblib.dump(lr_model,model_file)
model_file.close()