**TF-IDF: Exercises**
    
- Humans 👦 show different emotions and feelings depending on the situation, and communicate them through facial expressions or words.

- On social media platforms such as Twitter and Instagram, many people share their views on a particular event or scenario through comments, and these comments may express feelings such as sadness, happiness, joy, sarcasm, fear, and many others.

- For a given comment/text, we are going to use classical NLP techniques to classify which emotion that comment expresses!

- We are going to use techniques like Bag of Words, n-grams, TF-IDF, etc. for text representation and apply different classification algorithms.

In [1]:
import pandas as pd
# Load the labelled comments. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv('Emotion_classify_Data.csv')
# Show (rows, columns) first, then preview the first five rows.
print(df.shape)
df.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [2]:
# Check class balance: how many comments carry each emotion label.
df['Emotion'].value_counts()

joy      2000
anger    2000
fear     1937
Name: Emotion, dtype: int64

In [3]:
# Integer code for each emotion label; the models below are trained on these codes.
emotion_to_num = {
    'joy': 0,
    'anger': 1,
    'fear': 2,
}

df['Emotion_num'] = df['Emotion'].map(emotion_to_num)

# Series.map yields NaN for any label that is missing from the mapping. Fail
# loudly here rather than letting NaN targets flow into the train/test split.
unmapped_labels = df.loc[df['Emotion_num'].isna(), 'Emotion'].unique()
assert len(unmapped_labels) == 0, f"Unmapped emotion labels: {list(unmapped_labels)}"

df.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,1
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,2


### Modelling Without Preprocessed Data

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the raw comments for evaluation. Stratifying on the label
# keeps the class proportions alike in both splits, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'],
    df['Emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_num'],
)

In [5]:
# Number of comments in the training split.
len(X_train)

4749

In [6]:
# Number of comments in the held-out test split.
len(X_test)

1188

In [7]:
# Per-class counts in the training targets (sanity check on the stratified split).
y_train.value_counts()

0    1600
1    1600
2    1549
Name: Emotion_num, dtype: int64

In [8]:
# Per-class counts in the test targets (sanity check on the stratified split).
y_test.value_counts()

0    400
1    400
2    388
Name: Emotion_num, dtype: int64

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Bag-of-words counts fed into a random forest, fitted on the current
# X_train / y_train split.
# The forest is seeded (same seed as the train/test split) so the report is
# reproducible on Restart & Run All; an unseeded forest is re-randomised on
# every fit and its scores drift from run to run.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('random', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92       400
           1       0.93      0.90      0.91       400
           2       0.92      0.92      0.92       388

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF weighted features fed into a random forest, fitted on the current
# X_train / y_train split.
# The forest is seeded (same seed as the train/test split) so the report is
# reproducible on Restart & Run All; an unseeded forest is re-randomised on
# every fit and its scores drift from run to run.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92       400
           1       0.93      0.89      0.91       400
           2       0.93      0.91      0.92       388

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [11]:
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words counts -> multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('multi_nb', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89       400
           1       0.88      0.92      0.89       400
           2       0.88      0.87      0.88       388

    accuracy                           0.89      1188
   macro avg       0.89      0.89      0.89      1188
weighted avg       0.89      0.89      0.89      1188



In [12]:
# TF-IDF weighted features -> multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('multi_nb', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90       400
           1       0.88      0.92      0.90       400
           2       0.91      0.87      0.89       388

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



### Modelling with Preprocessed Data

In [13]:
import spacy
# Load the spaCy "en_core_web_sm" pipeline once at module level; preprocess()
# below reuses this single instance for every comment.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its kept tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens that
    spaCy flags as stop words or punctuation are dropped; each remaining token
    is replaced by its lemma, and the lemmas are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [14]:
# Clean every comment with preprocess(). This calls the spaCy pipeline once per
# row, which likely makes this the slowest cell in the notebook.
df['Preprocessed_comment'] = df['Comment'].apply(preprocess)

In [15]:
# Preview the frame with the new Preprocessed_comment column next to the original text.
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num,Preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,1,m life feel appal
2,i sit here to write i start to dig out my feel...,fear,2,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,2,feel suspicious outside like rapture happen


In [16]:
# Original text of the row labelled 0, for comparison with its cleaned version.
df.loc[0, 'Comment']

'i seriously hate one subject to death but now i feel reluctant to drop it'

In [17]:
# Cleaned (stop words and punctuation removed, lemmatised) text of the row labelled 0.
df.loc[0, 'Preprocessed_comment']

'seriously hate subject death feel reluctant drop'

In [18]:
# Split again, this time on the preprocessed text, with the same test_size,
# seed and stratification as the raw-text split.
# NOTE: this rebinds X_train / X_test / y_train / y_test. Any earlier model cell
# re-run after this point will train on the preprocessed text instead of the raw text.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed_comment'],
    df['Emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_num'],
)

In [19]:
# Bag-of-words counts fed into a random forest, fitted on the current
# X_train / y_train split.
# The forest is seeded (same seed as the train/test split) so the report is
# reproducible on Restart & Run All; an unseeded forest is re-randomised on
# every fit and its scores drift from run to run.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('random', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       400
           1       0.92      0.94      0.93       400
           2       0.94      0.90      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [20]:
# TF-IDF weighted features fed into a random forest, fitted on the current
# X_train / y_train split.
# The forest is seeded (same seed as the train/test split) so the report is
# reproducible on Restart & Run All; an unseeded forest is re-randomised on
# every fit and its scores drift from run to run.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       400
           1       0.93      0.90      0.92       400
           2       0.92      0.93      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [21]:
# Bag-of-words counts -> multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('multi_nb', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       400
           1       0.89      0.91      0.90       400
           2       0.88      0.91      0.89       388

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [22]:
# TF-IDF weighted features -> multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('multi_nb', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       400
           1       0.90      0.91      0.91       400
           2       0.90      0.91      0.90       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188

