In [1]:
import numpy as np
import pandas as pd

Loading dataset

In [2]:
data = pd.read_csv('/content/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
data.shape

(40000, 3)

Data Preprocessing

In [4]:
data['tweet_id'].nunique()

40000

In [5]:
data.drop('tweet_id', axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
data['sentiment'].nunique()

13

In [8]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [9]:
data.isna().sum()      #missing value check

sentiment    0
content      0
dtype: int64

In [10]:
data['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label =  LabelEncoder()

In [13]:
data['sentiment'] = label.fit_transform(data['sentiment'])

In [14]:
data.head()

Unnamed: 0,sentiment,content
0,2,@tiffanylue i know i was listenin to bad habi...
1,10,Layin n bed with a headache ughhhh...waitin o...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,@dannycastillo We want to trade with someone w...


Text preprocessing

In [15]:
import string
def remove_punctuations(text):
  punc_free = ''.join([i for i in text if i not in string.punctuation])
  return punc_free

In [16]:
import nltk
nltk.download('punkt')
def tokenize(text):
  words = nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
#stopwords removal
nltk.download('stopwords')
sw = nltk.corpus.stopwords.words('english')
def remove_sw(text):
  output = [i for i in text if i not in sw]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
#Lemmantization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
word_lem = WordNetLemmatizer()
def lemm(text):
  lemm_text = [word_lem.lemmatize(word) for word in text]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def preprocess(df_col):
  corpus=[]
  for item in df_col:
    new_item = remove_punctuations(item)
    new_item = new_item.lower()
    new_item = tokenize(new_item)
    new_item = remove_sw(new_item)
    new_item = lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [20]:
corpus = preprocess(data['content'])

In [21]:
corpus[0:10]

['tiffanylue know listenin bad habit earlier started freakin part',
 'layin n bed headache ughhhhwaitin call',
 'funeral ceremonygloomy friday',
 'want hang friend soon',
 'dannycastillo want trade someone houston ticket one',
 'repinging ghostridah14 didnt go prom bc bf didnt like friend',
 'sleep im thinking old friend want he married damn amp want 2 scandalous',
 'hmmm httpwwwdjherocom',
 'charviray charlene love miss',
 'kelcouch im sorry least friday']

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
vec_data = cv.fit_transform(corpus)
x = vec_data
y = data['sentiment']

Model Building and Evaluation

In [23]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 10)
clf.fit(x,y)

In [27]:

from sklearn.metrics import accuracy_score
y_pred = clf.predict(x)
accuracy_score(y,y_pred)

0.9741