**1. Import required libraries**

In [None]:
import numpy as np
import pandas as pd

**2. Read the dataset**

In [None]:
# Load the tweet-emotion CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path is environment-specific (it looks like a
# Colab upload location) — adjust it when running elsewhere.
data=pd.read_csv('/content/tweet_emotions.csv')

**3. Understand the dataset**

In [None]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
data.shape

(40000, 3)

In [None]:
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')

**4. Find the missing values**

In [None]:
data.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

**5. Drop the id column**

In [None]:
# Remove the identifier column; it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: on a second execution 'tweet_id' is already
# gone and a plain drop would raise KeyError.
data=data.drop(columns='tweet_id',errors='ignore')

In [None]:
data.columns

Index(['sentiment', 'content'], dtype='object')

**6. Target column**

In [None]:
# First look at the target / feature split: `y` is the sentiment column, `x` is everything else.
# Both names are assigned again in the Bag of Words section once the text has been vectorised.
y = data['sentiment']
x = data.drop(columns=['sentiment'])

In [None]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
data['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

**7. Encode the sentiment labels as two classes**

In [None]:
def custom_encoder(df):
  """Encode the emotion labels as two integer classes and return the result.

  Labels 'empty', 'sadness', 'enthusiasm', 'neutral', 'worry' and 'surprise'
  become 1; 'love', 'fun', 'hate', 'happiness', 'boredom', 'relief' and
  'anger' become 0. Any other value is left unchanged.

  Parameters
  ----------
  df : pandas Series or DataFrame
    Object holding the emotion labels. It is NOT modified; a new object is
    returned, so the call is safe to repeat and never writes through to the
    frame the column came from.

  Returns
  -------
  Same type as `df`, with the labels replaced by 1 / 0.
  """
  # NOTE(review): the grouping is kept exactly as originally written — confirm that
  # e.g. 'enthusiasm' and 'surprise' are meant to share a class with 'sadness'.
  class_one = ['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise']
  class_zero = ['love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger']
  encoded = df.replace(to_replace=class_one, value=1)
  encoded = encoded.replace(to_replace=class_zero, value=0)
  # Once every label is replaced the values are plain ints; infer_objects() gives them an
  # integer dtype explicitly instead of relying on replace() downcasting silently.
  return encoded.infer_objects()

In [None]:
data['sentiment'] = custom_encoder(data['sentiment'])

In [None]:
data.head()

Unnamed: 0,sentiment,content
0,1,@tiffanylue i know i was listenin to bad habi...
1,1,Layin n bed with a headache ughhhh...waitin o...
2,1,Funeral ceremony...gloomy friday...
3,1,wants to hang out with friends SOON!
4,1,@dannycastillo We want to trade with someone w...


In [None]:
data.sentiment.value_counts(normalize=True)

sentiment
1    0.650875
0    0.349125
Name: proportion, dtype: float64

**8. Data processing**

In [None]:
import string
def remove_punctuations(text):
  """Return `text` with every character found in `string.punctuation` deleted.

  Characters are removed, not replaced by spaces, so words separated only by
  punctuation are joined together ('a...b' becomes 'ab').
  """
  kept = []
  for ch in text:
    if ch in string.punctuation:
      continue
    kept.append(ch)
  return ''.join(kept)

In [None]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look up the
# 'punkt_tab' resource rather than the older 'punkt' one, so both are fetched to keep
# this cell working on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
def tokenize(text):
  """Split `text` into a list of tokens using NLTK's `word_tokenize`."""
  return nltk.word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
nltk.download('stopwords')
sw = nltk.corpus.stopwords.words('english')
# Set copy of `sw` used only for membership tests; `sw` itself stays a list.
_sw_lookup = frozenset(sw)
def remove_sw(text):
  """Return the tokens of `text` that are not English stopwords.

  Parameters
  ----------
  text : iterable of str
    Tokens to filter (in this notebook, the lower-cased output of `tokenize`).

  Returns
  -------
  list
    The tokens that do not appear in NLTK's English stopword list, in their
    original order.
  """
  # Membership is tested against a frozenset instead of the list so each token costs
  # one hash lookup rather than a scan of the whole stopword list.
  return [token for token in text if token not in _sw_lookup]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
word_lem = WordNetLemmatizer()
def lemm(text):
  """Lemmatize each token of `text` with the shared `word_lem` lemmatizer.

  No part-of-speech tag is passed, so the lemmatizer's default is used for
  every token. Returns a new list with one lemma per input token.
  """
  lemmas = []
  for token in text:
    lemmas.append(word_lem.lemmatize(token))
  return lemmas

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def preprocess(df_col):
  """Clean every text in `df_col` and return the results as a list of strings.

  Each item goes through, in this order: punctuation removal, lower-casing,
  tokenization, stopword removal and lemmatization. The surviving tokens are
  joined back together with single spaces.
  """
  def _clean(raw_text):
    # Same pipeline order as described in the docstring.
    tokens = tokenize(remove_punctuations(raw_text).lower())
    tokens = lemm(remove_sw(tokens))
    return ' '.join(map(str, tokens))

  return [_clean(entry) for entry in df_col]

In [None]:
corpus = preprocess(data['content'])

In [None]:
corpus[0:10]

['tiffanylue know listenin bad habit earlier started freakin part',
 'layin n bed headache ughhhhwaitin call',
 'funeral ceremonygloomy friday',
 'want hang friend soon',
 'dannycastillo want trade someone houston ticket one',
 'repinging ghostridah14 didnt go prom bc bf didnt like friend',
 'sleep im thinking old friend want he married damn amp want 2 scandalous',
 'hmmm httpwwwdjherocom',
 'charviray charlene love miss',
 'kelcouch im sorry least friday']

**9. Bag of words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: ngram_range=(1,2) counts single words as well as two-word sequences.
cv = CountVectorizer(ngram_range=(1,2))
# The vocabulary is learned from the whole cleaned corpus; `cv` is reused later to
# transform the test text with that same vocabulary.
vec_data = cv.fit_transform(corpus)
# Model inputs: the sparse count matrix and the encoded sentiment column.
x = vec_data
y = data['sentiment']

In [None]:
x

<40000x253419 sparse matrix of type '<class 'numpy.int64'>'
	with 570570 stored elements in Compressed Sparse Row format>

In [None]:
y

0        1
1        1
2        1
3        1
4        1
        ..
39995    1
39996    0
39997    0
39998    0
39999    0
Name: sentiment, Length: 40000, dtype: int64

**10. Train a RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap samples and random feature subsets), so a fixed
# random_state is set to make the fitted model and every score below reproducible on re-run.
clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
clf.fit(x,y)

**11. Accuracy on the training data**

In [None]:
from sklearn.metrics import accuracy_score
# NOTE: `clf` was fitted on this same x / y, so the value below is training accuracy.
# It shows how well the forest memorised the data, not how it performs on unseen tweets.
y_pred = clf.predict(x)
accuracy_score(y,y_pred)

0.9972

**12. Testing**

In [None]:
# Read the CSV using its own header row and keep only the two columns needed here.
# Passing names=['content', 'sentiment'] to this three-column file turned the header into a
# data row and shifted the columns, so 'content' held the emotion labels and 'sentiment'
# held the tweet text — which is why the accuracy came out as 0.0.
# NOTE(review): this is the same file the model was trained on, so the score computed from
# it is not an independent hold-out estimate.
test_df = pd.read_csv('/content/tweet_emotions.csv', delimiter=',', usecols=['content', 'sentiment'], on_bad_lines='skip')

In [None]:
test_df.head()

Unnamed: 0,content,sentiment
tweet_id,sentiment,content
1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
1956967696,sadness,Funeral ceremony...gloomy friday...
1956967789,enthusiasm,wants to hang out with friends SOON!


In [None]:
x_test = test_df['content']
y_test = test_df['sentiment']

In [None]:
y_test = custom_encoder(y_test)

In [None]:
y_test

tweet_id                                                content
1956967341    @tiffanylue i know  i was listenin to bad habi...
1956967666    Layin n bed with a headache  ughhhh...waitin o...
1956967696                  Funeral ceremony...gloomy friday...
1956967789                 wants to hang out with friends SOON!
                                    ...                        
1753918954                                     @JohnLloydTaylor
1753919001                       Happy Mothers Day  All my love
1753919005    Happy Mother's Day to all the mommies out ther...
1753919043    @niariley WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEE...
1753919049    @mopedronin bullet train from tokyo    the gf ...
Name: sentiment, Length: 40001, dtype: object

In [None]:
x_test = preprocess(x_test)

In [None]:
x_test[0:5]

['sentiment', 'empty', 'sadness', 'sadness', 'enthusiasm']

In [None]:
x_test = cv.transform(x_test)

In [None]:
y_pred = clf.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred)

0.0