# imports

In [1]:
import pandas as pd 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer


# Loading and cleaning

In [2]:
df=pd.read_csv('eng_dataset.csv')
copy=df.copy()
df.tail()

Unnamed: 0,ID,sentiment,content
7097,40781,sadness,@VivienLloyd Thank you so much! Just home - st...
7098,40782,sadness,Just put the winter duvet on ☃️❄️🌬☔️
7099,40783,sadness,@SilkInSide @TommyJoeRatliff that's so pretty!...
7100,40784,sadness,@BluesfestByron second artist announcement loo...
7101,40785,sadness,I can literally eat creamy pesto pasta topped ...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7102 entries, 0 to 7101
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         7102 non-null   int64 
 1   sentiment  7102 non-null   object
 2   content    7102 non-null   object
dtypes: int64(1), object(2)
memory usage: 166.6+ KB


In [4]:
df['sentiment'].value_counts()

fear       2252
anger      1701
joy        1616
sadness    1533
Name: sentiment, dtype: int64

In [5]:
data=df.iloc[:,1:3]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7102 entries, 0 to 7101
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  7102 non-null   object
 1   content    7102 non-null   object
dtypes: object(2)
memory usage: 111.1+ KB


# Cleaning and preprocessing

In [6]:
data["content"]=data["content"].apply(lambda content:" ".join(word.lower() for word in content.split()) )
data.tail()

Unnamed: 0,sentiment,content
7097,sadness,@vivienlloyd thank you so much! just home - st...
7098,sadness,just put the winter duvet on ☃️❄️🌬☔️
7099,sadness,@silkinside @tommyjoeratliff that's so pretty!...
7100,sadness,@bluesfestbyron second artist announcement loo...
7101,sadness,i can literally eat creamy pesto pasta topped ...


In [7]:
data["content"]=data["content"].apply(lambda content: re.sub(r"[^\w\s]","",content))
data.tail()

Unnamed: 0,sentiment,content
7097,sadness,vivienlloyd thank you so much just home stunn...
7098,sadness,just put the winter duvet on
7099,sadness,silkinside tommyjoeratliff thats so pretty i l...
7100,sadness,bluesfestbyron second artist announcement look...
7101,sadness,i can literally eat creamy pesto pasta topped ...


def remove_stop_words(content):
    without_stopwords=[]
    tokenized=word_tokenize(content)
    for word in tokenized:
        if word not in stop:
            without_stopwords.append(str(TextBlob(word).correct()))
    return " ".join(without_stopwords)

In [8]:
stop=stopwords.words("english")
data["content"]=data["content"].apply(lambda content: " ".join(word for word in word_tokenize(content) if word not in stop))
data.tail()

Unnamed: 0,sentiment,content
7097,sadness,vivienlloyd thank much home stunned happy dont...
7098,sadness,put winter duvet
7099,sadness,silkinside tommyjoeratliff thats pretty love s...
7100,sadness,bluesfestbyron second artist announcement look...
7101,sadness,literally eat creamy pesto pasta topped grille...


In [9]:
ps=PorterStemmer()
data["content"]=data["content"].apply(lambda content: ' '.join([ps.stem(word) for word in word_tokenize(content)]))
data["content"]=data["content"].apply(lambda content: str(TextBlob(content).correct()))
data.tail()

Unnamed: 0,sentiment,content
7097,sadness,vivienlloyd thank much home sun happy dont thi...
7098,sadness,put winter due
7099,sadness,silkinsid tommyjoeratliff that pretty love sky...
7100,sadness,bluesfestbyron second artist announce look goo...
7101,sadness,later eat cream pest past top drill chicken su...


# Label Encoding

In [10]:
data['sentiment'].value_counts()

fear       2252
anger      1701
joy        1616
sadness    1533
Name: sentiment, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['sentiment_label']=le.fit_transform(data['sentiment'])
data['sentiment_label'].value_counts()

1    2252
0    1701
2    1616
3    1533
Name: sentiment_label, dtype: int64

# Train split test

In [12]:
from sklearn.model_selection import train_test_split
X=data['content']
y=data['sentiment_label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,stratify=y)

# Feature engineering

## 1)Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow1=cv.fit_transform(X_train)
print(bow1.shape)
bow1_test=cv.transform(X_test)
print(bow1_test.shape)

(5326, 9538)
(1776, 9538)


## 2)TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bow2=tfidf.fit_transform(X_train)
bow2_test=tfidf.transform(X_test)

# Model building

## 1)Naive bayes

In [15]:
from sklearn.metrics import accuracy_score
def build_model(model,bow_train,y_train,bow_test):
    model.fit(bow_train,y_train)
    pred=model.predict(bow_test)
    return accuracy_score(y_test,pred)
    

In [16]:
from sklearn.naive_bayes import MultinomialNB
print("accuracy using count vectorizer: ", build_model(MultinomialNB(),bow1,y_train,bow1_test))
print("accuracy using count tfidf: ", build_model(MultinomialNB(),bow2,y_train,bow2_test))


accuracy using count vectorizer:  0.8057432432432432
accuracy using count tfidf:  0.740990990990991


In [17]:
# tweet=["i am having fun "]
# nb=MultinomialNB()
# nb.fit(bow1,y_train)
# test=cv.transform(tweet)
# prediction=nb.predict(test)[0]
# if (prediction==0):
#     print('fear')
# if (prediction==1):
#     print('anger')
# if (prediction==2):
#     print('joy')
# if (prediction==3):
#     print('sadness')

## 2) Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression
print("accuracy using count vectorizer: ", build_model(LogisticRegression(),bow1,y_train,bow1_test))
print("accuracy using count tfidf: ", build_model(LogisticRegression(),bow2,y_train,bow2_test))

accuracy using count vectorizer:  0.8344594594594594
accuracy using count tfidf:  0.8400900900900901


In [20]:
tweet=["i am having fun "]
lr=LogisticRegression()
lr.fit(bow1,y_train)
test=cv.transform(tweet)
prediction=lr.predict(test)[0]
if (prediction==0):
    print('fear')
if (prediction==1):
    print('anger')
if (prediction==2):
    print('joy')
if (prediction==3):
    print('sadness')

anger
