# imports

In [1]:
import pandas as pd 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer


# Loading and cleaning

In [2]:
df=pd.read_csv('ISEAR.csv')
df.columns=['sentiment','content','unnamed']
df=df.loc[:,['sentiment','content']]
copy=df.copy()
df.head()

Unnamed: 0,sentiment,content
0,fear,Every time I imagine that someone I love or I ...
1,anger,When I had been obviously unjustly treated and...
2,sadness,When I think about the short time that we live...
3,disgust,At a gathering I found myself involuntarily si...
4,shame,When I realized that I was directing the feeli...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7515 entries, 0 to 7514
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  7515 non-null   object
 1   content    7515 non-null   object
dtypes: object(2)
memory usage: 117.6+ KB


In [4]:
df['sentiment'].value_counts()

joy        1091
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1049
guit          1
Name: sentiment, dtype: int64

In [5]:
df['sentiment']=df['sentiment'].apply(lambda x:'guilt' if x=='guit' else x)
data=df.iloc[:,0:2]
data['sentiment'].value_counts()

joy        1091
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1050
Name: sentiment, dtype: int64

# Cleaning and preprocessing

In [6]:
data["content"]=data["content"].apply(lambda content:" ".join(word.lower() for word in content.split()) )
data.tail()

Unnamed: 0,sentiment,content
7510,shame,two years back someone invited me to be the tu...
7511,shame,i had taken the responsibility to do something...
7512,fear,i was at home and i heard a loud sound of spit...
7513,guilt,i did not do the homework that the teacher had...
7514,fear,i had shouted at my younger brother and he was...


In [7]:
data["content"]=data["content"].apply(lambda content: re.sub(r"[^\w\s]","",content))
data.tail()

Unnamed: 0,sentiment,content
7510,shame,two years back someone invited me to be the tu...
7511,shame,i had taken the responsibility to do something...
7512,fear,i was at home and i heard a loud sound of spit...
7513,guilt,i did not do the homework that the teacher had...
7514,fear,i had shouted at my younger brother and he was...


def remove_stop_words(content):
    without_stopwords=[]
    tokenized=word_tokenize(content)
    for word in tokenized:
        if word not in stop:
            without_stopwords.append(str(TextBlob(word).correct()))
    return " ".join(without_stopwords)

In [8]:
stop=stopwords.words("english")
data["content"]=data["content"].apply(lambda content: " ".join(word for word in word_tokenize(content) if word not in stop))
data.tail()

Unnamed: 0,sentiment,content
7510,shame,two years back someone invited tutor granddaug...
7511,shame,taken responsibility something prepared howeve...
7512,fear,home heard loud sound spitting outside door th...
7513,guilt,homework teacher asked us scolded immediately
7514,fear,shouted younger brother always afraid called l...


In [9]:
ps=PorterStemmer()
data["content"]=data["content"].apply(lambda content: ' '.join([ps.stem(word) for word in word_tokenize(content)]))
#data["content"]=data["content"].apply(lambda content: str(TextBlob(content).correct()))
data.tail()

Unnamed: 0,sentiment,content
7510,shame,two year back someon invit tutor granddaught g...
7511,shame,taken respons someth prepar howev fail timid t...
7512,fear,home heard loud sound spit outsid door thought...
7513,guilt,homework teacher ask us scold immedi
7514,fear,shout younger brother alway afraid call loudli


# Label Encoding

In [10]:
data['sentiment'].value_counts()

joy        1091
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1050
Name: sentiment, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['sentiment_label']=le.fit_transform(data['sentiment'])
data['sentiment_label'].value_counts()

4    1091
5    1082
0    1079
2    1076
6    1071
1    1066
3    1050
Name: sentiment_label, dtype: int64

# Train split test

In [12]:
from sklearn.model_selection import train_test_split
X=data['content']
y=data['sentiment_label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,stratify=y)

# Feature engineering

## 1)Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow1=cv.fit_transform(X_train)
bow1_test=cv.transform(X_test)

## 2)TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bow2=tfidf.fit_transform(X_train)
bow2_test=tfidf.transform(X_test)

# Model building

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bow=tfidf.fit_transform(X)
from sklearn.model_selection import cross_val_score
models=[MultinomialNB(),LogisticRegression(max_iter=1000),SVC()]
def compare():
    for model in models:
        cv_score=cross_val_score(model,bow,y,cv=5)
        mean_accuracy=(sum(cv_score)/len(cv_score))*100
        print("cross validation accuracy from ", model," : ",mean_accuracy)
        print('-----------------------------------------------------------')
compare()
        

cross validation accuracy from  MultinomialNB()  :  55.103127079174975
-----------------------------------------------------------
cross validation accuracy from  LogisticRegression(max_iter=1000)  :  57.05921490352629
-----------------------------------------------------------
cross validation accuracy from  SVC()  :  55.99467731204258
-----------------------------------------------------------


## 1)Naive bayes

In [16]:
from sklearn.metrics import accuracy_score
def build_model(model,bow_train,y_train,bow_test):
    model.fit(bow_train,y_train)
    pred=model.predict(bow_test)
    return accuracy_score(y_test,pred)
    

In [17]:
from sklearn.naive_bayes import MultinomialNB
print("accuracy using count vectorizer: ", build_model(MultinomialNB(),bow1,y_train,bow1_test))
print("accuracy using count tfidf: ", build_model(MultinomialNB(),bow2,y_train,bow2_test))


accuracy using count vectorizer:  0.5705162320383182
accuracy using count tfidf:  0.5651942522618414


In [18]:
# tweet=["i am having fun "]
# nb=MultinomialNB()
# nb.fit(bow1,y_train)
# test=cv.transform(tweet)
# prediction=nb.predict(test)[0]
# if (prediction==0):
#     print('fear')
# if (prediction==1):
#     print('anger')
# if (prediction==2):
#     print('joy')
# if (prediction==3):
#     print('sadness')

## 2) Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression
print("accuracy using count vectorizer: ", build_model(LogisticRegression(),bow1,y_train,bow1_test))
print("accuracy using count tfidf: ", build_model(LogisticRegression(),bow2,y_train,bow2_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy using count vectorizer:  0.5593400745077168
accuracy using count tfidf:  0.591271953166578


In [20]:
from sklearn.svm import SVC
print("accuracy using count vectorizer: ", build_model(SVC(),bow1,y_train,bow1_test))
print("accuracy using count tfidf: ", build_model(SVC(),bow2,y_train,bow2_test))

accuracy using count vectorizer:  0.552421500798297
accuracy using count tfidf:  0.5816923895689197
