In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw corpus: ';'-separated records read without a header row, so the
# two column names are supplied here.
sentimentdf = pd.read_csv("sentiments.txt", sep=";", names=["text", "label"])
print(sentimentdf.head(5))

# Collapse the six emotion labels into a binary target:
# 1 = negative (sadness / anger / fear), 0 = positive (joy / love / surprise).
label_to_target = {
    "sadness": 1,
    "anger": 1,
    "fear": 1,
    "joy": 0,
    "love": 0,
    "surprise": 0,
}

# Fail loudly on any label that is not in the mapping instead of letting it
# slip through into the target column.
unknown_labels = set(sentimentdf["label"].unique()) - set(label_to_target)
if unknown_labels:
    raise ValueError(f"Unexpected labels in sentiments.txt: {sorted(map(str, unknown_labels))}")

# `.map` yields a proper integer column directly; `.replace` with a str -> int
# dict depends on implicit object-dtype downcasting, which pandas has deprecated.
sentimentdf["label"] = sentimentdf["label"].map(label_to_target)

X = sentimentdf["text"]
y = sentimentdf["label"]

# 80/20 hold-out split with a fixed seed so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

# Class balance of each split (test first, then train).
print(ytest.value_counts())
print(ytrain.value_counts())

                                                text    label
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger
1    1704
0    1496
Name: label, dtype: int64
1    7058
0    5742
Name: label, dtype: int64


In [2]:
import re
import ssl

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fallback for environments where the NLTK downloader fails HTTPS certificate
# verification. It must be installed BEFORE the first nltk.download() call,
# otherwise that download is not covered by the workaround.
# NOTE(review): this disables certificate verification for every default HTTPS
# context created in this process -- acceptable for a local notebook, but do
# not copy it into production code.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose the unverified-context factory;
    # leave the default HTTPS context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources used by the pre-processing step: WordNet (plus the omw-1.4 data
# package) for the lemmatizer, and the stop-word lists.
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download("wordnet")

# Shared lemmatizer instance used by transform().
wne = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def transform(data):
    """Clean and lemmatize an iterable of raw text strings.

    For every string: each character outside ``a-z``/``A-Z`` is replaced by a
    space, the text is lower-cased and split on whitespace, English stop words
    are dropped, and the remaining tokens are lemmatized with the module-level
    ``wne`` lemmatizer and joined back together with single spaces.

    Parameters
    ----------
    data : iterable of str
        Raw documents (e.g. a pandas Series or a list of strings).

    Returns
    -------
    list of str
        One cleaned document per input item, in input order.
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words("english") inside the comprehension would rebuild the
    # list for every single token and test membership with a linear scan.
    english_stopwords = set(stopwords.words("english"))

    corpus = []
    for text in data:
        tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
        lemmas = [
            wne.lemmatize(token)
            for token in tokens
            if token not in english_stopwords
        ]
        corpus.append(" ".join(lemmas))
    return corpus

# Pre-process the training texts, then preview the first five cleaned documents.
Xtraincorpus = transform(Xtrain)

print(Xtraincorpus[:5])

['feel like innocent victim feel win', 'im feeling little dazed confused today', 'feel acclimated like finally part organization rather timid observer', 'feel totally drained emotionally physically holy spirit never cease fill speak', 'find feeling surprised totally unworthy whenever see face']


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams. The vocabulary is fitted on
# the training corpus only; the test corpus is encoded with that same
# vocabulary, so both matrices share one column space.
cv = CountVectorizer(ngram_range=(1, 2))
Xtrainnew = cv.fit_transform(Xtraincorpus)

Xtestcorpus = transform(Xtest)
Xtestnew = cv.transform(Xtestcorpus)

# (n_documents, n_features) for train, then test.
print(Xtrainnew.shape, Xtestnew.shape, sep="\n")

(12800, 88128)
(3200, 88128)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline random forest with 100 trees. The seed is fixed (same value as the
# train/test split) so the reports below are reproducible across kernel
# restarts; without it every run trains a different forest.
rfc = RandomForestClassifier(n_estimators=100, random_state=1)
rfc.fit(Xtrainnew, ytrain)

# Training-set report: shows how closely the forest fits the data it has seen.
trainpred = rfc.predict(Xtrainnew)
print(classification_report(ytrain, trainpred))

# Held-out report: the generalization estimate to compare against the above.
testpred = rfc.predict(Xtestnew)
print(classification_report(ytest, testpred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5742
           1       1.00      1.00      1.00      7058

    accuracy                           1.00     12800
   macro avg       1.00      1.00      1.00     12800
weighted avg       1.00      1.00      1.00     12800

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1496
           1       0.95      0.96      0.95      1704

    accuracy                           0.95      3200
   macro avg       0.95      0.95      0.95      3200
weighted avg       0.95      0.95      0.95      3200



In [6]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate forests.
parameters = {
    "n_estimators": [100, 500, 1000],
    "max_depth": [5, 10, None],
    "min_samples_leaf": [1, 2, 5],
}

# Seed the estimator so that candidates differ only in their hyper-parameters
# and the selected best_params_ are stable between runs. n_jobs=-1 spreads the
# candidate fits over all available CPU cores.
gscv = GridSearchCV(RandomForestClassifier(random_state=1), parameters, n_jobs=-1)
gscv.fit(Xtrainnew, ytrain)
print(gscv.best_params_)

{'max_depth': None, 'min_samples_leaf': 2, 'n_estimators': 1000}


In [8]:
# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True) and exposes it as best_estimator_. Reusing it avoids
# training the same forest a second time and avoids hand-copying individual
# keys out of best_params_, which would silently go stale if the grid changed.
rfc = gscv.best_estimator_

testpred = rfc.predict(Xtestnew)
trainpred = rfc.predict(Xtrainnew)

# Train report first, then the held-out report.
print(classification_report(ytrain, trainpred))
print(classification_report(ytest, testpred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      5742
           1       0.97      0.98      0.98      7058

    accuracy                           0.97     12800
   macro avg       0.98      0.97      0.97     12800
weighted avg       0.97      0.97      0.97     12800

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1496
           1       0.94      0.97      0.96      1704

    accuracy                           0.95      3200
   macro avg       0.96      0.95      0.95      3200
weighted avg       0.95      0.95      0.95      3200



In [9]:
def newSentiment(text):
    """Print the predicted sentiment for one or more raw sentences.

    The input is pre-processed with ``transform``, encoded with the fitted
    ``cv`` vectorizer and classified with the fitted ``rfc`` model. One line is
    printed per input sentence: ``"negative"`` when the predicted class is 1,
    ``"positive"`` otherwise.

    Parameters
    ----------
    text : str or iterable of str
        A single sentence, or a collection of sentences.

    Returns
    -------
    None
    """
    # A bare string would otherwise be iterated character by character by
    # transform(), producing a meaningless prediction.
    if isinstance(text, str):
        text = [text]

    cleaned = transform(text)
    if not cleaned:
        # Nothing to classify; the model cannot predict on zero samples.
        return

    features = cv.transform(cleaned)
    # Report every input, not just the first prediction.
    for prediction in rfc.predict(features):
        print("negative" if prediction == 1 else "positive")

# Two hand-written sanity checks: one clearly unhappy sentence, one clearly happy.
str1 = ["it is raining outside and i am unhappy"]
str2 = ["the weather is good and i am excited"]

for sample in (str1, str2):
    newSentiment(sample)

negative
positive
