In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Both splits are semicolon-separated. Passing names= supplies the column
# labels, so pandas does not take them from a header line in the file.
data_train = pd.read_csv(
    "train_split.csv",
    sep = ";",
    names = ["text", "label"],
)
data_test = pd.read_csv(
    "test_split.csv",
    sep = ";",
    names = ["text", "label"],
)

In [3]:
# Remove the row labelled 0 from each split and renumber the index from 0
# (presumably that row is the CSV's own header line - confirm against the files).
# NOTE: this cell is not idempotent; every re-run removes one more row.
data_train = (
    data_train
    .drop(labels = 0, axis = "index")
    .reset_index(drop = True)
)
data_test = (
    data_test
    .drop(labels = 0, axis = "index")
    .reset_index(drop = True)
)


In [4]:
# Preview the first five rows of the training split.
data_train.head()

Unnamed: 0,text,label
0,i refers of course though i cant help feeling ...,joy
1,im starting to feel that im suffering from fat...,sadness
2,i feel like i probably would have liked this b...,love
3,i didn t really feel awkward at all,sadness
4,im feeling a little grumpy today with the lame...,anger


In [5]:
# Preview the first five rows of the test split.
data_test.head()

Unnamed: 0,text,label
0,ive made it through a week i just feel beaten ...,sadness
1,i feel this strategy is worthwhile,joy
2,i feel so worthless and weak what does he have...,sadness
3,i feel clever nov,joy
4,im moved in ive been feeling kind of gloomy,sadness


In [6]:
# Count the training rows per emotion label; the classes are visibly imbalanced.
data_train["label"].value_counts()

label
joy         4341
sadness     3720
anger       1732
fear        1540
love        1008
surprise     459
Name: count, dtype: int64

In [7]:
def custom_encoder(data):
    """Encode emotion labels as binary sentiment, in place.

    "surprise", "joy" and "love" become 1; "sadness", "anger" and "fear"
    become 0. Values that are none of these six labels are left untouched,
    so encoding already-encoded data is a no-op.

    Parameters
    ----------
    data : pandas object (a Series in this notebook)
        Object whose values are replaced in place.

    Returns
    -------
    The very object that was passed in. Returning it means both
    ``custom_encoder(labels)`` and ``labels = custom_encoder(labels)``
    leave ``labels`` encoded (the latter used to yield ``None``).
    """
    sentiment_by_emotion = {
        "surprise": 1,
        "joy": 1,
        "love": 1,
        "sadness": 0,
        "anger": 0,
        "fear": 0,
    }
    # A single mapping-based replace instead of six sequential full scans.
    data.replace(to_replace = sentiment_by_emotion, inplace = True)
    return data


# Encode a standalone copy and assign it back. Mutating the temporary Series
# returned by data_train["label"] in place is not guaranteed to reach the
# DataFrame, and never does under pandas Copy-on-Write.
train_labels = data_train["label"].copy()
custom_encoder(train_labels)  # mutates train_labels in place
data_train["label"] = train_labels

In [8]:
# Check that the label column now shows the binary 0/1 encoding.
data_train.head()

Unnamed: 0,text,label
0,i refers of course though i cant help feeling ...,1
1,im starting to feel that im suffering from fat...,0
2,i feel like i probably would have liked this b...,1
3,i didn t really feel awkward at all,0
4,im feeling a little grumpy today with the lame...,0


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
lm = WordNetLemmatizer()


def text_transformation(data):
    """Clean raw sentences into space-joined, lemmatized tokens.

    Each item is converted with ``str()``, every character outside
    ``a-z``/``A-Z`` is replaced by a space, the text is lower-cased and split
    on whitespace, English stop words are dropped, and the remaining words
    are lemmatized with the module-level ``lm``.

    Requires the NLTK ``stopwords`` and ``wordnet`` corpora to be installed.

    Parameters
    ----------
    data : iterable
        The sentences to clean (a bare string would be iterated per character).

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # Build the stop-word set once per call. The previous version rebuilt it
    # for every single word, which dominated the runtime.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for sentence in data:
        words = re.sub('[^a-zA-Z]', ' ', str(sentence)).lower().split()
        kept = [lm.lemmatize(word) for word in words if word not in stop_words]
        corpus.append(' '.join(kept))
    return corpus

In [11]:
# Clean every training sentence, then display the third cleaned sentence as a spot check.
corpus = text_transformation(data_train["text"])
corpus[2]

'feel like probably would liked book little bit simple story line'

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned
# from the training corpus here and reused later through cv.transform().
cv = CountVectorizer(ngram_range = (1, 2))
X = cv.fit_transform(corpus)
# replace() can leave the encoded column as object dtype, which scikit-learn
# rejects as an "unknown" label type. Cast to a real integer dtype; this also
# fails loudly if any label was left unencoded.
Y = data_train["label"].astype(int)

In [13]:
# Hyper-parameter grid for the random-forest grid search.
# max_features no longer lists 'auto': scikit-learn removed that value, and
# for classifiers it was only an alias of 'sqrt', so the grid was fitting
# the same configuration twice. 'log2' keeps two distinct candidates.
parameters = {
  "max_features": ("sqrt", "log2"),
  "n_estimators": [500, 1000, 1500],
  "max_depth": [5, 10, None],
  "min_samples_leaf": [1, 2, 5, 10],
  "bootstrap": [True, False],
}

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Exhaustive 5-fold cross-validated search over `parameters` on all CPU cores.
# The fixed random_state makes every forest, and therefore the selected
# parameters, reproducible across kernel restarts.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state = 42),
    parameters,
    cv = 5,
    return_train_score = True,
    n_jobs = -1,
)
grid_search.fit(X, Y)
grid_search.best_params_

In [None]:
# Train the final forest with the winning parameters. Unpacking best_params_
# passes exactly the keys that were searched, so it cannot raise KeyError on
# a key the grid does not contain (the old code looked up "min_samples_split").
# random_state is fixed so the fitted model is reproducible.
rfc = RandomForestClassifier(**grid_search.best_params_, random_state = 42)
rfc.fit(X, Y)

In [None]:
# Reuse the test split that was already loaded and had its first row dropped
# in the earlier cells. Re-reading the CSV here skipped that clean-up step.
test_data = data_test.copy()

# Apply the same cleaning and the already-fitted vocabulary as for training.
X_test = cv.transform(text_transformation(test_data["text"]))

# custom_encoder works in place, so its return value is deliberately not used
# (assigning it used to set Y_test to None). The copy keeps test_data intact.
Y_test = test_data["label"].copy()
custom_encoder(Y_test)
Y_test = Y_test.astype(int)  # same integer dtype as the model's predictions

y_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Overall accuracy plus per-class precision/recall/F1 on the test split.
acc_score = accuracy_score(Y_test, y_pred)
print("Accuracy Score - ", acc_score)
report = classification_report(Y_test, y_pred)
# The report is a multi-line text table; it was computed but never shown.
# print() is used so the line breaks render.
print(report)

In [None]:
def expression_check(inputStr):
    """Print a human-readable sentiment message for each predicted label.

    Parameters
    ----------
    inputStr : scalar or array-like
        One predicted label, or several (for example the array returned by
        ``predict``). 1 means positive, 0 means negative; anything else is
        reported as invalid. Empty input prints nothing.

    Returns
    -------
    None. The messages are written to standard output, one line per label.
    """
    # Flatten first so a scalar and an array are handled alike. Comparing a
    # multi-element array directly inside `if` raises ValueError.
    for label in np.ravel(inputStr):
        if label == 1:
            print("Input statement has positive sentiment")
        elif label == 0:
            print("Input statement has negative sentiment")
        else:
            print("Invalid output")

In [None]:
def sentiment_predictor(input):
    """Predict and print the sentiment of one or more raw sentences.

    The sentences are cleaned with ``text_transformation``, vectorized with
    the fitted ``cv``, classified with ``rfc``, and each prediction is
    reported through ``expression_check``.

    Parameters
    ----------
    input : str or iterable of str
        A single sentence or a collection of sentences. The name shadows the
        built-in ``input()``; it is kept so keyword callers continue to work.

    Returns
    -------
    None. One message per sentence is printed.
    """
    # The body used to read an undefined local `inputStr` instead of the
    # parameter, which raised UnboundLocalError on every call.
    # Wrap a bare string so it is not iterated character by character.
    sentences = [input] if isinstance(input, str) else input
    cleaned = text_transformation(sentences)
    features = cv.transform(cleaned)
    # Report label by label so several sentences can be passed at once.
    for label in rfc.predict(features):
        expression_check(label)

In [None]:
# Two hand-written sentences to run through the trained model.
input1 = ["Sometimes I want to punch someone in the face."]
input2 = ["I travelled to Switzerland and The place is beautiful."]

for sample in (input1, input2):
    sentiment_predictor(sample)