In [None]:
# Import nltk here: on a fresh kernel the download cells run before the main
# NLP import cell, so without this import the call below raises NameError.
import nltk

nltk.download('stopwords')

In [None]:
nltk.download('wordnet')

In [None]:
!pip install scikit-plot

In [None]:
!pip install wordcloud

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

In [None]:
#NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
#Scikit Learning Libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Evaluation Metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from scikitplot.metrics import plot_confusion_matrix

In [None]:
df_train = pd.read_csv("/content/train.txt", delimiter=';',names=['text','label'])

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_var = pd.read_csv('/content/val.txt', delimiter=';',names=['text','label'])

In [None]:
df_var.shape

### The training data and validation data have the same format but are loaded into separate DataFrames.
### So we concatenate them into a single DataFrame using pd.concat() (row-wise, along the index).

In [None]:
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise the
# row labels of the validation frame repeat those of the training frame.
df = pd.concat([df_train, df_var], ignore_index=True)

In [None]:
df.head()

In [None]:
#check for the various target labels in our dataset using seaborn.
sns.countplot(data=df, x='label')

### As we can see, there are 6 labels (targets) in the dataset, so we could build a multi-class classifier for sentiment analysis.
### To keep the problem simple, we will instead merge these labels into two classes, i.e. Positive and Negative sentiment.

### 1. Positive Sentiment (1) – "joy", "love", "surprise"
### 2. Negative Sentiment (0) – "anger", "sadness", "fear"

In [None]:
#custom encoder to convert categorical target labels to numerical form, i.e. (0 and 1)

def custom_encoder(df):
    """Encode the six emotion labels as binary sentiment, in place.

    "joy", "love" and "surprise" become 1 (positive sentiment); "fear",
    "anger" and "sadness" become 0 (negative sentiment). Any other value is
    left unchanged.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Object holding the emotion strings. It is modified in place.

    Returns
    -------
    The same object that was passed in, so the call can also be used on the
    right-hand side of an assignment.
    """
    sentiment_by_emotion = {
        "surprise": 1,
        "love": 1,
        "joy": 1,
        "fear": 0,
        "anger": 0,
        "sadness": 0,
    }
    # One mapping-based replace instead of six separate passes over the data.
    df.replace(to_replace=sentiment_by_emotion, inplace=True)
    # Return the mutated object: the function used to return None, which made
    # `result = custom_encoder(labels)` silently bind None.
    return df
# Encode a standalone copy and assign it back: calling an in-place method on
# the temporary returned by df['label'] does not update df when pandas
# Copy-on-Write is active.
encoded_label = df['label'].copy()
custom_encoder(encoded_label)
# Cast to int so the target is a numeric 0/1 column rather than object dtype.
df['label'] = encoded_label.astype(int)

In [None]:
#Check the target variables
sns.countplot(data = df, x= 'label')

##### Our target has changed to 0 and 1, i.e. 0 for Negative and 1 for Positive.

# Data Pre-processing

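Perform some pre-processing on the data before converting it into vectors and passing it to the machine learning model: remove non-alphabetic characters, convert the text to lower case, drop English stop words and lemmatize the remaining words.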

In [None]:
def text_transformation(df_col):
    """Clean and lemmatize every document in an iterable of texts.

    For each item: non-alphabetic characters are replaced by spaces, the text
    is lower-cased and split on whitespace, English stop words are dropped and
    the remaining words are lemmatized with WordNet.

    Parameters
    ----------
    df_col : iterable
        Documents to process (e.g. a DataFrame column or a list of strings).
        Each item is converted with ``str()`` before cleaning.

    Returns
    -------
    list of str
        One space-joined string of processed tokens per input item.
    """
    lm = WordNetLemmatizer()
    # Build the stop-word set once. It used to be rebuilt for every single
    # word, which dominated the running time of this function.
    stop_words = set(stopwords.words('english'))
    corpus = []
    for item in df_col:
        tokens = re.sub('[^a-zA-Z]', ' ', str(item)).lower().split()
        kept = [lm.lemmatize(word) for word in tokens if word not in stop_words]
        corpus.append(' '.join(kept))
    return corpus
corpus = text_transformation(df['text'])

In [None]:
plt.figure(figsize=(20, 8))
# Every corpus entry is already a space-separated string of tokens, so the
# documents only need to be joined with a space. Iterating over a row yields
# single characters, which concatenated the documents without a separator and
# fused the last word of one document with the first word of the next.
word_cloud = ' '.join(corpus)

wordcloud = WordCloud(width=1000,height=500,background_color='white',min_font_size=10).generate(word_cloud)
plt.imshow(wordcloud)


In [None]:
# Bag-of-words vectorizer counting both unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range=(1, 2))
# Learn the vocabulary from the cleaned corpus and vectorize it in one step.
traindata = cv.fit_transform(corpus)
# X is the feature matrix, y the target column of the combined DataFrame.
X = traindata
y = df.label

### Now comes the machine learning model creation part. In this project we use a Random Forest Classifier and tune its hyperparameters using GridSearchCV.

In [None]:
# Hyperparameter grid searched by GridSearchCV for the random forest.
# 'auto' is no longer accepted for max_features by recent scikit-learn releases
# and, for classifiers, was only another spelling of 'sqrt'. 'log2' takes its
# place so that the grid still compares two distinct settings.
parameters = {
    'max_features': ('sqrt', 'log2'),
    'n_estimators': [100, 500, 1000],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 5, 10],
    'bootstrap': [True, False]
}


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
 GBC = GradientBoostingClassifier()

In [None]:
# Search every combination in `parameters` with 2-fold cross-validation (cv=2),
# on all available cores (n_jobs=-1), also recording the training scores.
grid_search = GridSearchCV(RandomForestClassifier(),parameters,cv =2, return_train_score =True, n_jobs=-1)
grid_search.fit(X, y)
# Last expression of the cell: displays the best parameter combination found.
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 100}

In [None]:
# Walk over exactly the candidates the search evaluated. A hard-coded count
# (432) runs past the end of the results, because the grid defines
# 2*3*2*3*3*2 = 216 parameter combinations.
cv_results = grid_search.cv_results_
for params, mean_test_score, rank in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['rank_test_score']):
    print('Parameters: ', params)
    print('Mean Test Score: ', mean_test_score)
    print('Rank: ', rank)

In [None]:
# Build the final forest from the winning grid-search configuration. The
# best_params_ dict holds exactly the six tuned hyperparameters, so it can be
# unpacked straight into the constructor. Then train on all training vectors.
rfc = RandomForestClassifier(**grid_search.best_params_)
rfc.fit(X, y)

## Test Data Transformation

In [None]:
test_df = pd.read_csv('test.txt',delimiter=';',names=['text','label'])
X_test = test_df.text
#encode the labels into two classes , 0 and 1
# custom_encoder mutates its argument, so encode an independent copy of the
# label column. Its return value is deliberately not assigned to test_df:
# doing so used to replace the DataFrame with None.
y_test = test_df.label.copy()
custom_encoder(y_test)
# Cast to int so the metrics receive a numeric 0/1 target, not object dtype.
y_test = y_test.astype(int)
#pre-processing of text
test_corpus = text_transformation(X_test)
#convert text data into vectors
testdata = cv.transform(test_corpus)
#predict the target
predictions = rfc.predict(testdata)

# Model Evaluation

In [None]:
# rcParams is never imported on its own in this notebook, so the bare name
# raises NameError; reach it through the pyplot module imported as plt.
plt.rcParams['figure.figsize'] = 10,5
plot_confusion_matrix(y_test,predictions)
acc_score = accuracy_score(y_test,predictions)
pre_score = precision_score(y_test,predictions)
rec_score = recall_score(y_test,predictions)
print('Accuracy_score: ',acc_score)
print('Precision_score: ',pre_score)
print('Recall_score: ',rec_score)
print("-"*50)
# Per-class precision / recall / F1 summary.
cr = classification_report(y_test,predictions)
print(cr)

In [None]:
#ROC Curve

# Class probabilities for the test vectors; column 1 is used as the score of
# the positive class (presumably label 1 — depends on the order of rfc.classes_).
predictions_probability = rfc.predict_proba(testdata)
fpr,tpr,thresholds = roc_curve(y_test,predictions_probability[:,1])
plt.plot(fpr,tpr)
# A single sequence is plotted against its own indices (0, 1), which draws the
# diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1])
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Predict Custom Input:

def expression_check(prediction_input):
    """Print the sentiment message for one prediction or a batch of them.

    Parameters
    ----------
    prediction_input : scalar, list, tuple or array-like
        A single predicted label, or a sequence of labels such as the array
        returned by a classifier's ``predict``. 0 means negative sentiment and
        1 means positive sentiment; anything else is reported as invalid.

    One line is printed per label. An empty batch prints "Invalid Statement."
    """
    # Arrays / Series expose tolist(); converting first lets a whole batch of
    # predictions be reported instead of failing on an ambiguous truth value.
    if hasattr(prediction_input, 'tolist'):
        prediction_input = prediction_input.tolist()

    if isinstance(prediction_input, (list, tuple)):
        predicted_labels = list(prediction_input)
    else:
        predicted_labels = [prediction_input]

    if not predicted_labels:
        print("Invalid Statement.")
        return

    for label in predicted_labels:
        if label == 0:
            print("Input statement has Negative Sentiment.")
        elif label == 1:
            print("Input statement has Positive Sentiment.")
        else:
            print("Invalid Statement.")
# function to take the input statement and perform the same transformations we did earlier
def sentiment_predictor(input):
    """Predict and print the sentiment of one or more raw statements.

    The statements go through the same steps as the training data: text
    cleaning with ``text_transformation``, vectorization with the fitted
    ``cv`` and prediction with the trained ``rfc``. The outcome is printed by
    ``expression_check``.

    Parameters
    ----------
    input : str or iterable of str
        A single statement or a collection of statements.
    """
    # A bare string would be iterated character by character by the text
    # cleaning step, so treat it as a one-element batch.
    if isinstance(input, str):
        input = [input]
    cleaned_statements = text_transformation(input)
    transformed_input = cv.transform(cleaned_statements)
    prediction = rfc.predict(transformed_input)
    expression_check(prediction)
input1 = ["Sometimes I just want to punch someone in the face."]
input2 = ["I bought a new phone and it's so good."]
sentiment_predictor(input1)
sentiment_predictor(input2)

