# Machine Learning

### Model Fitting

Now we build the Pipeline model:

In [2]:
# build pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# import cleaned dataset
import os
dir = os.getcwd()
os.chdir("..")

cleaned_train = pd.read_csv("datasets/clean_train.csv")
os.chdir(dir)


In [3]:
# features and labels
x_train = cleaned_train['clean_text']
y_train = cleaned_train['emotion']

# logisticRegression Pipeline
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])

# train and fit data
pipe_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])

Let's make a prediction based on our model created:

In [4]:
# make a prediction
# Enter a random sentence
exl = "What is wrong with me"

In [5]:
pipe_lr.predict([exl])

array(['sadness'], dtype=object)

Print out the prediction result:

In [6]:
predicted_emotion = str(pipe_lr.predict([exl]))[2:-2]
print("The emotion in this sentence is: %s" %(predicted_emotion))

The emotion in this sentence is: sadness


Predict the probability of each emotions appear in the sentence:

In [7]:
# prediction probability
pipe_lr.predict_proba([exl])

array([[0.14980788, 0.12311021, 0.33324496, 0.03788332, 0.3401243 ,
        0.01582933]])

In [8]:
# prediction classses
pipe_lr.classes_

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'],
      dtype=object)

Convert the prediction values into DataFrame and concat them with their corresponding classes for a clearer representation:

In [9]:
predicted_probability = pd.DataFrame(pipe_lr.predict_proba([exl]))
predicted_probability = predicted_probability.transpose()
predicted_probability.rename(columns={0:'Probability'},inplace=True)
predicted_probability


Unnamed: 0,Probability
0,0.149808
1,0.12311
2,0.333245
3,0.037883
4,0.340124
5,0.015829


In [10]:
predicted_classes = pd.DataFrame(pipe_lr.classes_)
predicted_classes.rename(columns={0:'Class'},inplace=True)
predicted_classes

Unnamed: 0,Class
0,anger
1,fear
2,joy
3,love
4,sadness
5,surprise


In [11]:
pred_concat = pd.concat([predicted_classes,predicted_probability], axis=1)
pred_concat

Unnamed: 0,Class,Probability
0,anger,0.149808
1,fear,0.12311
2,joy,0.333245
3,love,0.037883
4,sadness,0.340124
5,surprise,0.015829


Import test data:

In [12]:
dir = os.getcwd()
os.chdir("..")

test_data = pd.read_csv("datasets/test.txt", sep = ';')
os.chdir(dir)

In [13]:
# features and labels for test data
x_test = test_data['text']
y_test = test_data['emotion']

Accuracy checking:

In [14]:
# check accuracy using test data
pipe_lr.score(x_test,y_test)

0.885

In [15]:
dir = os.getcwd()
os.chdir("..")

val_data = pd.read_csv("datasets/val.txt", sep = ';')
os.chdir(dir)

x_val = val_data['text']
y_val = val_data['emotion']

pipe_lr.score(x_val,y_val)

0.8985

In [16]:
# Save model & pipeline
import joblib
pipeline_file = open("emotion_classifier_pipe_lr.pkl","wb")
joblib.dump(pipe_lr,pipeline_file)
pipeline_file.close()