In [1]:
import numpy as np
import pandas as pd

# Loading and Cleaning Data

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')
import os
#os.chdir("")
!ls

Emotions.txt
Sentiment_Analysis_LSTMs_Keras.ipynb
Sentiment_Analysis_Multiclass_Logistic_Regression.ipynb
glove.6B.200d.txt
glove.6B.50d.txt
smile-annotations-final.csv


In [3]:
df = pd.read_csv('Emotions.txt' ,sep=';', names=['text', 'category'], index_col=False)


In [4]:
df.head()

Unnamed: 0,text,category
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
df.category.value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: category, dtype: int64

In [6]:
possible_labels = df.category.unique()
print(possible_labels)

['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [7]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

print(label_dict)

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}


In [8]:
df['label'] = df.category.replace(label_dict)
df.head(100)

Unnamed: 0,text,category,label
0,i didnt feel humiliated,sadness,0
1,i can go from feeling so hopeless to so damned...,sadness,0
2,im grabbing a minute to post i feel greedy wrong,anger,1
3,i am ever feeling nostalgic about the fireplac...,love,2
4,i am feeling grouchy,anger,1
...,...,...,...
95,i feel like throwing away the shitty piece of ...,sadness,0
96,im starting to feel wryly amused at the banal ...,joy,5
97,i find every body beautiful and only want peop...,joy,5
98,i hear are owners who feel victimized by their...,sadness,0


In [9]:
df = df.drop(['category'], axis = 1)


In [10]:
df.head(20)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
5,ive been feeling a little burdened lately wasn...,0
6,ive been taking or milligrams or times recomme...,3
7,i feel as confused about life as a teenager or...,4
8,i have been with petronas for years i feel tha...,5
9,i feel romantic too,2


# Training/Validation Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_val, y_train, y_val = train_test_split(df['text'], 
                                                  df['label'], 
                                                  test_size=0.20, 
                                                  random_state=17, 
                                                  stratify = df['label'])

In [13]:
x_train=x_train.to_numpy()
x_val=x_val.to_numpy()
y_train=y_train.to_numpy()
y_val=y_val.to_numpy()

In [14]:
x_train[2]

'i feel sarcastic more often than not'

In [15]:
x_train.shape

(12800,)

In [16]:
y_train[3]

1

In [17]:
print(y_train)

[0 5 1 ... 0 0 0]


# Text Preprocessing 

In [18]:
import string
import re
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hemant./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hemant./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [20]:
def process_text(text):
    '''
    Input: 
        text: a string containing a text
    Output:
        text_clean: a list of words containing the processed text
    
    '''
    # remove number 
    text = re.sub('[0-9]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    # remove the dates like Mar 30 2013
    text = re.sub('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2}\s\d{4}', ' ', text)
    text = re.sub(r'//', '', text)
    # tokenize texts
    text_tokens = word_tokenize(text)
   
    text_clean = []
    for word in text_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in string.punctuation): # remove punctuation
            text_clean.append(word)
            #stem_word = stemmer.stem(word) # stemming word
            #text_clean.append(stem_word)
  
    return text_clean

# Multiclass logistic regression

In [21]:
## upload word emebedding file on google drive
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words, words_to_index, index_to_words, word_to_vec_map

In [22]:
words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.200d.txt')

In [23]:
def text_to_vec(text, word_to_vec_map):
    
    features = np.zeros((200, ))
    
    for w in text:
  
      if w in words:
        
        features += word_to_vec_map[w]
    features /= len(text)
   
    
    return features

In [24]:
def prepare_data(text_array):
    processed_array = np.zeros((text_array.shape[0], 200))
    processed_text = []
    text_vector = []
    i=0
    for text in text_array:
        #processed_text = process_text(text)
        text_vector = text_to_vec(text, word_to_vec_map)
        processed_array[i] = text_vector
        i=i+1
    return processed_array


In [25]:
x_train_final = prepare_data(x_train)

In [26]:
x_train_final[3]

array([ 5.24005965e-02,  6.23406596e-01, -1.95530123e-01, -1.01776930e-01,
       -5.29944211e-02, -2.71496602e-01, -2.07285351e-01, -1.98864739e-01,
       -3.64125614e-01, -7.49854035e-02, -3.68957965e-01,  1.44838667e-01,
        1.17037211e-01,  6.83835263e-02,  3.80882316e-01, -6.99924561e-02,
       -3.57477282e-01, -7.17485895e-02, -2.48178807e-01, -2.40171789e-01,
       -1.52318860e-02,  1.28829105e+00, -3.69451509e-01,  1.60956333e-01,
        4.50973281e-01, -1.93670484e-01, -2.48443263e-01,  9.49897544e-02,
        1.32197679e-01,  3.63697895e-01, -2.03328491e-01,  4.10633895e-01,
       -2.71663158e-01, -3.45259912e-01, -1.27982047e-01, -8.23743614e-02,
       -1.41997456e-01, -2.47506263e-01, -1.37115263e-01, -1.22747368e-03,
       -2.65120000e-01, -1.26372737e-01,  4.11109474e-02,  2.12000046e-01,
       -2.51730053e-01, -1.44596246e-01,  4.11525556e-01, -4.09895684e-01,
        5.46069825e-02, -1.29201193e-01,  2.23848246e-01, -7.67033358e-02,
        1.73596249e-01,  

In [27]:
x_train_final.shape

(12800, 200)

In [28]:
x_val_final = prepare_data(x_val)

In [29]:
x_val_final.shape

(3200, 200)

In [32]:
from scipy.special import softmax

def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

def multi_logistic_model(X, Y, X_val, Y_val, learning_rate = 0.01, num_iterations = 400 ):
    
    np.random.seed(1)
    lamda = 0.1
    m = Y.shape[0]                          # number of training examples
    n_y = 6                                 # number of classes  
    n_h = 200                                # dimensions of the GloVe vectors 
    
    # Initialize parameters using Xavier initialization
    W = np.random.randn(n_h, n_y) / np.sqrt(n_h)
    b = np.zeros((n_y,))
    theta = np.vstack((W,b))
  
    X = np.hstack((X, np.ones((X.shape[0], 1), dtype=X.dtype)))
    X_val = np.hstack((X_val, np.ones((X_val.shape[0], 1), dtype=X_val.dtype)))
    Y_oh = convert_to_one_hot(Y, C = n_y)

    step_list = [] 
    loss_list = []
    train_accuracy_list = []
    val_accuracy_list = []
     



    #def getLoss(w,x,y,lam):
    #m = x.shape[0] #First we get the number of training examples
    #y_mat = oneHotIt(y) #Next we convert the integer class coding into a one-hot representation
    #scores = np.dot(x,w) #Then we compute raw class scores given our input and current weights
    #prob = softmax(scores) #Next we perform a softmax on these scores to get their probabilities
    #loss = (-1 / m) * np.sum(y_mat * np.log(prob)) + (lam/2)*np.sum(w*w) #We then find the loss of the probabilities
    #grad = (-1 / m) * np.dot(x.T,(y_mat - prob)) + lam*w #And compute the gradient for that loss
    #return loss,grad



    for t in range(num_iterations+1):
        z = np.dot(X,theta)
        a = softmax(z)

        cost = (-1/m)*(np.sum(Y_oh*np.log(a))) #+ (lamda/2)*np.sum(theta*theta)
           
        d_theta = (-1/m)*(np.dot(X.T, (Y_oh - a))) #+ lamda*theta

        theta = theta - learning_rate * d_theta
      
        step_list.append(t) 
        loss_list.append(cost)
          
        train_accuracy_list.append(accuracy(X, Y, theta))
        val_accuracy_list.append(accuracy(X_val, Y_val, theta))

        if t % 10 == 0:
            #learning_rate = learning_rate/(1 + 0.001*t)
            print('learning rate is ' + str(learning_rate))
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            print("train_accuracy:" + str(accuracy(X, Y, theta)))
            print("val_accuracy:" + str(accuracy(X_val, Y_val, theta)))
      

        df = pd.DataFrame({
        'step': step_list, 
        'loss': loss_list,
        'train_accuracy' : train_accuracy_list,
        'val_accuracy' : val_accuracy_list
        

    })
    return df, theta
         

def predict(X, theta):
    m = X.shape[0]
    pred = np.zeros((m, 1))
    for j in range(m):                       
        Z = np.dot(X[j],theta)
        A = softmax(Z)
        pred[j] = np.argmax(A)
    return pred


def accuracy(X, Y, theta):
    m = X.shape[0]
    pred = predict(X, theta)
    Accuracy = np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))
    return Accuracy
    

def loss_plot(df):
    return df.plot(
            x='step', 
            y='loss',
            xlabel='step',
            ylabel='loss'
        )  
    
def accuracy_plot(df):
    return df.plot(
            x='step', 
            y=['train_accuracy', 'val_accuracy'],
            xlabel='step',
            ylabel='Accuracy'
        )  

In [33]:
df, theta_final = multi_logistic_model(x_train_final, y_train, x_val_final, y_val, learning_rate = 0.01, num_iterations = 800)

learning rate is 0.01
Epoch: 0 --- cost = 11.37618935991467
train_accuracy:0.205703125
val_accuracy:0.2096875
learning rate is 0.01
Epoch: 10 --- cost = 11.278074064457709
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 20 --- cost = 11.201968805122473
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 30 --- cost = 11.14839748077246
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 40 --- cost = 11.117096738755848
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 50 --- cost = 11.107041225177545
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 60 --- cost = 11.116580264336617
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 70 --- cost = 11.143646027562475
train_accuracy:0.291640625
val_accuracy:0.2915625
learning rate is 0.01
Epoch: 80 --- cost = 11.185979492654164
train_accuracy:0.291640625
val_accuracy:0.291

KeyboardInterrupt: 

In [None]:
loss_plot(df)

In [None]:
accuracy_plot(df)

In [None]:
theta_final.shape

In [None]:
print(theta_final)

In [None]:
## Predict emotion 
input_string = "i feel very angry today"
processed_text = process_text(input_string)
X = text_to_vec(processed_text, word_to_vec_map)
X.shape


In [None]:
X.append(1)
X.shape

In [None]:
prediction = predict(X, theta_final)
print(prediction)
labels = {'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}
for key, value in labels.items():
  if prediction == value:
    print("predicted emotion is " + str(key))

In [None]:
np.savetxt('theta.txt', theta_final)