In [0]:
import pandas as pd
import numpy as np
import nltk
import os
import re
import string

from copy import deepcopy
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers.core import Dense, Dropout, Activation
# from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras.layers.recurrent import LSTM, GRU
from tensorflow.python.keras.layers.convolutional import Convolution1D, MaxPooling1D

from tensorflow.python.keras.utils import np_utils
from sklearn.metrics import recall_score,precision_score,f1_score,confusion_matrix,accuracy_score

In [2]:
# Mount Google Drive in Colab and make the mount point the working directory,
# so the relative 'My Drive/...' dataset paths used by later cells resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Dataset Preprocessing

In [0]:
def transform_data(datafolder,splitname):
    """Read a token-per-line split file and return cleaned tweets with labels.

    The file ``<datafolder>/<splitname>.txt`` is read as UTF-8. Lines are
    tab-separated. A line whose first field is ``meta`` starts a new tweet and
    carries the sentiment label in its third field; any other line with at
    least two fields contributes its first field as a token of the current
    tweet.

    Processing steps:
      1. Drop lines that belong to links or tags: a ``https`` token and every
         following line up to the next ``/``, ``@`` or ``meta`` line, plus each
         ``/`` or ``@`` token together with the line right after it (a
         ``meta`` line is never dropped).
      2. Lower-case the remaining lines.
      3. Group tokens per tweet, removing non-ASCII characters from each token.

    Args:
        datafolder: Directory containing the split file.
        splitname: File name without the ``.txt`` extension.

    Returns:
        ``(tweets, sentiment)`` where ``tweets`` is a list of token lists and
        ``sentiment`` the list of lower-cased labels, index-aligned. Tweets
        left without any token are dropped together with their label, and
        tokens appearing before the first ``meta`` line are ignored.

    Raises:
        IndexError: If a ``meta`` line has fewer than three fields.
    """
    path = os.path.join(datafolder,f"{splitname}.txt")
    # Context manager so the handle is closed even if reading fails.
    with open(path,'r',encoding='utf-8') as handle:
        df = handle.readlines()
    print('Lines in original file = ',len(df))

    first_fields = [raw.strip().split('\t')[0] for raw in df]

    # Pass 1: collect the indices of lines that are part of a link (https://*)
    # or a tag (@ followed by a twitter id). A set keeps indices unique and
    # makes the membership test in pass 2 cheap.
    link_idx = set()
    inside_link = False
    for i, first in enumerate(first_fields):
        if first == '/' or first == '@':
            link_idx.add(i)
            # The token after '/' or '@' belongs to the link/tag as well, but
            # a record header must survive or two tweets would be merged.
            if i + 1 < len(df) and first_fields[i + 1] != 'meta':
                link_idx.add(i + 1)
            inside_link = False
        elif first == 'meta':
            inside_link = False
        elif first == 'https':
            link_idx.add(i)
            inside_link = True
        elif inside_link:
            link_idx.add(i)

    # Pass 2: keep every other line (including those after the last link).
    df_new = [raw.lower() for i, raw in enumerate(df) if i not in link_idx]
    print('Lines after removing links and tags = ',len(df_new))

    # Pass 3: one (label, tokens) record per meta line, so labels and tokens
    # cannot drift out of alignment.
    records = []
    for raw in df_new:
        line = raw.strip().split('\t')
        if len(line) < 2:
            continue
        if line[0] == 'meta':
            records.append((line[2], []))
        elif records:
            records[-1][1].append(''.join(ch for ch in line[0] if ord(ch) < 128))

    kept = [(label, tokens) for label, tokens in records if tokens]
    tweets = [tokens for _, tokens in kept]
    sentiment = [label for label, _ in kept]
    return tweets,sentiment

## Creating the training data and the character dictionary

In [0]:
def create_training_data(tweets,sentiment):
    """Turn tokenised tweets and text labels into character lists and class ids.

    Each tweet's tokens are joined with single spaces and split into a list of
    characters. Labels are encoded as negative -> 0, neutral -> 1,
    positive -> 2.

    Args:
        tweets: List of token lists, one per tweet.
        sentiment: List of label strings, index-aligned with ``tweets``.

    Returns:
        ``(X_train, Y_train)``: the list of character lists and the list of
        integer class ids.

    Raises:
        ValueError: If the two inputs differ in length, or a label is not one
            of the three known sentiments (previously such labels were
            silently encoded as class 0).
    """
    # Validate before indexing so a mismatch is reported as such.
    if len(tweets) != len(sentiment):
        raise ValueError(
            f"tweets and sentiment differ in length: {len(tweets)} != {len(sentiment)}")

    label_ids = {'negative': 0, 'neutral': 1, 'positive': 2}
    X_train = [list(' '.join(tokens)) for tokens in tweets]
    Y_train = []
    for label in sentiment:
        if label not in label_ids:
            raise ValueError(f"Unknown sentiment label: {label!r}")
        Y_train.append(label_ids[label])

    # default=0 keeps an empty split from crashing the report line.
    longest = max((len(chars) for chars in X_train),default=0)
    print('Max length of sequence of characters in dataset = ',longest)
    return X_train,Y_train

def create_mappings(data=None):
    """Build the fixed character vocabulary in both directions.

    The vocabulary is 'PAD', 'UNK', the space character and the 26 lowercase
    ASCII letters, numbered in that order starting from 0. ``data`` is
    accepted but not used.

    Returns:
        ``(char2num, num2char)``: symbol -> index and index -> symbol dicts.
    """
    vocabulary = ['PAD','UNK', ' ']
    vocabulary.extend(string.ascii_lowercase)
    num2char = dict(enumerate(vocabulary))
    char2num = {symbol: index for index, symbol in num2char.items()}
    return char2num,num2char

def convert_to_numbers(data,char2num,MAXLEN=100):
    """Encode character sequences as integer ids and pad them to a fixed length.

    Every sample in ``data`` is treated as a sequence of characters and mapped
    element by element through ``char2num``. Characters missing from the
    mapping get the 'UNK' id (1 if the mapping has no 'UNK' entry).

    Each sample is encoded on its own. The previous implementation picked the
    encoding branch from the length of the first sample only, which failed
    whenever that sample had zero or one characters.

    Args:
        data: Iterable of character sequences (lists of characters or strings).
        char2num: Character -> integer id mapping.
        MAXLEN: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``sequence.pad_sequences`` on the encoded samples;
        padding/truncation side and fill value are that function's defaults.
    """
    unknown_id = char2num.get('UNK', 1)
    out = [[char2num.get(ch, unknown_id) for ch in sample] for sample in data]
    return sequence.pad_sequences(out,maxlen=MAXLEN)
def convert_to_characters(data,num2char):
    """Decode integer ids back to characters.

    For 2-D input every row becomes a list of symbols, with ids missing from
    ``num2char`` rendered as 'UNK'. For 1-D input every id is looked up
    directly, so an unknown id raises ``KeyError``. Input of any other rank
    yields a list of zeros of the same length.
    """
    decoded = [0]*len(data)
    if not decoded:
        return decoded

    rank = len(np.shape(data))
    if rank == 2:
        decoded = [
            [num2char.get(code, 'UNK') for code in data[position]]
            for position in range(len(data))
        ]
    elif rank == 1:
        decoded = [num2char[data[position]] for position in range(len(data))]
    return decoded

## Model

In [0]:
def train_model(X_train,Y_train,X_valid,Y_valid):
    """Build, compile and fit the character-level CNN + stacked-LSTM classifier.

    Architecture: Embedding -> Conv1D (relu, 'valid' padding) -> MaxPooling1D
    -> LSTM (returns sequences) -> LSTM -> Dense(NUM_CLASSES) -> softmax.

    Hyperparameters are read from notebook-level globals, which must be
    defined before this function is called: MAX_FEATURES, EMBEDDING_SIZE,
    MAXLEN, NB_FILTER, FILTER_LENGTH, POOL_LENGTH, LSTM_OUTPUT_SIZE,
    NUM_CLASSES, BATCH_SIZE and EPOCHS.

    Args:
        X_train: Training inputs (integer-encoded sequences of length MAXLEN).
        Y_train: Training targets in the form expected by
            'categorical_crossentropy'.
        X_valid: Validation inputs.
        Y_valid: Validation targets.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(MAX_FEATURES,EMBEDDING_SIZE,input_length=MAXLEN))
    model.add(Convolution1D(
        filters=NB_FILTER,
        kernel_size=FILTER_LENGTH,
        padding='valid',
        activation='relu',
    ))
    model.add(MaxPooling1D(pool_size=POOL_LENGTH))
    # First LSTM hands its full output sequence to the second one.
    model.add(LSTM(
        LSTM_OUTPUT_SIZE,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=True))
    model.add(LSTM(
        LSTM_OUTPUT_SIZE,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=False))
    model.add(Dense(NUM_CLASSES))
    model.add(Activation('softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adamax',
        metrics=['accuracy'])
    print('Model compiled')
    print('Training Started')
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    model.fit(X_train,Y_train,
              batch_size=BATCH_SIZE,
              shuffle=True,
              epochs=EPOCHS,
              validation_data=(X_valid,Y_valid))
    return model

In [6]:
# Hyperparameters. train_model reads these as globals, so this cell must run
# before the training cell.
# MAXLEN is both the padding length used by convert_to_numbers and the
# Embedding input_length.
MAXLEN = 150
EMBEDDING_SIZE = 128
FILTER_LENGTH = 3
NB_FILTER = 128
POOL_LENGTH = 3
LSTM_OUTPUT_SIZE = 128
BATCH_SIZE = 128
EPOCHS = 25
NUM_CLASSES = 3
# Path is relative to the working directory set by the Drive-mount cell.
tweets,sentiment = transform_data(datafolder = 'My Drive/7th Semester/NLP/Assignment-3/dataset',splitname = 'train')
X_train,Y_train = create_training_data(tweets,sentiment)
# create_mappings ignores its argument; the vocabulary is fixed.
char2num,num2char = create_mappings(X_train)
# Vocabulary size, used as the Embedding input dimension in train_model.
MAX_FEATURES = len(char2num)
assert(len(char2num)==len(num2char))

# print('---------------Some sample tweets after preprocessing------------------')
# for i in range(20):
#     print(''.join(X_train[i]))
# Encode characters as integer ids (padded to MAXLEN) and one-hot the labels.
X_train = convert_to_numbers(X_train,char2num,MAXLEN)
Y_train = np_utils.to_categorical(Y_train,NUM_CLASSES)
# Hold out 20% for validation with a fixed random_state. Note that X_train is
# rebound to the training portion, while the split labels are the lowercase
# y_train / y_valid (Y_train still holds the labels of the full set).
X_train, X_valid, y_train, y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

Lines in original file =  426280
Lines after removing links and tags =  325303
Max length of sequence of characters in dataset =  147


In [7]:
# Fit on the train/validation split from the previous cell.
# NOTE(review): train_model receives deep copies of all four arrays;
# confirm whether the copies are actually needed.
model = train_model(deepcopy(X_train),deepcopy(y_train),deepcopy(X_valid),deepcopy(y_valid))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model compiled
Training Started
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 128)          3712      
_________________________________________________________________
conv1d (Conv1D)              (None, 148, 128)          49280     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 49, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 49, 128)           131584    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_______________________

## Evaluation

In [0]:
def evaluate(X_test,y_test,model):
    """Score ``model`` on a labelled set.

    The model's predictions are converted to one-hot rows by setting the
    arg-max column of each row to 1, then compared with ``y_test`` (which must
    have the same shape).

    Returns:
        ``(accuracy, recall, precision, fscore)``; the last three are computed
        with ``average=None``.
    """
    probabilities = model.predict(X_test)
    winners = probabilities.argmax(1)
    row_ids = np.arange(len(probabilities))

    # One-hot encode the winning class of every row.
    y_predicted = np.zeros_like(probabilities)
    y_predicted[row_ids,winners] = 1
    assert(np.shape(y_predicted)==np.shape(y_test))

    recall, precision, fscore = [
        metric(y_test,y_predicted,average=None)
        for metric in (recall_score, precision_score, f1_score)
    ]
    accuracy = accuracy_score(y_test,y_predicted)
    return accuracy,recall,precision,fscore

In [24]:
# Run the test split through the same preprocessing as the training data.
test_tweets,test_sentiment = transform_data(datafolder = 'My Drive/7th Semester/NLP/Assignment-3/dataset',splitname = 'test')
X_test,Y_test = create_training_data(test_tweets,test_sentiment)
# Reuse the training-time char2num and MAXLEN so the encoding matches the model input.
X_test = convert_to_numbers(X_test,char2num,MAXLEN)
y_test = np_utils.to_categorical(Y_test,NUM_CLASSES)
accuracy,recall,precision,fscore = evaluate(X_test,y_test,model)
print('\n')
print('Model Evalutaion Results:')
print('Overall Accuracy: ',accuracy)
print('\n')
# Per-class arrays follow the 0/1/2 = negative/neutral/positive encoding
# assigned in create_training_data.
print('Metrics across negative, neutal and positive classes respectively')
print('Recall across classes:',recall)
print('Precision across classes',precision)
print('F score across classes:',fscore)


Lines in original file =  51959
Lines after removing links and tags =  40096
Max length of sequence of characters in dataset =  141


Model Evalutaion Results:
Overall Accuracy:  0.4505082932049224


Metrics across negative, neutal and positive classes respectively
Recall across classes: [0.45778612 0.61803714 0.22680412]
Precision across classes [0.44283122 0.44721689 0.47826087]
F score across classes: [0.4501845  0.51893096 0.30769231]
