<a href="https://colab.research.google.com/github/harrymkwn/hackinutu/blob/main/CodeMix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Initialization

In [142]:
from tqdm import tqdm

import itertools
import emoji
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Attention
from tensorflow.keras.layers import Flatten, Dropout, Dense
from tensorflow.keras.layers import LSTM
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.metrics import classification_report

In [14]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#Data Preparation

In [122]:
df_train = pd.read_csv('/content/gdrive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/hinglish-train.csv')
df_test = pd.read_csv('/content/gdrive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/hinglish-test.csv')
df_dev = pd.read_csv('/content/gdrive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/hinglish-dev.csv')


df_train.head()

Unnamed: 0,uids,tokens,labels,sentiment
0,4330,"['nen', 'Ã¡', 'vist', 'bolest', 'vztek', 'smute...","['Eng', 'O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin'...",neutral
1,41616,"['@', 'nehantics', 'Haan', 'yaar', 'neha', 'ðŸ˜”ðŸ˜”...","['O', 'Hin', 'Hin', 'Hin', 'Hin', 'O', 'Hin', ...",neutral
2,6648,"['@', 'RahulGandhi', 'television', 'media', 'c...","['O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin', 'Hin'...",negative
3,2512,"['@', 'AmitShah', '@', 'narendramodi', 'All', ...","['O', 'Hin', 'O', 'Hin', 'Hin', 'Hin', 'Eng', ...",positive
4,610,"['@', 'Nehr', '_', 'who', '@', 'TypoMantri', '...","['O', 'Eng', 'O', 'Eng', 'O', 'Hin', 'O', 'Hin...",neutral


In [26]:
def load_dict_smileys():
    return {
        ":â€‘)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":â€‘D":"smiley",
        "8â€‘D":"smiley",
        "xâ€‘D":"smiley",
        "Xâ€‘D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":â€‘(":"sad",
        ":â€‘c":"sad",
        ":â€‘<":"sad",
        ":â€‘[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'â€‘(":"sad",
        ":'(":"sad",
        ":â€‘P":"playful",
        "Xâ€‘P":"playful",
        "xâ€‘p":"playful",
        ":â€‘p":"playful",
        ":â€‘Ãž":"playful",
        ":â€‘Ã¾":"playful",
        ":â€‘b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Ãž":"playful",
        ":Ã¾":"playful",
        ":b":"playful",
        "<3":"love"
        }

# source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
def load_dict_contractions():
    return {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }


def tweet_cleaning_for_sentiment_analysis(tweet):
    # lower case
    tweet = tweet.lower()
        
    # replace contractions
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("â€™","'")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    tweet = " ".join(reformed)
    
    # standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    
    # replace emoticons
    SMILEY = load_dict_smileys()  
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    
    # demojize emojis
    tweet = emoji.demojize(tweet)
    
    # other cleaning
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split())
    # replace duplicate characters
    tweet = re.sub(r"(.)\1{2,}", r'\1\1', tweet)

    return tweet

def clean(t,l):
    for i in range(len(t)):
        temp = tweet_cleaning_for_sentiment_analysis(' '.join(t[i])).split(' ')
        t[i] = []
        j=0
        while j<len(temp):
            t[i].append(temp[j])
            j+=1
        
    return t, l

In [27]:
df_train.head()

Unnamed: 0,uids,tokens,labels,sentiment
0,4330,"['nen', 'Ã¡', 'vist', 'bolest', 'vztek', 'smute...","['Eng', 'O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin'...",neutral
1,41616,"['@', 'nehantics', 'Haan', 'yaar', 'neha', 'ðŸ˜”ðŸ˜”...","['O', 'Hin', 'Hin', 'Hin', 'Hin', 'O', 'Hin', ...",neutral
2,6648,"['@', 'RahulGandhi', 'television', 'media', 'c...","['O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin', 'Hin'...",negative
3,2512,"['@', 'AmitShah', '@', 'narendramodi', 'All', ...","['O', 'Hin', 'O', 'Hin', 'Hin', 'Hin', 'Eng', ...",positive
4,610,"['@', 'Nehr', '_', 'who', '@', 'TypoMantri', '...","['O', 'Eng', 'O', 'Eng', 'O', 'Hin', 'O', 'Hin...",neutral


In [123]:
df_train['tokens'] = df_train['tokens'].apply(lambda tweet : tweet_cleaning_for_sentiment_analysis(tweet))
df_test['tokens'] = df_test['tokens'].apply(lambda tweet : tweet_cleaning_for_sentiment_analysis(tweet))
df_dev['tokens'] = df_dev['tokens'].apply(lambda tweet : tweet_cleaning_for_sentiment_analysis(tweet))

df_train.head()

Unnamed: 0,uids,tokens,labels,sentiment
0,4330,"['nen', 'Ã¡', 'vist', 'bolest', 'vztek', 'smute...","['Eng', 'O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin'...",neutral
1,41616,"['@', 'nehantics', 'haan', 'yaar', 'neha', ' p...","['O', 'Hin', 'Hin', 'Hin', 'Hin', 'O', 'Hin', ...",neutral
2,6648,"['@', 'rahulgandhi', 'television', 'media', 'c...","['O', 'Eng', 'Eng', 'Eng', 'Eng', 'Hin', 'Hin'...",negative
3,2512,"['@', 'amitshah', '@', 'narendramodi', 'all', ...","['O', 'Hin', 'O', 'Hin', 'Hin', 'Hin', 'Eng', ...",positive
4,610,"['@', 'nehr', '_', 'who', '@', 'typomantri', '...","['O', 'Eng', 'O', 'Eng', 'O', 'Hin', 'O', 'Hin...",neutral


In [42]:
def to_list(df):
  return list(df['uids']),list(df['tokens']),list(df['labels']),list(df['sentiment']),len(list(df['sentiment']))

In [124]:
u_train, t_train, l_train, s_train, max_length = to_list(df_train)
u_dev, t_dev, l_dev, s_dev, max_length_dev = to_list(df_dev)
u_test, t_test, l_test, s_test, max_length_test = to_list(df_test)

In [125]:
tok_w = Tokenizer(char_level=False,lower=True,oov_token='UNK' )
tok_w.fit_on_texts(t_train) 

MAX_LEN = 60
trainInput_w = pad_sequences(tok_w.texts_to_sequences(t_train),
                          maxlen=MAX_LEN, padding="post")

print(trainInput_w[10])
valInput_w = pad_sequences(tok_w.texts_to_sequences(t_dev) ,
                          maxlen=MAX_LEN, padding="post")
testInput_w = pad_sequences(tok_w.texts_to_sequences(t_test),
                          maxlen=MAX_LEN, padding="post")

[   2    2 3388  165   18 2123 9780 3828   74 1579  876    2    2  111
  183 4362    2    2    2    2 9781    8   31  110 9782    7    5    2
    2    4    2    2    6    2    2 9783    3    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [144]:
maxval =0

for i in trainInput_w:
  for num in i:
    if maxval < num:
      maxval = num

print(maxval)

48823


In [143]:
le = preprocessing.LabelEncoder()
le.fit(s_train)
trainLabels = to_categorical(le.transform(s_train))
valLabels = to_categorical(le.transform(s_dev))



In [148]:
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(maxval)+1)
for i in trainInput_w:
  a = label_binarizer.transform(trainInput_w)
  print(a)

TypeError: ignored

#classifier 

In [133]:
max_features = len(tok_w.word_index)
maxlen = 60
embedding_size = 100

# Convolution
kernel_size = 5
filters = 128
pool_size = 4


In [140]:
def generate_model():
  model = Sequential()
  model.add(Embedding(max_features+1, embedding_size, input_length=maxlen))
  model.add(Conv1D(filters,kernel_size,padding='valid',activation='relu',strides=1))
  model.add(MaxPooling1D(pool_size=pool_size))
  model.add(Flatten())
  model.add(Dropout(0.3))
  model.add(Dense(3, activation='softmax'))
  model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


  model.summary()
  return model

In [135]:
def generate_lstm():
  model = Sequential()
  model.add(Embedding(max_features+1, embedding_size, input_length=maxlen))
  model.add(LSTM(embedding_size))
  model.add(Dense(10,activation='relu'))
  model.add(Dense(3, activation='softmax'))
  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  model.summary()

  return model

In [136]:
model = generate_model()
type(model)

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 60, 100)           4882400   
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 56, 128)           64128     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 14, 128)           0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 1792)              0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 1792)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 5379      
Total params: 4,951,907
Trainable params: 4,951,907
Non-trainable params: 0
___________________________________________

tensorflow.python.keras.engine.sequential.Sequential

In [137]:
model.fit(trainInput_w, trainLabels,batch_size=32,validation_data=(valInput_w,valLabels),epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f90d13a36a0>

In [138]:
lstm = generate_lstm()
type(lstm)

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 60, 100)           4882400   
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_19 (Dense)             (None, 10)                1010      
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 33        
Total params: 4,963,843
Trainable params: 4,963,843
Non-trainable params: 0
_________________________________________________________________


tensorflow.python.keras.engine.sequential.Sequential

In [139]:
lstm.fit(trainInput_w, trainLabels,batch_size=32,validation_data=(valInput_w,valLabels),epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f90d114efd0>

In [75]:
predictions = model.predict(testInput_w)
predictions = np.argmax(predictions,axis=-1)

# write predictions to file
with open('preds.txt', 'w') as out:
    out.write('Uid,Sentiment')
    for i, uid in enumerate(u_test):
        if predictions[i] == 0:
            sentiment = 'negative'
        elif predictions[i] == 1:
            sentiment = 'neutral'
        else:
            sentiment = 'positive'
        out.write("\n%s,%s"%(uid, sentiment))

In [76]:
# load correct labels
test = pd.read_csv('Hinglish_test_labels.txt')
# load predictions
preds = pd.read_csv('preds.txt')

# compute evaluation metrics
results = {'preds': classification_report(test['Sentiment'], preds['Sentiment'], labels=['positive', 'neutral', 'negative'], output_dict=True, digits=6)}

In [149]:
# format and print scores
formatted_results = [['model', 'precision', 'recall', 'accuracy', 'f1-score']]
for ki in results.keys():
    scores = results[ki]['macro avg']
    model = [ki, scores['precision'], scores['recall'], results[ki]['accuracy'], scores['f1-score']]
    formatted_results.append(model)
    
formatted_results = pd.DataFrame(formatted_results[1:], columns=formatted_results[0])
print(formatted_results)


   model  precision    recall  accuracy  f1-score
0  preds   0.450229  0.457347      0.44  0.426156
