# Second - kaggle
You have to clean the data by doing some pre-processing first. Then, apply feature engineering or any other data mining technique you have or haven't learned in the Data Mining course. The final goal is to learn a model that is able to predict the emotion behind each tweet.

### Outline
###### 1. Data preprocessing
###### 2. Methods
###### 2.1. MLP
###### 2.2. BiLSTM
--- 


### 1. Data Preprocessing

#### (1) import package

In [5]:
import pandas as pd
import matplotlib
import tensorflow as tf
from tensorflow import keras

# for tweet cleaning
import emoji
import re

# feature engineering
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# tokenizer
from keras.preprocessing.text import Tokenizer

# for encoding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import numpy as np

# spilting data for traing and validation
from sklearn.model_selection import train_test_split

# enable gpu 1
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

#### (2) read data

In [8]:
data_identification = pd.read_csv("data_emotion/data_identification.csv", skiprows=1, header=None,names=["tweet_id", "identification"])
train_label = pd.read_csv("data_emotion/emotion.csv", skiprows=1, header=None,names=["tweet_id", "emotion"])
train_label

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [9]:
train_id_df = data_identification.loc[data_identification["identification"] == 'train']
train_id = list(train_id_df['tweet_id'])
test_id_df = data_identification.loc[data_identification["identification"] == 'test']
test_id = list(test_id_df['tweet_id'])
train_id_df

Unnamed: 0,tweet_id,identification
1,0x29e452,train
2,0x2b3819,train
4,0x2a2acc,train
5,0x2a8830,train
6,0x20b21d,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [10]:
raw_data = pd.read_json('data_emotion/tweets_DM.json', lines=True)
raw_data

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets
...,...,...,...,...,...
1867530,827,hashtag_tweets,"{'tweet': {'hashtags': ['mixedfeeling', 'butim...",2015-05-12 12:51:52,tweets
1867531,368,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x29d0...",2017-10-02 17:54:04,tweets
1867532,498,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2a6a...",2016-10-10 11:04:32,tweets
1867533,840,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x24fa...",2016-09-02 14:25:06,tweets


In [11]:
source = raw_data['_source']
source

0          {'tweet': {'hashtags': ['Snapchat'], 'tweet_id...
1          {'tweet': {'hashtags': ['freepress', 'TrumpLeg...
2          {'tweet': {'hashtags': ['bibleverse'], 'tweet_...
3          {'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...
4          {'tweet': {'hashtags': [], 'tweet_id': '0x2de2...
                                 ...                        
1867530    {'tweet': {'hashtags': ['mixedfeeling', 'butim...
1867531    {'tweet': {'hashtags': [], 'tweet_id': '0x29d0...
1867532    {'tweet': {'hashtags': [], 'tweet_id': '0x2a6a...
1867533    {'tweet': {'hashtags': [], 'tweet_id': '0x24fa...
1867534    {'tweet': {'hashtags': ['Sundayvibes'], 'tweet...
Name: _source, Length: 1867535, dtype: object

#### (3) acquire the text we need to analyse

In [12]:
ls = []
for line in source:
    ls.append(line['tweet']['tweet_id'])
ls2 = []
for line in source:
    ls2.append(line['tweet']['text'])
id_text = pd.DataFrame({'tweet_id':ls, 'text':ls2})
id_text

Unnamed: 0,tweet_id,text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,0x28b412,"Confident of your obedience, I write to you, k..."
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us..."


#### (4) simplify text

In [26]:
# data clean
def sentence_remove(tweet):
    r = "[_.!+-=——,$%^，。？、~@#￥%……&*《》<>「」“”‘’（）{}【】□‡()/／\\\[\]\"]"
    tweet = emoji.demojize(tweet)                                         # replace emoji to text
    tweet = re.sub(r'[http|https]*://[a-zA-Z0-9.?/&=:]*', 'http', tweet)  # replace web to http
    tweet = re.sub(r'@\S+', '@user', tweet)                               # replace @users
    tweet = re.sub(r, ' ', tweet)                                         # del special symbols
    tweet = re.sub(r'\s+', ' ', tweet)                                    # del redudant space
    # del not-eng language
    tweet = re.sub('[\u4e00-\u9fa5\uac00-\ud7ff\u3040-\u309f\u30a0-\u30ff]', '', tweet)
    return tweet

In [27]:
id_text['text'] = id_text['text'].apply(sentence_remove)
id_text

Unnamed: 0,tweet_id,text
0,0x376b20,People who post add me on Snapchat must be deh...
1,0x2d5350,user As we see Trump is dangerous to freepres...
2,0x28b412,Confident of your obedience I write to you kno...
3,0x1cd5b0,Now ISSA is stalking Tasha face with tears of ...
4,0x2de201,Trust is not the same as faith A friend is so...
...,...,...
1867530,0x316b80,When you buy the last tickets remaining for a ...
1867531,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,0x2a6a4f,user no card left when I wasn't in so I have ...
1867533,0x24faed,Ah corporate life where you can date LH using ...


#### (5) train data  &  test data

In [28]:
train = pd.merge(id_text, train_label, on='tweet_id')
train

Unnamed: 0,tweet_id,text,emotion
0,0x376b20,People who post add me on Snapchat must be deh...,anticipation
1,0x2d5350,user As we see Trump is dangerous to freepres...,sadness
2,0x1cd5b0,Now ISSA is stalking Tasha face with tears of ...,fear
3,0x1d755c,user user Thx for the BEST TIME tonight What ...,joy
4,0x2c91a8,Still waiting on those supplies Liscus LH,anticipation
...,...,...,...
1455558,0x321566,I'm SO HAPPY NoWonder the name of this show Ha...,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy
1455560,0x2cbca6,there's currently two girls walking around the...,joy
1455561,0x24faed,Ah corporate life where you can date LH using ...,joy


In [31]:
test = pd.merge(id_text, test_id_df, on='tweet_id')
test = test.drop(['identification'], axis=1)
test

Unnamed: 0,tweet_id,text
0,0x28b412,Confident of your obedience I write to you kno...
1,0x2de201,Trust is not the same as faith A friend is so...
2,0x218443,When do you have enough ? When are you satisfi...
3,0x2939d5,God woke you up now chase the day GodsPlan God...
4,0x26289a,In these tough times who do YOU turn to as you...
...,...,...
411967,0x2913b4,For this is the message that ye heard from th...
411968,0x2a980e,There is a lad here which hath five barley lo...
411969,0x316b80,When you buy the last tickets remaining for a ...
411970,0x29d0cb,I swear all this hard work gone pay off one da...


In [122]:
# train distribution
temp = train.groupby('emotion')['text'].count().reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Oranges')

Unnamed: 0,emotion,text
4,joy,516017
1,anticipation,248935
7,trust,205478
5,sadness,193437
2,disgust,139101
3,fear,63999
6,surprise,48729
0,anger,39867


### 2.1 Method
#### 2.1.1 MLP

#### (1) feature engineering - Bag Of Words

In [None]:
BOW_2000 = CountVectorizer(max_features=2000, tokenizer=nltk.word_tokenize)
BOW_2000.fit(train['text'])

#### (2) train data & valid data

In [None]:
# 80% train date for train, 20% train data for validation
train_ratio = int(0.8 * train.shape[0])
train_dataset = train[:train_ratio]
valid_dataset = train[train_ratio:]

#### (3) encoding data

In [None]:
X_train = BOW_2000.transform(train_dataset['text'])
y_train = train_dataset['emotion']
X_valid = BOW_2000.transform(valid_dataset['text'])
y_valid = valid_dataset['emotion']
X_test = BOW_2000.transform(test['text'])

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_valid = label_encode(label_encoder, y_valid)

#### (4) MLP model

In [None]:
# I/O check
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))
x = model_input

# 1st hidden layer
x = Dense(units=100)(x)
x = ReLU()(x)
# x = Dropout(0.2)(x)

# 2nd hidden layer
x = Dense(units=200)(x)
x = ReLU()(x)
# x = Dropout(0.4)(x)

# 2nd hidden layer
x = Dense(units=200)(x)
x = ReLU()(x)
# x = Dropout(0.2)(x)

# output layer
x = Dense(units=output_shape)(x)
x = Softmax()(x)

model_output = x

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile( optimizer ='adam',
               loss      ='categorical_crossentropy',
               metrics   =['accuracy'] )

# show model construction
model.summary()

#### (5) training model

In [None]:
# training setting
epochs = 40
batch_size = 32

# training!
history = model.fit(X_train, y_train, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    validation_data = (X_train, y_train))

#### (6) output prediction

In [None]:
pred = model.predict(X_test)
pred_list = label_decode(label_encoder, pred).tolist()
output = pd.DataFrame({'emotion': pred_list})

In [None]:
prediction = pd.concat([test, output], axis=1)
prediction = prediction.drop(columns=['text'])
prediction.rename({'tweet_id': 'id'}, axis=1, inplace=True)
prediction

In [None]:
prediction.to_csv('prediction_5.csv',index=False)

#### 2.1.1 BiLSTM
#### (1) feature engineering

In [32]:
X = train.text.to_numpy().reshape(-1)
y = train.emotion.to_numpy().reshape(-1)
X

array(["People who post add me on Snapchat must be dehydrated Cuz man that's LH ",
       ' user As we see Trump is dangerous to freepress around the world What a LH LH TrumpLegacy CNN',
       'Now ISSA is stalking Tasha face with tears of joy face with tears of joy face with tears of joy LH ',
       ...,
       "there's currently two girls walking around the library just handing out red bulls LH blessyou",
       'Ah corporate life where you can date LH using just the relative anachronism of the last job title that updated it ',
       'Blessed to be living Sundayvibes LH '], dtype=object)

In [35]:
tokenizer = Tokenizer(num_words=10000, oov_token='')
tokenizer.fit_on_texts(X)

#### (2) encoding data

In [38]:
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=100, padding='post')
    return padded_sequences

In [39]:
encode_X = get_sequences(tokenizer, X)
encode_X_test = get_sequences(tokenizer, X_test)
encode_X_test

array([[  70,   72,  621, ...,    0,    0,    0],
       [   3,   65,   35, ...,    0,    0,    0],
       [  71,  661,   13, ...,    0,    0,    0],
       ...,
       [ 455, 1272,  133, ...,    0,    0,    0],
       [1709, 4456,   34, ...,    0,    0,    0],
       [ 146,    5,   23, ...,    0,    0,    0]], dtype=int32)

In [130]:
cls = train.emotion.unique()
cls

array(['anticipation', 'sadness', 'fear', 'joy', 'anger', 'trust',
       'disgust', 'surprise'], dtype=object)

In [131]:
cls_to_id = dict((c, i) for i, c in enumerate(cls))
id_to_cls = dict((v, k) for k, v in cls_to_id.items())

In [133]:
y_encoder = lambda labels: np.array([cls_to_id.get(x) for x in labels])

In [134]:
encode_y = y_encoder(y)
encode_y

array([0, 1, 2, ..., 3, 3, 3])

#### (3) train data & valid data

In [66]:
X_train, X_valid, y_train, y_valid = train_test_split(encode_X, encode_y, test_size=0.2, stratify=encode_y)

#### (4) balance data

In [None]:
# oversampling & undersampling(take long long time)
# from imblearn.over_sampling import SMOTE
# X_train, y_train = SMOTE().fit_resample(X_train, y_train)
# from imblearn.under_sampling import TomekLinks
# X_res, y_res = TomekLinks().fit_resample(X_train, y_train)

#### (5) bilstm model

In [64]:
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(10000, 256, input_length=100),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(CuDNNLSTM(100, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(CuDNNLSTM(100)),
    tf.keras.layers.Dense(8, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 256)          2560000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 200)          286400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               241600    
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 1608      
Total params: 3,089,608
Trainable params: 3,089,608
Non-trainable params: 0
____________________________________________

#### (6) traing model

In [None]:
with tf.device('/gpu:1'):
    history = model.fit(
        X_train, y_train,
        epochs=3,
        batch_size=128,
        validation_data = (X_valid, y_valid)
    )

Epoch 1/3
Epoch 2/3
Epoch 3/3


#### (7) predict test data

In [None]:
pred = model.predict(encode_X_test, batch_size=64)
pred[:5]

array([[6.85520232e-01, 9.05492168e-04, 3.70791968e-04, 1.93090588e-01,
        1.97671223e-04, 1.19416669e-01, 2.75455910e-04, 2.23180788e-04],
       [4.42894846e-01, 6.01801462e-03, 5.59616601e-04, 1.85527876e-01,
        6.98171789e-04, 3.62172246e-01, 1.64707820e-03, 4.82142757e-04],
       [1.48113340e-01, 1.10564664e-01, 3.21119488e-03, 5.71019053e-01,
        5.16959140e-03, 1.31855994e-01, 2.15830300e-02, 8.48317146e-03],
       [3.65934148e-02, 3.10312561e-03, 2.21105153e-03, 7.22815812e-01,
        7.45371857e-04, 2.27550238e-01, 3.90284811e-03, 3.07820737e-03],
       [4.84580070e-01, 5.94164198e-03, 3.05120298e-03, 6.33453280e-02,
        9.75302595e-04, 4.39104348e-01, 1.04179757e-03, 1.96041376e-03]],
      dtype=float32)

#### (8) output prediction

In [137]:
def decode_result(pred_result):
    pred_list = np.argmax(pred_result, axis=1)
    id2cls = np.array([id_to_cls.get(x) for x in pred_list])
    return id2cls

In [138]:
pred_list = decode_result(pred)
output = pd.DataFrame({'emotion': pred_list})
prediction = pd.concat([test, output], axis=1)
prediction = prediction.drop(columns=['text'])
prediction.rename({'tweet_id': 'id'}, axis=1, inplace=True)
prediction.to_csv('prediction_num.csv',index=False)