# Using machine learning for sentiment analysis
The goal of this project is to train a Model for Text Sentiment Classification.

## Data Preprocess & Cleansing

In [5]:
import json
import pandas as pd

df_raw = pd.read_json('dm-lab-2-private-competition/final_posts.json')

print(df_raw.iloc[0,0])
df = pd.DataFrame()
df["post_id"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["post_id"])
df["text"]    = df_raw["root"].apply(lambda x: x["_source"]["post"]["text"])
df["hashtags"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["hashtags"])

{'_type': 'post', '_source': {'post': {'post_id': '0x61fc95', 'text': 'We got the ranch, loaded our guns and sat up till sunrise.', 'hashtags': []}}}


In [6]:
ident = pd.read_csv('dm-lab-2-private-competition/data_identification.csv')
emotion = pd.read_csv('dm-lab-2-private-competition/emotion.csv')
emotion['post_id'] = emotion['id']
df = df.merge(emotion[['post_id', 'emotion']], on='post_id', how='left')
df['ident'] = ident['split']
df

Unnamed: 0,post_id,text,hashtags,emotion,ident
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
4,0xaba820,and that got my head bobbing a little bit.,[],,test
...,...,...,...,...,...
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[],,test
64169,0xb5a35a,Texans and Astros both shut out tonight. Houst...,"[texans, astros, sadness, losers]",sadness,train


In [None]:
train_df = df[df['ident'] == 'train']
test_df =  df[df['ident'] == 'test']

Unnamed: 0,post_id,text,hashtags,emotion,ident
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train
...,...,...,...,...,...
64164,0xd740f2,why is everybody seem sp serious?,[],joy,train
64165,0x99267e,"You can cross fuck off, its 10f all winter in ...",[],anger,train
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train


In [8]:
train_df.head()

Unnamed: 0,post_id,text,hashtags,emotion,ident
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train


## Text Preprocessing
Text Preprocessing is traditionally an important step for Natural Language Processing (NLP) tasks. 

It transforms text into a more digestible form so that deep learning algorithms can perform better.

The Preprocessing steps taken are:

1. Lower Casing: Each text is converted to lowercase.

2. Replacing URLs: Links starting with 'http' or 'https' or 'www' are replaced by '<url>'.

3. Replacing Usernames: Replace @Usernames with word '<user>'. [eg: '@Kaggle' to '<user>'].

4. Replacing Consecutive letters: 3 or more consecutive letters are replaced by 2 letters. [eg: 'Heyyyy' to 'Heyy']

5. Replacing Emojis: Replace emojis by using a regex expression. [eg: ':)' to '<smile>']

6. Replacing Contractions: Replacing contractions with their meanings. [eg: "can't" to 'can not']

7. Removing Non-Alphabets: Replacing characters except Digits, Alphabets and pre-defined Symbols with a space.

In [12]:
import re
contractions = pd.read_csv('dm-lab-2-private-competition/contractions.csv.xls', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Defining regex patterns.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for emojis
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):

    tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    # Replace @USERNAME to '<user>'.
    tweet = re.sub(userPattern,'<user>', tweet)
    
    # Replace 3 or more consecutive letters by 2 letter.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to seperate words (After replacing URLS).
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

train_df['processed_text'] = train_df.text.apply(preprocess_apply)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['processed_text'] = train_df.text.apply(preprocess_apply)


In [14]:
# the texts now turn into this
count=0
for row in train_df.itertuples():
    print("Text:", row[2])
    print('processed_text:', row[4],"\n")
    count+=1
    if count>10:
        break

    

Text: I bet there is an army of married couples who did the same exact thing.
processed_text: joy 

Text: This could only end badly.
processed_text: fear 

Text: My sister squeezed a lime in her milk when she was 12. Same thing happened, but we told her it would happen AFTER she did it ..
processed_text: joy 

Text: Thank you so much❤️
processed_text: joy 

Text: Stinks because ive been in this program for a year with no pay.....back to the drawing board.
processed_text: joy 

Text: The overall response is try and empower women, abolish prostitution and stop giving lazy men money because they want to live out their idiotic fantasy lives. 
processed_text: anger 

Text: Your market sucks
processed_text: anger 

Text: here’s hoping the same is true for me!
processed_text: joy 

Text: She looks like a televangelist.
processed_text: joy 

Text: Rap that will Cut other raper's throat. Who said that? @Paedeezy #badd #wicked. #bright city lights
processed_text: anger 

Text: She’s a good perso

## Model Training

In [62]:
import numpy as np
import re
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout



X_data = train_df["processed_text"].astype(str).values
y_data = train_df["emotion"].values

# 切 train / test（這裡 5% 當 test）
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
    stratify=y_data  # 依照情緒比例分層抽樣
)

print("Train size:", len(X_train_text))
print("Test size :", len(X_test_text))

Train size: 45495
Test size : 2395


In [63]:
# 載入 Pretrained Google_news Word2Vec 模型

w2v_path = "dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

embedding_dim = w2v.vector_size  
print("Embedding dim:", embedding_dim)


def text_to_vec(text, model=w2v, embedding_dim=embedding_dim):
    """把一則文字轉成平均的 Word2Vec 向量"""
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")
    tokens = re.findall(r"\w+", text.lower())
    vecs = [model[w] for w in tokens if w in model.key_to_index]
    
    if not vecs:
        # 如果裡面沒有任何在詞向量中的字，就給 0 向量
        return np.zeros(embedding_dim, dtype="float32")
    
    return np.mean(vecs, axis=0)


#  將文字轉成向量 

X_train = np.vstack([text_to_vec(t) for t in X_train_text])
X_test  = np.vstack([text_to_vec(t) for t in X_test_text])

print("X_train shape:", X_train.shape)  # (n_train, embedding_dim)
print("X_test shape :", X_test.shape)

Embedding dim: 300
X_train shape: (45495, 300)
X_test shape : (2395, 300)


In [None]:
# Python can't understand emotions like "sadness" or "joy", so here we have to change all 6 emotions into a numeric category of 0~6
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("num_classes:", num_classes)

Classes: ['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
num_classes: 6


In [68]:
# Building the model
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(num_classes, activation="softmax"),  
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # y 是整數 label，所以用 sparse
    metrics=["accuracy"],
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
es = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)
rlr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)


model.summary()

In [70]:
# Input training data
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=50,
    callbacks=[es, rlr],
    validation_split=0.1,
    verbose=1,
)

# Evaluation
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")


Epoch 1/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 790us/step - accuracy: 0.6039 - loss: 1.0566 - val_accuracy: 0.5888 - val_loss: 1.0786 - learning_rate: 0.0010
Epoch 2/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 725us/step - accuracy: 0.6100 - loss: 1.0369 - val_accuracy: 0.5895 - val_loss: 1.0857 - learning_rate: 0.0010
Epoch 3/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 754us/step - accuracy: 0.6174 - loss: 1.0176 - val_accuracy: 0.5943 - val_loss: 1.0738 - learning_rate: 0.0010
Epoch 4/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 697us/step - accuracy: 0.6253 - loss: 0.9958 - val_accuracy: 0.5949 - val_loss: 1.0734 - learning_rate: 0.0010
Epoch 5/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674us/step - accuracy: 0.6328 - loss: 0.9709 - val_accuracy: 0.5921 - val_loss: 1.0796 - learning_rate: 0.0010
Epoch 6/50
[1m1223/1280[0m [32m━━━━━━━━━━━━━━━━

##  Model Implementation

In [None]:
# Unclassified text data
test_df['processed_text'] = test_df.text.apply(preprocess_apply)
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['processed_text'] = test_df.text.apply(preprocess_apply)


Unnamed: 0,post_id,text,hashtags,emotion,ident,processed_text
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test,we got the ranch loaded our guns and sat up t...
4,0xaba820,and that got my head bobbing a little bit.,[],,test,and that got my head bobbing a little bit
5,0x66e44d,Same. Glad it's not just out store.,[],,test,same glad it is not just out store
6,0xc03cf5,Like always i will wait and see thanks for the...,[],,test,like always i will wait and see thanks for the...
8,0x02f65a,"There's a bit of room between ""not loving sub-...",[],,test,thereis a bit of room between not loving sub ...
...,...,...,...,...,...,...
64146,0x0f273c,We all do it sometimes don't worry.,[],,test,we all do it sometimes do not worry
64150,0xfc4c5d,This New Year I visited more relatives than us...,[],,test,this new year i visited more relatives than us...
64157,0xb318a3,R u a dad or did ur dad leave u both have bad ...,[],,test,r u a dad or did ur dad leave u both have bad ...
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[],,test,i got my first raspberry from a crowd surfer f...


In [71]:
predict = np.vstack([text_to_vec(t) for t in test_df['processed_text']])
pred_result= model.predict(predict)
pred_result 

[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step


array([[2.22979933e-01, 3.68012721e-03, 2.64608636e-02, 5.12456417e-01,
        7.52074877e-03, 2.26901829e-01],
       [5.79092763e-02, 5.67220151e-03, 6.81349695e-01, 1.44356295e-01,
        3.10226120e-02, 7.96900317e-02],
       [9.01271924e-02, 8.57952144e-03, 7.43259070e-03, 6.33115232e-01,
        1.22691981e-01, 1.38053477e-01],
       ...,
       [2.22188517e-01, 3.17916125e-02, 8.08604062e-03, 4.45347011e-01,
        2.19614059e-01, 7.29727298e-02],
       [3.13539892e-01, 1.63444970e-02, 3.00939441e-01, 3.13508034e-01,
        4.22553048e-02, 1.34128630e-02],
       [1.01556545e-02, 1.40085549e-05, 1.01512715e-05, 9.83254015e-01,
        9.00041414e-05, 6.47614058e-03]], dtype=float32)

In [72]:
# Each row has 6 numbers representing possible emotions. 
print("Classes:", label_encoder.classes_)
pred_result.shape 

Classes: ['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']


(16281, 6)

In [76]:
pred_class_idx = np.argmax(pred_result, axis=1) # For each text choose the most likely emotion
pred_labels = label_encoder.inverse_transform(pred_class_idx) # Changing the numeric emotion category back to words.
result_df = pd.DataFrame({"Text_id" : test_df["post_id"],"text": test_df["text"], "emotion" :pred_labels})
result_df
# From the train we learnt that this model has a 62% accuracy. 

Unnamed: 0,Text_id,text,emotion
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",joy
4,0xaba820,and that got my head bobbing a little bit.,fear
5,0x66e44d,Same. Glad it's not just out store.,joy
6,0xc03cf5,Like always i will wait and see thanks for the...,joy
8,0x02f65a,"There's a bit of room between ""not loving sub-...",joy
...,...,...,...
64146,0x0f273c,We all do it sometimes don't worry.,joy
64150,0xfc4c5d,This New Year I visited more relatives than us...,anger
64157,0xb318a3,R u a dad or did ur dad leave u both have bad ...,joy
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,anger
