In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls '/content/gdrive/MyDrive/dataTM_Project'  # take a look at the Google Drive content

Mounted at /content/gdrive
ls: cannot access '/content/gdrive/MyDrive/dataTM_Project': No such file or directory


In [None]:
import pandas as pd
# Development split: tab-separated text file loaded into a DataFrame.
dev = pd.DataFrame(
    data=pd.read_table(
        '/content/gdrive/MyDrive/TM_Project/dev_set.txt',
        delimiter="\t",
    )
)

In [None]:
# Test split: tab-separated text file loaded into a DataFrame.
test = pd.DataFrame(
    data=pd.read_table(
        '/content/gdrive/MyDrive/TM_Project/test_set.txt',
        delimiter="\t",
    )
)

In [None]:
# Second, independent load of the test file; this frame is not passed through
# the cleaning step, so it keeps the sentences exactly as they appear on disk.
test_orig = pd.DataFrame(
    data=pd.read_table(
        '/content/gdrive/MyDrive/TM_Project/test_set.txt',
        delimiter="\t",
    )
)

In [None]:
# Label Counter
def label_counter(dataframe, field):
    """
    Count how many rows of ``dataframe`` carry each distinct value of ``field``.

    :param dataframe: Pandas dataframe holding the examples.
    :param field: Name of the column whose labels are counted.
    :return: Pandas Series produced by ``value_counts`` (label -> number of rows).
    """
    return dataframe[field].value_counts()


# Word Counter
def word_counter(text_list):
    """
    Compute the absolute frequency of every whitespace-separated token found
    in a collection of strings.

    :param text_list: Iterable of strings.
    :return: Pandas Series produced by ``value_counts`` (token -> occurrences).
    """
    # Glue all texts together with a space, then split on any whitespace.
    joined = ' '.join(text_list)
    tokens = joined.split()

    return pd.Series(tokens).value_counts()

In [None]:
# Balanced training split; only the text and its label column are kept.
train_balanced = pd.read_csv(
    '/content/gdrive/MyDrive/TM_Project/training_set_balanced.csv'
)[['sentence', 'emotion']]

In [None]:
# Number of training examples per emotion label.
label_counter(train_balanced, "emotion")

1    2000
2    2000
3    2000
4    2000
5    2000
6    2000
7    2000
8    2000
Name: emotion, dtype: int64

In [None]:
from tqdm import  tqdm
import nltk
import numpy as np
import random

from nltk import ngrams

# Initial Preprocessing
import nltk
# Fetch the NLTK resources used below: the stop-word lists and the WordNet data
# (presumably what WordNetLemmatizer reads at runtime — confirm in the NLTK docs).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
import string

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [None]:
# English stop-word set from NLTK.
# NOTE(review): only referenced by the commented-out stop-word filter in clean().
stop = set(stopwords.words('english'))  # 179 stop words
print(stop)

# ASCII punctuation characters.
# NOTE(review): not read by any code visible in this notebook.
exclude = set(string.punctuation)  # 32 punctuation
print(exclude)

# Shared lemmatizer instance; clean() uses it when lemmatize=True.
lemma = WordNetLemmatizer()

{'off', 'nor', 'our', 'after', 'ours', 'where', 'the', 're', 'few', 've', 'ourselves', "hasn't", 'do', 'against', 'i', 'they', "don't", 't', 'yours', 'such', 'into', 'once', 'and', 'itself', 'these', "aren't", 'shan', 'her', 'ain', 'there', 'on', 'them', 'each', 'can', "that'll", 'an', 'than', "mightn't", 'theirs', 'you', 'because', 'when', 'all', 'how', 'until', 'those', 'now', 'aren', 'didn', 'does', "you've", "she's", 'doing', "weren't", 'why', 'haven', 'wouldn', "should've", 'their', 'd', 'both', 'above', 'was', 'but', 'most', "isn't", 'or', 'while', "didn't", 'this', 'is', 'its', 'he', 'hasn', 'by', 'a', 'isn', 'himself', 'that', "needn't", 'his', 'up', "doesn't", 'doesn', 'of', 'as', 'y', 'from', 'then', 'yourselves', 'again', 'any', "shouldn't", 'had', 'hadn', 'if', 'hers', 'ma', 'having', 'did', 'between', 'out', 'just', 'in', 'own', 'themselves', 'not', 'same', 'o', "haven't", 'be', 'yourself', 'being', 'to', 'who', 'more', 'shouldn', 'been', 'through', 'we', 's', 'my', 'durin

In [None]:
import re

def clean(text_list, lemmatize, stemmer):
    """
    Preprocess a collection of strings.

    Each text goes through, in order: HTML tag removal, lowercasing, replacement
    of every character other than ``a-z``, ``!``, ``?`` and ``'`` by a space,
    optional lemmatization, optional stemming and whitespace normalisation.

    :param text_list: Iterable of strings (list or Pandas Series). It is iterated
        by value, so a Series whose index is not 0..n-1 is handled correctly.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    :return: List with the cleaned strings, in input order.
    """
    # Build the stemmer on demand: the notebook imports SnowballStemmer but never
    # creates an instance, so the former global lookup raised NameError.
    snowball = SnowballStemmer('english') if stemmer else None

    updates = []
    for text in tqdm(text_list):

        # REMOVE HTML TAGS
        # Must run first: once '<' and '>' are replaced by spaces there is no
        # markup left to strip. An explicit parser avoids bs4's parser warning;
        # the separator keeps words on both sides of a tag apart.
        text = BeautifulSoup(text, 'html.parser').get_text(' ')

        # LOWERCASE TEXT
        text = text.lower()

        # REMOVE NUMERICAL DATA AND PUNCTUATION --> EVERYTHING EXCEPT LETTERS, '!', '?' AND APOSTROPHES
        text = re.sub("[^a-z!?']", ' ', text)

        # REMOVE STOP WORDS (intentionally disabled)
        # text = ' '.join([word for word in text.split() if word not in stop])

        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        if snowball is not None:
            text = " ".join(snowball.stem(word) for word in text.split())

        # REMOVE EMPTY SPACES
        text = ' '.join(text.split())

        updates.append(text)

    return updates


def update_df(dataframe, list_updated):
    """
    Overwrite the "sentence" column of ``dataframe`` in place with new values.

    Values are matched to rows by position: the i-th element of ``list_updated``
    replaces the sentence of the i-th row, whatever the frame's index looks like.
    As with any ``DataFrame.update`` call, NA entries in ``list_updated`` leave
    the existing value untouched.

    :param dataframe: Pandas dataframe to modify in place.
    :param list_updated: Sequence of replacement sentences, one per row, in row order.
    :return: None.
    :raises ValueError: If the number of values differs from the number of rows
        (raised by the pandas constructor).
    """
    # Reuse the frame's own index so DataFrame.update aligns row-for-row even when
    # the index is not the default 0..n-1 (e.g. after filtering or shuffling).
    replacement = pd.DataFrame({"sentence": list(list_updated)}, index=dataframe.index)
    dataframe.update(replacement)

In [None]:
# Force every sentence to str so the string methods used by clean() work on each row.
train_balanced["sentence"] = train_balanced["sentence"].apply(str)

In [None]:
# Clean the training sentences (no lemmatization, no stemming) and write the
# result back into the frame's "sentence" column.
updates = clean(train_balanced["sentence"], lemmatize = False, stemmer = False)
update_df(train_balanced, updates)

100%|██████████| 16000/16000 [00:03<00:00, 5168.06it/s]


In [None]:
# First 15 entries of the token-frequency table for the cleaned training sentences.
word_counter(updates)[:15]

you       4997
i         3825
the       3446
to        3240
a         2871
?         2575
and       1803
person    1659
it        1588
of        1548
!         1493
that      1432
me        1279
in        1278
is        1260
dtype: int64

In [None]:
# Apply the same cleaning to the development sentences and store them in the frame.
updates_dev = clean(dev["sentence"], lemmatize = False, stemmer = False)
update_df(dev, updates_dev)

# Preview the first rows of the development frame.
dev.head(3)

100%|██████████| 1000/1000 [00:00<00:00, 5041.82it/s]


Unnamed: 0,sentence,emotion
0,what happens to the gold in our safe ?,4
1,natural to get cold feet,8
2,not very lucky is he ?,7


In [None]:
# Apply the same cleaning to the test sentences and store them in the frame.
updates_test = clean(test["sentence"], lemmatize = False, stemmer = False)
update_df(test, updates_test)

# Preview the first rows of the test frame.
test.head(3)

100%|██████████| 2000/2000 [00:00<00:00, 2559.97it/s]


Unnamed: 0,sentence
0,come let's go get that automobile
1,well some other time then ?
2,he's in trouble boy ?


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 21.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 63.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 16.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
from tensorflow.keras.utils import to_categorical
import transformers
from transformers import AutoTokenizer,TFBertModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [None]:
# One-hot encode the labels. Emotion ids run from 1 to 8, so column 0 is never
# set; num_classes is fixed to 9 so both matrices always have the same width as
# the 9-unit output layer, even if a split happens to lack the highest label.
y_train = to_categorical(train_balanced.emotion, num_classes=9)
y_dev = to_categorical(dev.emotion, num_classes=9)

In [None]:
# Load the pretrained tokenizer and the TensorFlow BERT encoder.
# NOTE(review): clean() lowercases every sentence, yet this is the *cased*
# checkpoint — "bert-base-uncased" may be the better match; confirm before changing.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert = TFBertModel.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Tokenize the input (takes some time)
# here tokenizer using from bert-base-cased
def encode_sentences(sentences, max_length=38):
    """
    Tokenize a list of sentences into fixed-width TensorFlow tensors.

    :param sentences: List of strings to encode.
    :param max_length: Number of token ids per row; longer inputs are truncated
        and shorter ones are padded up to this width.
    :return: Tokenizer output containing "input_ids" and "attention_mask".
    """
    return tokenizer(
        text=sentences,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # Pad to max_length instead of to the longest sentence of this call, so
        # the tensor width always equals the model's fixed input length.
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True)


x_train = encode_sentences(train_balanced.sentence.tolist())
x_dev = encode_sentences(dev.sentence.tolist())

In [None]:
# Inspect the token-id tensor of the training set.
x_train["input_ids"]

<tf.Tensor: shape=(16000, 38), dtype=int32, numpy=
array([[ 101, 1322, 1104, ...,    0,    0,    0],
       [ 101, 3074,  106, ...,    0,    0,    0],
       [ 101, 1150, 1103, ...,    0,    0,    0],
       ...,
       [ 101, 1131, 1169, ...,    0,    0,    0],
       [ 101, 1440, 1120, ...,    0,    0,    0],
       [ 101, 1115,  112, ...,    0,    0,    0]], dtype=int32)>

In [None]:
# Inspect the token-id tensor of the development set.
x_dev["input_ids"]

<tf.Tensor: shape=(1000, 38), dtype=int32, numpy=
array([[ 101, 1184, 5940, ...,    0,    0,    0],
       [ 101, 2379, 1106, ...,    0,    0,    0],
       [ 101, 1136, 1304, ...,    0,    0,    0],
       ...,
       [ 101, 1508, 1122, ...,    0,    0,    0],
       [ 101, 1169, 1128, ...,    0,    0,    0],
       [ 101, 1128, 1341, ...,    0,    0,    0]], dtype=int32)>

In [None]:
# Pull the tokenized training tensors into standalone names.
# NOTE(review): `input_ids` is rebound to a Keras Input in the model-definition
# cell and `attention_mask` is not read by any visible code, so this cell looks
# redundant — confirm and remove.
input_ids = x_train["input_ids"]
attention_mask = x_train['attention_mask']

In [None]:
max_len = 38  # must equal the max_length used when tokenizing
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the BERT output is the per-token hidden-state sequence.
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# Collapse the token axis by taking the maximum of every feature.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
# Emit raw logits: the loss in this notebook is CategoricalCrossentropy with
# from_logits=True, which expects unnormalised scores. The former sigmoid
# activation contradicted that setting and is not the right normaliser for a
# single-label task anyway. argmax-based predictions are unaffected.
# 9 units = label ids 1..8 plus the unused index 0 created by to_categorical.
y = Dense(9)(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# presumably layers[2] is the BERT encoder (after the two Input layers) — keep it trainable for fine-tuning
model.layers[2].trainable = True

In [None]:
optimizer = Adam(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = CategoricalCrossentropy(from_logits=True)
# Explicit list: a stray trailing comma used to turn this into an accidental 1-tuple.
# NOTE(review): despite its display name this is plain categorical accuracy,
# not a class-balanced accuracy.
metric = [CategoricalAccuracy('balanced_accuracy')]
# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

In [None]:
# Fine-tune for one epoch, validating on the development split.
train_inputs = {
    'input_ids': x_train['input_ids'],
    'attention_mask': x_train['attention_mask'],
}
dev_inputs = {
    'input_ids': x_dev['input_ids'],
    'attention_mask': x_dev['attention_mask'],
}
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(dev_inputs, y_dev),
    epochs=1,
    batch_size=36,
)

  return dispatch_target(*args, **kwargs)




In [None]:
# Raw model outputs for the development inputs; display the first row as a sample.
predicted_raw = model.predict({'input_ids':x_dev['input_ids'],'attention_mask':x_dev['attention_mask']})
predicted_raw[0]

array([0.05574567, 0.39466345, 0.5881086 , 0.5002264 , 0.72052556,
       0.15188901, 0.37721   , 0.6877215 , 0.49240294], dtype=float32)

In [None]:
# Predicted label = index of the largest output. The index is the emotion id
# itself because to_categorical put label k in column k.
y_predicted = np.argmax(predicted_raw, axis = 1)
# Reference labels of the development split.
y_true = dev.emotion

In [None]:
x_test = tokenizer(
    text=test.sentence.tolist(),
    add_special_tokens=True,
    max_length=38,
    truncation=True,
    # Pad to max_length instead of to the longest test sentence: the model's
    # Input layers have a fixed width of 38, so a narrower tensor would be rejected.
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

# TEST PREDICTION
test_predicted_raw = model.predict({'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']})
# Index of the largest output is the predicted emotion id.
test_y_predicted = np.argmax(test_predicted_raw, axis=1)

# Pair the predictions with the sentences from the uncleaned copy of the test set.
test_deliver = pd.DataFrame({'sentence': test_orig['sentence'], 'emotion': test_y_predicted})

In [None]:
# Build the "<sentence>\t<emotion>" answer column; the label is converted to str
# first so that "\t".join can concatenate it with the sentence.
test_deliver["emotion"] = test_deliver["emotion"].apply(str)
test_deliver["answer"] = test_deliver[["sentence","emotion"]].agg("\t".join, axis=1)

In [None]:
# Save the test predictions as CSV and hand the file to Colab's download helper.
from google.colab import files
test_deliver.to_csv('test_deliver_final.csv') 
files.download('test_deliver_final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.metrics import classification_report
labels = {"Anger": 1, "Anticipation": 2, "Disgust": 3, "Fear": 4, "Joy": 5, "Sadness": 6, "Surprise": 7, "Trust": 8}
# Pin the report to the eight real emotion ids. The network has 9 output units
# (index 0 is never a true label); without `labels`, a single prediction of 0
# would give sklearn 9 classes to match against 8 names.
print(classification_report(
    y_true,
    y_predicted,
    labels=list(labels.values()),
    target_names=list(labels.keys())))

              precision    recall  f1-score   support

       Anger       0.58      0.32      0.41       211
Anticipation       0.57      0.46      0.51       170
     Disgust       0.33      0.56      0.42        77
        Fear       0.41      0.41      0.41       104
         Joy       0.61      0.55      0.58        97
     Sadness       0.43      0.44      0.43        87
    Surprise       0.34      0.62      0.44        96
       Trust       0.53      0.53      0.53       158

    accuracy                           0.47      1000
   macro avg       0.47      0.49      0.47      1000
weighted avg       0.50      0.47      0.47      1000



In [None]:
# TRAIN CLASSIFICATION REPORT
train_predicted_raw = model.predict({'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']})
# Index of the largest output is the predicted emotion id.
train_y_predicted = np.argmax(train_predicted_raw, axis=1)
train_y_true = train_balanced.emotion
labels = {"Anger": 1, "Anticipation": 2, "Disgust": 3, "Fear": 4, "Joy": 5, "Sadness": 6, "Surprise": 7, "Trust": 8}
# Pin the report to the eight real emotion ids. The network has 9 output units
# (index 0 is never a true label); without `labels`, a single prediction of 0
# would give sklearn 9 classes to match against 8 names.
print(classification_report(
    train_y_true,
    train_y_predicted,
    labels=list(labels.values()),
    target_names=list(labels.keys())))

              precision    recall  f1-score   support

       Anger       0.80      0.71      0.75      2000
Anticipation       0.81      0.77      0.79      2000
     Disgust       0.80      0.80      0.80      2000
        Fear       0.85      0.92      0.88      2000
         Joy       0.87      0.90      0.88      2000
     Sadness       0.84      0.86      0.85      2000
    Surprise       0.83      0.89      0.86      2000
       Trust       0.80      0.74      0.77      2000

    accuracy                           0.82     16000
   macro avg       0.82      0.82      0.82     16000
weighted avg       0.82      0.82      0.82     16000



In [None]:
# Show the sentences currently held in the development frame.
dev.sentence

0                 What happens to the gold in our safe ?
1                             Natural to get cold feet .
2                               Not very lucky , is he ?
3      I'm just a little anxious to get up there and ...
4      Did you think we don't know about your affair ...
                             ...                        
995                     All I ask of you is be careful .
996                          You don't like jazz , pal ?
997                                          Put it on .
998    Can you ever imagine [PERSON] being in a spot ...
999    You think top gun up there will be able to tel...
Name: sentence, Length: 1000, dtype: object

In [None]:
# Pair each development sentence with its predicted emotion id.
dev_deliver = pd.DataFrame({'sentence': dev['sentence'], 'emotion': y_predicted})

In [None]:
# Build the "<sentence>\t<emotion>" answer column; the label is converted to str
# first so that "\t".join can concatenate it with the sentence.
dev_deliver["emotion"] = dev_deliver["emotion"].apply(str)
dev_deliver["answer"] = dev_deliver[["sentence","emotion"]].agg("\t".join, axis=1)

In [None]:
# Save the development predictions as CSV and hand the file to Colab's download helper.
from google.colab import files
dev_deliver.to_csv('dev_deliver_final3.csv') 
files.download('dev_deliver_final3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write one tab-joined answer per line to a UTF-8 text file.
with open("dev_results_final.txt", "w", encoding='utf-8') as output:
    output.writelines(answer + '\n' for answer in dev_deliver["answer"].values)

In [None]:
# Display the delivery frame: sentence, predicted emotion and the tab-joined answer.
dev_deliver

Unnamed: 0,sentence,emotion,answer
0,What happens to the gold in our safe ?,4,What happens to the gold in our safe ? 4
1,Natural to get cold feet .,3,Natural to get cold feet . 3
2,"Not very lucky , is he ?",7,"Not very lucky , is he ? 7"
3,I'm just a little anxious to get up there and ...,3,I'm just a little anxious to get up there and ...
4,Did you think we don't know about your affair ...,7,Did you think we don't know about your affair ...
...,...,...,...
995,All I ask of you is be careful .,8,All I ask of you is be careful . 8
996,"You don't like jazz , pal ?",7,"You don't like jazz , pal ? 7"
997,Put it on .,2,Put it on . 2
998,Can you ever imagine [PERSON] being in a spot ...,7,Can you ever imagine [PERSON] being in a spot ...
