In [None]:
import csv
import re

import numpy as np
import pandas as pd

from io import StringIO
import requests

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, linear_model, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, make_scorer, recall_score, precision_score, f1_score
from sklearn.neural_network import MLPClassifier

# Seed handed to `shuffle(...)` and `MLPClassifier(...)` as `random_state`
# so that repeated runs of the notebook give the same results.
RANDOM_STATE = 1

In [None]:
# Published Google Sheets exports; `output=tsv` in the query string means the
# response body is tab-separated text.
# url_train_dev -> data used for fitting, url_test -> data used for evaluation.
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [None]:
def load_dataset(url, timeout=30):
    """Download a tab-separated sheet and return it as a two-column DataFrame.

    Parameters
    ----------
    url : str
        Address of the TSV export. Its first row is consumed as a header by
        ``pd.read_csv`` and then replaced by the fixed column names below.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the kernel indefinitely.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are named ``tweet`` and ``label``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    ValueError
        If the downloaded table does not have exactly two columns.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as TSV.
    r.raise_for_status()
    data = r.content.decode('utf8')
    # NOTE(review): default quoting treats '"' as a quote character, so a tweet
    # with an unbalanced double quote could swallow following rows; consider
    # quoting=csv.QUOTE_NONE after checking the row count against the sheet.
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [None]:
# fetch both sheets in one go: training/dev data first, then the test data
df_train_dev, df_test = [load_dataset(src) for src in (url_train_dev, url_test)]

In [None]:
# Quick sanity check of the downloaded training frame: number of rows,
# column dtypes and non-null counts.
df_train_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52675 entries, 0 to 52674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   52675 non-null  object
 1   label   52675 non-null  object
dtypes: object(2)
memory usage: 823.2+ KB


In [None]:
# clean data: strip urls, @mentions and #hashtags from a tweet
def clean(s):
    """Return ``s`` with URLs, @mentions and #hashtags removed.

    Each removed token is replaced by the empty string, so the whitespace
    that surrounded it is left in place. The substitutions run in a fixed
    order (URLs, then mentions, then hashtags).

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet without ``http://``/``https://`` links, ``@...`` tokens
        and ``#...`` tokens.
    """
    # Raw strings keep `\S` a regex class instead of an invalid str escape.
    s = re.sub(r'https?://\S+', '', s)
    s = re.sub(r'@\S+', '', s)
    s = re.sub(r'#\S+', '', s)
    return s
  
# run the cleaner over the tweet column of both frames
df_train_dev['tweet'] = df_train_dev['tweet'].apply(clean)
df_test['tweet'] = df_test['tweet'].apply(clean)

In [None]:
# Preprocess the training and testing dataframes.

# Shuffle both frames reproducibly with the shared seed.
df_train = shuffle(df_train_dev, random_state=RANDOM_STATE)
df_test = shuffle(df_test, random_state=RANDOM_STATE)

X_train = df_train['tweet']
y_train = df_train['label']
X_test = df_test['tweet']
y_test = df_test['label']

# Bag-of-words features. The vocabulary is learned from the training tweets
# only; fit_transform builds it and the training matrix in a single pass
# instead of tokenising the training corpus twice (fit, then transform).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Integer-encode the training labels. y_test is deliberately left as strings:
# predictions are mapped back through lbl_enc.inverse_transform before scoring.
lbl_enc = preprocessing.LabelEncoder()
y_train = lbl_enc.fit_transform(y_train.values)

In [None]:
# fit a perceptron with one hidden layer of 800 units on the training features
mlp = MLPClassifier(
    hidden_layer_sizes=(800,),
    activation='tanh',
    solver='adam',
    shuffle=True,
    early_stopping=True,
    random_state=RANDOM_STATE,
    verbose=True,
)
mlp.fit(X_train, y_train)

Iteration 1, loss = 1.08286788
Validation score: 0.831625
Iteration 2, loss = 0.27861275
Validation score: 0.845103
Iteration 3, loss = 0.10001385
Validation score: 0.748481
Iteration 4, loss = 0.05138546
Validation score: 0.743546
Iteration 5, loss = 0.03680021
Validation score: 0.738610
Iteration 6, loss = 0.03097394
Validation score: 0.733675
Iteration 7, loss = 0.02722776
Validation score: 0.731967
Iteration 8, loss = 0.02519911
Validation score: 0.730638
Iteration 9, loss = 0.02553247
Validation score: 0.733106
Iteration 10, loss = 0.02533378
Validation score: 0.734434
Iteration 11, loss = 0.02260676
Validation score: 0.731777
Iteration 12, loss = 0.02228501
Validation score: 0.723424
Iteration 13, loss = 0.02217279
Validation score: 0.730448
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(800,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [None]:

# Predict integer-encoded labels for the test features with the fitted MLP.
y_test_pred = mlp.predict(X_test)
# Map the integer predictions back to the original string labels so they can
# be compared with y_test, which was never encoded.
label_pred = lbl_enc.inverse_transform(y_test_pred)

In [None]:
# share of test tweets whose predicted label equals the true label
accur = accuracy_score(y_true=y_test, y_pred=label_pred)
print("accuracy of MLP model is: %.3f" % accur)

accuracy of MLP model is: 0.842


In [None]:
# Macro-averaged precision and recall over every label present in the test set.
labels = np.unique(y_test)
# zero_division=0 scores a label that was never predicted as 0 precision,
# which is the value the default setting already uses, but without emitting
# an UndefinedMetricWarning for each such label.
ave_precision = precision_score(
    y_test, label_pred, labels=labels, average='macro', zero_division=0
)
ave_recall = recall_score(
    y_test, label_pred, labels=labels, average='macro', zero_division=0
)
print("average macro (precision, recall) of MLP: ", (ave_precision, ave_recall))

average macro (precision, recall) of MLP:  (0.3799551750905021, 0.24026011207184375)


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Precision and recall of each label, as column vectors of shape (n_labels, 1).
# zero_division=0 keeps the scores of never-predicted labels at 0 (the value
# the default setting falls back to) while silencing the per-label warning.
precision = precision_score(
    y_test, label_pred, labels=labels, average=None, zero_division=0
).reshape(-1, 1)
recall = recall_score(
    y_test, label_pred, labels=labels, average=None, zero_division=0
).reshape(-1, 1)

# Put the two columns side by side: one row per label, one column per metric.
metrics = pd.DataFrame(
    np.concatenate([precision, recall], axis=1),
    columns=['precision', 'recall'],
    index=labels,
)

print('\nMetrics of Multi-layer perceptron model: \n', metrics)

  _warn_prf(average, modifier, msg_start, len(result))



Metrics of Multi-layer perceptron model: 
          precision    recall
ar        0.991416  0.873346
ar_LATN   0.000000  0.000000
az        0.000000  0.000000
bg        0.000000  0.000000
bs        0.000000  0.000000
ca        0.000000  0.000000
cs        0.000000  0.000000
da        0.000000  0.000000
de        0.972973  0.720000
el        1.000000  0.090909
en        0.957755  0.957755
es        0.947368  0.902439
eu        0.000000  0.000000
fa        1.000000  0.400000
fi        0.000000  0.000000
fr        0.894009  0.866071
he        1.000000  0.071429
hi        1.000000  0.250000
hi-Latn   0.000000  0.000000
hr        0.000000  0.000000
ht        0.000000  0.000000
id        0.924016  0.833537
it        0.964286  0.710526
ja        0.661708  0.969330
ja_LATN   0.000000  0.000000
jv        0.000000  0.000000
km        0.000000  0.000000
ko        0.942857  0.300000
ko_LATN   0.000000  0.000000
la        0.000000  0.000000
lv        0.000000  0.000000
mk        0.000000  0.000000