# Part 1: English -> emoji

## use 1-gram and 2-gram

In [2]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='word')

# Load the training data
train_data = []
with open("train/english_train.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         train_data.append(tknzr.tokenize(line.lower()))
        train_data.append(line.lower())
train_label = []
with open("train/english_train.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        train_label.append(int(line[:-1]))

# Load the testing data
test_data = []
with open("test/english_test.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         test_data.append(tknzr.tokenize(line.lower()))
        test_data.append(line.lower())
test_label = []
with open("test/english_test.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        test_label.append(int(line[:-1]))

## vectorize traning data

In [2]:
vectorizer.fit(train_data)
vectorizer.vocabulary_

{'little': 257379,
 'throwback': 445152,
 'with': 496477,
 'my': 297018,
 'favourite': 151915,
 'person': 340077,
 'water': 483948,
 'wall': 481147,
 'little throwback': 258135,
 'throwback with': 445240,
 'with my': 497666,
 'my favourite': 298080,
 'favourite person': 151939,
 'person water': 340207,
 'water wall': 484050,
 'glam': 179912,
 'on': 323533,
 'user': 467784,
 'yesterday': 505711,
 'for': 161572,
 'kcon': 238775,
 'makeup': 271308,
 'using': 473534,
 'in': 218978,
 'featherette': 152378,
 'glam on': 179928,
 'on user': 324740,
 'user yesterday': 473393,
 'yesterday for': 505776,
 'for kcon': 162535,
 'kcon makeup': 238776,
 'makeup using': 271404,
 'using user': 473567,
 'user in': 470206,
 'in featherette': 219712,
 'democracy': 120941,
 'plaza': 346557,
 'the': 429865,
 'wake': 480708,
 'of': 318234,
 'stunning': 413325,
 'outcome': 331282,
 'decision2016': 119568,
 'nbc': 303957,
 'news': 307628,
 'democracy plaza': 120947,
 'plaza in': 346575,
 'in the': 220985,
 'the

In [3]:
train_X = vectorizer.transform(train_data)
train_Y = train_label

train_X

<90000x513461 sparse matrix of type '<class 'numpy.int64'>'
	with 1735710 stored elements in Compressed Sparse Row format>

## SVM model

In [4]:
from sklearn.svm import LinearSVC
model = LinearSVC(dual=False, C=0.1, verbose=0)

model.fit(train_X, train_Y)

LinearSVC(C=0.1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [5]:
test_X = vectorizer.transform(test_data)
test_Y = test_label

y_pred = model.predict(test_X)
y_pred

array([ 2, 18,  2, ...,  3, 12,  1])

In [6]:
with open("output/gold_labels_english.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        gold = test_Y[i]
        out.write("{}\n".format(gold))
    out.write("\n")

with open("output/predicted_labels_english.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        pred = y_pred[i]
        out.write("{}\n".format(pred))
    out.write("\n")

# python scorer_semeval18.py gold_labels_file predicted_labels_file
import scorer_semeval18
scorer_semeval18.main("output/gold_labels_english.txt", "output/predicted_labels_english.txt")

Macro F-Score (official): 26.549
-----
Micro F-Score: 32.567
Precision: 32.567
Recall: 32.567


# Part 2: Spanish -> emoji

In [3]:
tknzr = TweetTokenizer()
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Load the training data
train_data = []
with open("train/spanish_train.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         train_data.append(tknzr.tokenize(line.lower()))
        train_data.append(line.lower())
train_label = []
with open("train/spanish_train.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        train_label.append(int(line[:-1]))

# Load the testing data
test_data = []
with open("test/spanish_test.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         test_data.append(tknzr.tokenize(line.lower()))
        test_data.append(line.lower())
test_label = []
with open("test/spanish_test.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        test_label.append(int(line[:-1]))

In [4]:
vectorizer.fit(train_data)
print(vectorizer.transform([train_data[0]]).toarray())

train_X = vectorizer.transform(train_data)
train_Y = train_label

train_X

[[0 0 0 ... 0 0 0]]


<19000x126417 sparse matrix of type '<class 'numpy.int64'>'
	with 336610 stored elements in Compressed Sparse Row format>

## multi-model prediction (use LR finally which has higher performance finally)

In [69]:
from sklearn.svm import LinearSVC
SVM_model = LinearSVC(dual=False, C=0.1, verbose=0)

from sklearn.neural_network import MLPClassifier
MLP_model = MLPClassifier(learning_rate_init=0.005, verbose=1)

from sklearn.linear_model import LogisticRegression
LogisticRegression_model = LogisticRegression(multi_class='multinomial', solver='saga')

from sklearn.ensemble import RandomForestClassifier
RandomForest_model = RandomForestClassifier()

In [81]:
def multi_model_train(train_X, train_Y):
    print('RandomForest_model')
    RandomForest_model.fit(train_X, train_Y)

    print('LogisticRegression_model')
    LogisticRegression_model.fit(train_X, train_Y)

    print('SVM_model')
    SVM_model.fit(train_X, train_Y)
    
    import numpy as np

    training_vec = []
    training_vec.append(SVM_model.predict(train_X))
    training_vec.append(RandomForest_model.predict(train_X))
    training_vec.append(LogisticRegression_model.predict(train_X))

    training_vec = np.array(training_vec)
    print(training_vec.shape)
    training_vec = np.rot90(training_vec)
    print(training_vec.shape)

    print('MLP_model')
    MLP_model.fit(training_vec, train_Y)

def multi_model_predict(X):
    training_vec = []
    training_vec.append(SVM_model.predict(X))
    training_vec.append(RandomForest_model.predict(X))
    training_vec.append(LogisticRegression_model.predict(X))

    X_vec = np.array(training_vec)
    print(X_vec.shape)
    X_vec = np.rot90(training_vec)
    print(X_vec.shape)
    print(X_vec[:10], test_Y[:10])

    return LogisticRegression_model.predict(X)

In [82]:
multi_model_train(train_X, train_Y)

RandomForest_model
LogisticRegression_model




SVM_model
(3, 19000)
(19000, 3)
MLP_model
Iteration 1, loss = 2.72769438
Iteration 2, loss = 2.65599988
Iteration 3, loss = 2.65252733
Iteration 4, loss = 2.64700458
Iteration 5, loss = 2.64652764
Iteration 6, loss = 2.64373563
Iteration 7, loss = 2.64464399
Iteration 8, loss = 2.64343260
Iteration 9, loss = 2.64146340
Iteration 10, loss = 2.64115478
Iteration 11, loss = 2.64100623
Iteration 12, loss = 2.64042587
Iteration 13, loss = 2.64004212
Iteration 14, loss = 2.64036430
Iteration 15, loss = 2.63909404
Iteration 16, loss = 2.63977563
Iteration 17, loss = 2.63919987
Iteration 18, loss = 2.63981908
Iteration 19, loss = 2.63793526
Iteration 20, loss = 2.63769332
Iteration 21, loss = 2.63838221
Iteration 22, loss = 2.63830372
Iteration 23, loss = 2.63886352
Iteration 24, loss = 2.63813942
Iteration 25, loss = 2.63782758
Iteration 26, loss = 2.63858911
Iteration 27, loss = 2.63778886
Iteration 28, loss = 2.63909443
Iteration 29, loss = 2.63763086
Iteration 30, loss = 2.63715409
Iterati

In [83]:
test_X = vectorizer.transform(test_data)
test_Y = test_label

y_pred = multi_model_predict(test_X)
y_pred

(3, 1000)
(1000, 3)
[[ 1  2  1]
 [ 1  0  1]
 [11 17 12]
 [ 1  0  0]
 [ 1  0  1]
 [15  0  0]
 [ 1  0  1]
 [ 0  0  0]
 [ 0  1  1]
 [ 1  2  1]] [2, 0, 3, 15, 3, 11, 18, 10, 0, 11]


array([ 0,  0,  0,  2,  2,  1,  0,  9, 14,  4, 15,  7,  0,  2,  2,  2,  0,
        1,  1,  0,  5,  2,  2,  8,  1,  1,  5,  4,  0,  0,  0,  2,  1,  0,
        1,  4,  2, 10,  2,  1,  0,  0,  2,  4,  2,  1,  1,  0,  7,  1,  7,
        2,  0,  0,  0,  0,  1,  0,  2,  0,  0,  0,  0,  0,  1,  0,  1,  2,
        1,  2,  2, 15,  0,  1,  2,  0,  1,  0,  2, 15,  0,  9,  2,  1,  1,
        0,  0,  1,  4,  0,  1,  0,  0,  0,  1,  0,  2,  1,  0,  0,  0,  9,
        0,  8,  1,  1,  0,  0,  1,  2,  2,  1,  1,  0,  1,  2,  0,  2,  0,
        0,  2,  4,  2,  7,  0,  0,  0,  2,  0,  2,  2,  2,  0,  3,  0,  1,
        0,  0,  2,  0,  2,  7,  2,  3,  0,  7,  2,  9,  2,  1,  0,  2,  2,
        1, 10,  0,  1,  6,  1,  0,  0,  2,  0,  1,  4,  0,  0, 11,  1,  1,
        0,  2,  0,  2,  0,  0,  1,  2,  1,  2,  2,  2,  0,  2, 10,  0,  0,
        5,  0,  0,  2,  5,  1,  0,  0,  6,  2,  0,  0,  1,  2,  1,  2,  1,
       14,  1,  2,  0,  0,  1,  2,  0,  0,  0,  5,  2,  0,  1,  0,  2, 17,
        6,  2,  4,  0,  3

In [84]:
with open("output/gold_labels_spanish.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        gold = test_Y[i]
        out.write("{}\n".format(gold))
    out.write("\n")

with open("output/predicted_labels_spanish.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        pred = y_pred[i]
        out.write("{}\n".format(pred))
    out.write("\n")

# python scorer_semeval18.py gold_labels_file predicted_labels_file
import scorer_semeval18
scorer_semeval18.main("output/gold_labels_spanish.txt", "output/predicted_labels_spanish.txt")

Macro F-Score (official): 18.467
-----
Micro F-Score: 32.567
Precision: 32.567
Recall: 32.567


# Part 3: multilingual transfer learning

In [85]:
from translate import Translator
translator= Translator(to_lang="es")

In [86]:
translator.translate("hi")

'hola'

## translate English data into Spanish

In [87]:
en_hashmap = {}
with open("mapping/english_mapping.txt", 'r', encoding='utf8') as f:
    for line in f.readlines():
        line = line.split()
        en_hashmap[line[0]] = line[1]

es_hashmap = {}
with open("mapping/spanish_mapping.txt", 'r', encoding='utf8') as f:
    for line in f.readlines():
        line = line.split()
        es_hashmap[line[1]] = line[0]

en2es_hashmap = {}
for key in en_hashmap:
    if en_hashmap[key] in es_hashmap:
        en2es_hashmap[key] = es_hashmap[en_hashmap[key]]
en2es_hashmap

{'0': '0',
 '1': '1',
 '2': '2',
 '3': '3',
 '5': '4',
 '6': '10',
 '7': '15',
 '8': '11',
 '9': '5',
 '13': '12',
 '14': '7',
 '16': '18',
 '19': '13'}

In [88]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word')

# Load the training data
train_data = []
with open("train/spanish_train.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         train_data.append(tknzr.tokenize(line.lower()))
        train_data.append(line.lower())
train_label = []
with open("train/spanish_train.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        train_label.append(int(line[:-1]))

# load extra data
extra_label = []
missing_pos = []
with open("train/english_train.labels", 'r', encoding='utf8') as f:
    for index, line in enumerate(f.readlines()[:-10000]):
        if line[:-1] in en2es_hashmap:
            extra_label.append(int(line[:-1]))
        else:
            missing_pos.append(index)

extra_data = []
with open("train/extra_data.txt", 'r', encoding='utf8') as f:
    for index, line in enumerate(f.readlines()):
        if index not in missing_pos:
            extra_data.append(line.lower())

train_data += extra_data
train_label += extra_label

# Load the testing data
test_data = []
with open("test/spanish_test.text", 'r', encoding='utf8') as f:
    for line in f.readlines():
#         test_data.append(tknzr.tokenize(line.lower()))
        test_data.append(line.lower())
test_label = []
with open("test/spanish_test.labels", 'r', encoding='utf8') as f:
    for line in f.readlines():
        test_label.append(int(line[:-1]))

In [89]:
vectorizer.fit(train_data)
print(vectorizer.transform([train_data[0]]).toarray())

train_X = vectorizer.transform(train_data)
train_Y = train_label

train_X

[[0 0 0 ... 0 0 0]]


<79941x399707 sparse matrix of type '<class 'numpy.int64'>'
	with 736136 stored elements in Compressed Sparse Row format>

## LR model

In [90]:
from sklearn.linear_model import LogisticRegression
LogisticRegression_model = LogisticRegression(multi_class='multinomial', solver='saga')

LogisticRegression_model.fit(train_X, train_Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [91]:
test_X = vectorizer.transform(test_data)
test_Y = test_label

y_pred = LogisticRegression_model.predict(test_X)
y_pred

array([ 0,  0,  0,  2,  2,  0,  0,  9, 14,  4,  0,  2,  0,  2,  2,  2,  0,
        0,  1,  4,  5,  0,  0,  0,  0,  0,  5,  1,  0,  0,  0,  1,  1,  1,
        1,  4,  0,  1,  0,  1,  0,  0,  2,  1,  2,  2,  0,  0,  7,  0,  2,
        2,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  1,  1,  0,  0,  0,
        1,  0,  1,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,  9,  2,  1,  1,
        0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  2,  1,  0,  0,  0,  9,
        0,  0,  1,  1,  0,  5,  0,  2,  2,  1,  1,  0,  1,  0,  0,  0,  0,
        0,  2,  0,  2,  0,  0,  0,  2,  2,  0,  2,  2,  2,  0,  3,  0,  1,
        0,  1,  2,  0,  1,  3,  2,  0,  0,  0,  2,  0,  1,  0,  1,  0,  2,
        0,  0,  0,  1,  0,  0,  0,  0,  2,  0,  1,  3,  0,  0,  0,  1,  0,
        0,  0,  0,  2,  0,  0,  0,  2,  1,  0,  2,  2,  0,  0, 10,  0,  0,
        5,  0,  0,  2,  5,  1,  0,  0,  0,  1,  0,  0,  0,  2,  0,  2,  2,
       14,  1,  0,  0,  0,  0,  0,  0,  4,  0,  5,  2,  0,  1,  0,  0,  0,
        1,  2,  0,  0,  0

In [92]:
with open("output/gold_labels_spanish_plus.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        gold = test_Y[i]
        out.write("{}\n".format(gold))
    out.write("\n")

with open("output/predicted_labels_spanish_plus.txt", "w", encoding='utf8') as out:
    for i, sent in enumerate(test_data): 
        pred = y_pred[i]
        out.write("{}\n".format(pred))
    out.write("\n")

# python scorer_semeval18.py gold_labels_file predicted_labels_file
import scorer_semeval18
scorer_semeval18.main("output/gold_labels_spanish_plus.txt", "output/predicted_labels_spanish_plus.txt")

Macro F-Score (official): 14.888
-----
Micro F-Score: 27.672
Precision: 27.672
Recall: 27.672
