In [None]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload needed files (if any)
from google.colab import files
uploaded = files.upload()

In [None]:
# Installing needed packages.
!pip install transformers
!pip install flair
#!pip install nlpaug
#!pip install googletrans
!pip install twokenize
import nltk
nltk.download('punkt')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.neural_network import MLPClassifier
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Model,Sequential, load_model
from keras.layers import (Bidirectional, Concatenate, Conv1D, Dense,BatchNormalization,
                          Dropout, Embedding, GlobalMaxPooling1D, Input,
                          LSTM, TimeDistributed, Activation, Flatten, Lambda)
from keras.callbacks import EarlyStopping
import keras
import json

import numpy as np
import pandas as pd
#import nlpaug.augmenter.word as naw
from nltk.tokenize import word_tokenize
import io
from tqdm import tqdm
import twokenize
from gensim.models.wrappers import FastText
import gensim
#from googletrans import Translator
import tensorflow as tf
import flair
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
np.random.seed(1)

In [None]:
# Locations of the embedding files and tweet dumps on the mounted Drive.
root = '/content/drive/My Drive/fake-news/'
word2vec_path = f'{root}wiki-news-300d-1M.vec'
new_word2vec_path = f'{root}GoogleNews-vectors-negative300.bin'
corona_path = f'{root}5g_corona_conspiracy.json'
non_path = f'{root}non_conspiracy.json'
other_path = f'{root}other_conspiracy.json'

In [None]:
# Load the text-format vector file at `word2vec_path` into a gensim KeyedVectors object.
# NOTE(review): `embedding_dict` is not referenced by any later cell visible here, and
# `load_vectors` below reads the same file again - confirm this load is still needed.
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

In [None]:
# load word2vec
def load_vectors(fname, thres=100000):
    """Read the first `thres` word vectors from a text-format embedding file.

    The first line of the file must hold two integers; it is parsed and
    discarded. Every following line is ``token v1 v2 ...`` separated by
    single spaces.

    Args:
        fname: path of the text vector file.
        thres: maximum number of vector lines to read.

    Returns:
        dict mapping each token to its vector as a list of floats.
    """
    data = {}
    # `with` closes the handle even when parsing fails part-way through.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are not needed; parsing them still fails fast on a
        # file that has no header line.
        _n_words, _dim = map(int, fin.readline().split())
        for i, line in enumerate(tqdm(fin), start=1):
            if i > thres:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
word2vec = load_vectors(word2vec_path, thres=140000)
d = dict(word2vec)

In [None]:
# load data
import json
import numpy as np


def _read_json(path):
    """Parse the JSON file at `path` and close the handle afterwards."""
    with open(path) as fh:
        return json.load(fh)


corona_data = _read_json(corona_path)
non_data = _read_json(non_path)
other_data = _read_json(other_path)
# Keep only each record's 'full_text' field, in file order.
corona_texts = np.array([tweet['full_text'] for tweet in corona_data])
non_texts = np.array([tweet['full_text'] for tweet in non_data])
other_texts = np.array([tweet['full_text'] for tweet in other_data])

In [None]:
corona_texts[30:50]

In [None]:
#Augment by inserting similar words
def aug_insert(texts, n=1):
    """Return `texts` followed by contextual-insertion variants of each text.

    NOTE(review): `naw` is only imported in a commented-out line of the
    import cell; it must be imported before this function is called.

    Args:
        texts: list of strings; it is copied, not modified.
        n: number of augmented variants requested per text.

    Returns:
        A new list: the original texts, then the augmented ones.
    """
    aug_data = texts.copy()
    # model could be roberta, bert and distilbert
    aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="insert")
    for text in tqdm(texts):
        augmented_texts = aug.augment(text, n=n)
        # The augmenter may hand back a bare string instead of a list;
        # extending with a string would append it one character at a time.
        if isinstance(augmented_texts, str):
            augmented_texts = [augmented_texts]
        aug_data.extend(augmented_texts)
    return aug_data

In [None]:
# Augment by replacing similar words
def aug_sub(texts, n=1):
    """Return `texts` followed by contextual-substitution variants of each text.

    NOTE(review): `naw` is only imported in a commented-out line of the
    import cell; it must be imported before this function is called.

    Args:
        texts: list of strings; it is copied, not modified.
        n: number of augmented variants requested per text.

    Returns:
        A new list: the original texts, then the augmented ones.
    """
    aug_data = texts.copy()
    # model could be roberta, bert and distilbert
    aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
    for text in tqdm(texts):
        augmented_texts = aug.augment(text, n=n)
        # The augmenter may hand back a bare string instead of a list;
        # extending with a string would append it one character at a time.
        if isinstance(augmented_texts, str):
            augmented_texts = [augmented_texts]
        aug_data.extend(augmented_texts)
    return aug_data

In [None]:
# Augment with nlpaug's RandomWordAug (random word-level edits).
# NOTE(review): presumably word deletion, the augmenter's default action - confirm.
def aug_rm(texts, n=1):
    """Return `texts` followed by RandomWordAug variants of each text.

    NOTE(review): `naw` is only imported in a commented-out line of the
    import cell; it must be imported before this function is called.

    Args:
        texts: list of strings; it is copied, not modified.
        n: number of augmented variants requested per text.

    Returns:
        A new list: the original texts, then the augmented ones.
    """
    aug_data = texts.copy()
    # aug_p is percentage of words to be augmented
    aug = naw.RandomWordAug(aug_p=0.3)
    for text in tqdm(texts):
        augmented_texts = aug.augment(text, n=n)
        # The augmenter may hand back a bare string instead of a list;
        # extending with a string would append it one character at a time.
        if isinstance(augmented_texts, str):
            augmented_texts = [augmented_texts]
        aug_data.extend(augmented_texts)
    return aug_data

In [None]:
# augment by translation and back translation
# NOTE(review): a later cell defines `aug_translate` again; once that cell has
# run, this nlpaug-based version is shadowed.
def aug_translate(texts, n=1):
    """Return `texts` followed by en->de->en back-translations of each text.

    NOTE(review): `naw` is only imported in a commented-out line of the
    import cell; it must be imported before this function is called.

    Args:
        texts: list of strings; it is copied, not modified.
        n: number of augmented variants requested per text.

    Returns:
        A new list: the original texts, then the augmented ones.
    """
    aug_data = texts.copy()
    aug = naw.BackTranslationAug(
        from_model_name='transformer.wmt19.en-de',
        to_model_name='transformer.wmt19.de-en'
    )
    for text in tqdm(texts):
        augmented_texts = aug.augment(text, n=n)
        # The augmenter may hand back a bare string instead of a list;
        # extending with a string would append it one character at a time.
        if isinstance(augmented_texts, str):
            augmented_texts = [augmented_texts]
        aug_data.extend(augmented_texts)
    return aug_data

In [None]:
def aug_translate(texts, n=1):
    """Back-translate every text through the first `n` pivot languages.

    Each text is translated to a pivot language and back to English with
    `Translator`; the round-tripped text is appended after the originals.

    Args:
        texts: list of strings; it is copied, not modified.
        n: how many of the five pivot languages to use per text.

    Returns:
        A new list: the original texts, then the back-translated ones.
    """
    aug_data = texts.copy()
    pivots = ['zh-cn', 'de', 'es', 'vi', 'fr'][:n]
    translator = Translator()
    for original in tqdm(texts):
        for pivot in pivots:
            forward = translator.translate(original, dest=pivot)
            round_trip = translator.translate(forward.text, src=pivot, dest='en')
            aug_data.append(round_trip.text)
    return aug_data

In [None]:
# convert to 2 class
# Fold the non-conspiracy texts into `other_texts`, leaving two groups
# (corona_texts vs. everything else).
# NOTE(review): `other_texts` is rebuilt from itself, so re-running this cell
# prepends `non_texts` again; a later cell that builds `all_texts` also
# concatenates `non_texts` separately - confirm the intended execution order.
other_texts = np.concatenate([non_texts, other_texts])

In [None]:
# split train/val and execute Augmentation
train_corona_texts, val_corona_texts = train_test_split(corona_texts, test_size=200, random_state=0)
train_other_texts, val_other_texts = train_test_split(other_texts, test_size=200, random_state=0)
#aug_n = len(train_other_texts) // len(train_corona_texts) - 1
#train_corona_texts = aug_translate(train_corona_texts.tolist(), n=4)
#train_other_texts = train_other_texts[-900:]
#train_corona_texts = np.concatenate([train_corona_texts,train_corona_texts,train_corona_texts,train_corona_texts])

In [None]:
# Save aug data
with open(root+'train_corona_aug_translate.json', 'w') as f:
    # json cannot serialise a NumPy array (which is what train_test_split
    # returns for array input); .tolist() yields plain Python strings and
    # leaves an ordinary list of strings unchanged.
    json.dump(np.asarray(train_corona_texts).tolist(), f)

In [None]:
# Load aug data
with open(root+'train_corona_aug_translate.json') as f:
    train_corona_texts = json.load(f)

In [None]:
# Rebuild the corona validation set: every corona text that does not appear
# verbatim in the training list. A set lookup replaces the nested scan over
# all training texts.
train_corona_seen = set(train_corona_texts)
val_corona_texts = np.array([t for t in corona_texts if t not in train_corona_seen])
len(val_corona_texts)

In [None]:
def preprocess(sen, tokenize):
    """Tokenise `sen`, then lower-case and lemmatise each token.

    Args:
        sen: the raw text.
        tokenize: callable that turns the text into an iterable of tokens.

    Returns:
        List of lemmatised, lower-cased tokens in their original order.
    """
    raw_tokens = tokenize(sen)
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(tok.lower()) for tok in raw_tokens]

In [None]:
# Tokenize
tokenize = twokenize.tokenizeRawTweetText #word_tokenize 
train_tokens = [tokenize(sen) for sen in np.concatenate([train_corona_texts,train_other_texts])]
train_labels = np.concatenate([[0]*len(train_corona_texts),[1]*len(train_other_texts)])
val_tokens = [tokenize(sen) for sen in np.concatenate([val_corona_texts,val_other_texts])]
val_labels = np.concatenate([[0]*len(val_corona_texts),[1]*len(val_other_texts)])

In [None]:
# 3 class
train_corona_texts, val_corona_texts = train_test_split(corona_texts, test_size=100, random_state=0)
#train_corona_texts = aug_translate(train_corona_texts.tolist(), n=2)
#train_corona_texts = json.load(open(root+'train_corona_translate_s0_n2.json'))
train_non_texts, val_non_texts = train_test_split(non_texts, test_size=100, random_state=0)
train_non_texts = train_non_texts[:3000]
train_other_texts, val_other_texts = train_test_split(other_texts, test_size=100, random_state=0)
#train_other_texts = aug_translate(train_other_texts.tolist(), n=4)
train_other_texts = json.load(open(root+'train_other_translate_s0_n4.json'))
train_tokens = [word_tokenize(sen) for sen in np.concatenate([train_corona_texts,train_non_texts,train_other_texts])]
val_tokens = [word_tokenize(sen) for sen in np.concatenate([val_corona_texts,val_non_texts,val_other_texts])]
#train_tokens = [twokenize.tokenizeRawTweetText(sen) for sen in np.concatenate([train_corona_texts,train_non_texts,train_other_texts])]
#val_tokens = [twokenize.tokenizeRawTweetText(sen) for sen in np.concatenate([val_corona_texts,val_non_texts,val_other_texts])]
#train_labels = np.concatenate([[0]*len(train_corona_texts),[1]*len(train_non_texts),[2]*len(train_other_texts)])
#val_labels = np.concatenate([[0]*len(val_corona_texts),[1]*len(val_non_texts),[2]*len(val_other_texts)])

# with open(root+'train_corona_translate_s0_n2.json', 'w') as f:
#     json.dump(train_corona_texts, f)
# with open(root+'train_other_translate_s0_n4.json', 'w') as f:
#     json.dump(train_other_texts, f)

In [None]:
# Three-class setup: label 0 = corona_texts, 1 = other_texts, 2 = non_texts.
# NOTE(review): if the "convert to 2 class" cell has already run, `other_texts`
# already contains `non_texts`, so those texts would appear under both label 1
# and label 2 - confirm.
all_texts = np.concatenate([corona_texts, other_texts, non_texts])
all_labels = np.concatenate([[0]*len(corona_texts), [1]*len(other_texts), [2]*len(non_texts)])
# 80/20 split with a fixed seed; texts and labels are split in one call so they stay aligned.
train_texts, test_texts, train_ys, test_ys = train_test_split(all_texts, all_labels, test_size=0.2, random_state=0)
tokenize = twokenize.tokenizeRawTweetText
# Tokenise, lower-case and lemmatise every text (see `preprocess`).
train_tokens = [preprocess(sen, tokenize) for sen in train_texts]
val_tokens = [preprocess(sen, tokenize) for sen in test_texts]

In [None]:
# Clean dictionary and save space
# Keep only the vectors of tokens that actually occur in the data. A few
# spellings that have no vector of their own borrow the vector of a related word.
FALLBACK_WORDS = {
    'wuhan': 'Wuhan',
    'covid': 'coronavirus',
    'covid19': 'coronavirus',
    'covid-19': 'coronavirus',
    'covid_19': 'coronavirus',
}
news_dict = {}
for c in [train_tokens, val_tokens]:
    for s in c:
        for w in s:
            w = w.lower()
            if w in d:
                news_dict[w] = d[w]
            # Only borrow a vector when the substitute word was itself loaded;
            # `d` holds a truncated vocabulary, so it may be missing.
            elif FALLBACK_WORDS.get(w) in d:
                news_dict[w] = d[FALLBACK_WORDS[w]]

In [None]:
# root = '/content/drive/My Drive/fake-news/'
# with open(root+'tokens.json') as f:
#     tokens = json.load(f)
# with open(root+'y_data.json') as f:
#     y = json.load(f)
# with open(root+'news_dict.json') as f:
#     news_dict = json.load(f)

In [None]:
test_texts[inds[0][2]]

In [None]:
np.array(val_tokens)[inds][:5]

In [None]:
# prepare for training
#y = np_utils.to_categorical(np.array(y))
# Integer class labels for the train / held-out splits.
# NOTE(review): `train_ys` / `test_ys` are also assigned by the later
# embedding-split cell; which values are seen here depends on execution order.
train_labels = train_ys
val_labels = test_ys
# One-hot encode the labels (the Keras models are compiled with categorical_crossentropy).
train_y = np_utils.to_categorical(train_labels)
val_y = np_utils.to_categorical(val_labels)

In [None]:
# Embedding matrix: one 300-d row per known word, numbered from 1.
# Row 0 is left all-zero.
n_symbols = len(news_dict) + 1  # amount of words, plus the all-zero row 0
embedding_weights = np.zeros((n_symbols, 300))
index_dict = {}
for index, (word, vector) in enumerate(news_dict.items(), start=1):
    index_dict[word] = index
    embedding_weights[index, :] = vector


def tokens_to_indices(token_lists, vocab):
    """Encode each token list as the vocabulary indices of its usable tokens.

    A token is kept only if it is purely alphabetic (this skips punctuation)
    and present in `vocab`; everything else is dropped.

    Args:
        token_lists: iterable of token lists.
        vocab: dict mapping token -> integer index.

    Returns:
        List of lists of integer indices, one per input token list.
    """
    return [[vocab[w] for w in words if w.isalpha() and w in vocab]
            for words in token_lists]


train_x = tokens_to_indices(train_tokens, index_dict)
val_x = tokens_to_indices(val_tokens, index_dict)

# The longest encoded sentence across both splits sets the padded width.
# Plain lists are used because the token lists are ragged, and recent NumPy
# refuses to build arrays from ragged nested sequences.
all_tokens = list(train_tokens) + list(val_tokens)
lens = [len(sen) for sen in train_x + val_x]
maxlen = np.amax(lens)
print(all_tokens[np.argmax(lens)])

train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
val_x = sequence.pad_sequences(val_x, maxlen=maxlen)

print(train_x.shape, val_x.shape)


In [None]:
from sklearn.utils import shuffle
# Shuffle the padded inputs and their one-hot targets together with a fixed seed.
# NOTE(review): `train_labels`, `train_tokens` and `train_texts` are not shuffled
# here, so after this cell row i of `train_x` no longer corresponds to
# `train_labels[i]`. Later cells that pair them (e.g. selecting rows of
# `train_x` by `train_labels`) should be checked. Re-running the cell shuffles again.
train_x, train_y = shuffle(train_x, train_y, random_state=0)

In [None]:
# def split(x, y):
#     train_x = np.concatenate([x[:1000], x[-1300:-100]])
#     train_y = np.concatenate([y[:1000], y[-1300:-100]])
#     test_x = np.concatenate([x[1000:1150], x[2300:2400], x[-100:]])
#     test_y = np.concatenate([y[1000:1150], y[2300:2400], y[-100:]])
#     return train_x, test_x, train_y,train_y
# train_x, test_x, train_y, test_y = split(x, y)

In [None]:

class BaseBiLSTM(object):
    """Embedding -> bidirectional LSTM -> dense softmax classifier.

    The network takes padded sequences of word indices of length
    `max_sentence_length` and outputs `n_labels` class probabilities.

    Args:
        vocabulary_size: number of rows of the embedding matrix.
        max_sentence_length: length of every padded input sequence.
        labels: the class labels (stored, not otherwise used by the class).
        embedding_weights: initial weights for the embedding layer.
        embedding_size: width of each embedding vector.
    """

    def __init__(self, vocabulary_size, max_sentence_length, labels,
                 embedding_weights, embedding_size=100):
        self.model = None
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.max_sentence_length = max_sentence_length
        self.embedding_weights = embedding_weights
        self.labels = labels
        self.n_labels = 3

    def add_input_layer(self):
        """Input tensor: one padded index sequence per sample."""
        return Input(shape=(self.max_sentence_length, ))

    def add_embedding_layer(self, layers):
        """Embedding lookup initialised from `embedding_weights`."""
        layers = Embedding(
            input_dim=self.vocabulary_size,
            output_dim=self.embedding_size,
            weights = [self.embedding_weights],
            input_length = self.max_sentence_length)(layers)
        return layers

    def add_recurrent_layer(self, layers):
        """Bidirectional LSTM that returns the full output sequence."""
        layers = Bidirectional(
            LSTM(units=256, return_sequences=True,
                 recurrent_dropout=0.3))(layers)
        return layers

    def add_output_layer(self, layers):
        """Softmax layer with one unit per class."""
        layers = Dense(self.n_labels, activation='softmax')(layers)
        return layers

    def build(self, learning_rate=None):
        """Assemble and compile the network into `self.model`.

        Args:
            learning_rate: optional Adam learning rate. With the default
                (None) the model is compiled with the stock 'adam'
                optimizer, exactly as before.
        """
        inputs = self.add_input_layer()
        layers = self.add_embedding_layer(inputs)
        layers = Dropout(0.3)(layers)
        layers = self.add_recurrent_layer(layers)
        layers = Dropout(0.3)(layers)
        layers = Dense(64, activation='relu')(layers)
        layers = Flatten()(layers)
        layers = Dense(64, activation='relu')(layers)
        layers = Dropout(0.3)(layers)
        layers = Dense(32, activation='relu')(layers)
        outputs = self.add_output_layer(layers)

        self.model = Model(inputs=inputs, outputs=outputs)
        # Previously an Adam(learning_rate=0.0005) object was created here but
        # never passed to compile(); the rate is now an explicit opt-in.
        if learning_rate is None:
            optimizer = 'adam'
        else:
            optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                           metrics=['accuracy'])

    def fit(self, X_train, y_train, epochs, batch_size=128, validation_split=0.2):
        """Train the model (building it first if needed) and return the Keras fit result.

        Uses early stopping on 'val_accuracy' (patience 5) and fixed
        per-class loss weights for classes 0, 1 and 2.
        """
        if self.model is None:
            self.build()

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5,verbose = 2)
        return self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                              validation_split=validation_split, callbacks=[early_stopping], shuffle=True, class_weight={0: 0.4, 1: 0.5, 2:0.1})

    def predict(self, X_test):
        """Return the index of the most probable class for every sample."""
        return np.argmax(self.model.predict(X_test), axis=-1)

    def evaluate(self, X_test, y_test, cm=False):
        """Print a classification report; optionally plot the confusion matrix.

        Args:
            X_test: padded index sequences.
            y_test: one-hot encoded true labels.
            cm: when True, also draw the confusion matrix.
        """
        predictions = self.predict(X_test).flatten()
        true_labels = np.argmax(y_test, axis=-1).flatten()
        print(classification_report(true_labels, predictions))
        if cm:
            # Imported locally: these names are not available at module level
            # at this point of the notebook.
            from sklearn.metrics import confusion_matrix
            import matplotlib.pyplot as plt
            # One row/column per class of this model.
            matrix = confusion_matrix(true_labels, predictions,
                                      labels=list(range(self.n_labels)))
            plt.matshow(matrix)
            plt.colorbar()


senti_label = np.array([0,1,2])
model = BaseBiLSTM(
    vocabulary_size=len(index_dict) + 1, max_sentence_length=maxlen, 
    embedding_weights = embedding_weights, labels=senti_label, embedding_size=300)
# model = BaseBiLSTM(
#     vocabulary_size=0, max_sentence_length=maxlen, 
#     embedding_weights = 0, labels=senti_label, embedding_size=0)
model.build()
model.model.summary()


model.fit(X_train=train_x, y_train=train_y, epochs=1)

#model.evaluate(test_x, test_y)

In [None]:
#for i in range(2):
model.fit(X_train=train_x, y_train=train_y, epochs=1)
print(matthews_corrcoef(model.predict(val_x), val_labels))

In [None]:
model.evaluate(val_x, val_y)
import sklearn
sklearn.metrics.roc_auc_score(y_true=val_y, y_score=model.model.predict(val_x), multi_class='ovo')

In [None]:
matthews_corrcoef(val_labels, model.predict(val_x))

In [None]:
model.model.save(root+'best_bilstm_03')

In [None]:
roberta_probs = pd.read_csv(root+'roberta_prob.csv')
roberta_probs.to_numpy()

In [None]:
train_probs = np.concatenate([clf.predict_proba(train_bert), test_model.predict(train_x)], axis=-1) #(#data, 2+2+1)
from sklearn.tree import DecisionTreeClassifier
#ensemble = DecisionTreeClassifier(random_state=0).fit(train_probs, train_labels)
ensemble = MLPClassifier(hidden_layer_sizes=(5,), alpha=0.0005, random_state=0).fit(train_probs, train_labels)

In [None]:
val_probs = np.concatenate([clf.predict_proba(test_bert), test_model.predict(val_x)], axis=-1)
ensemble.score(val_probs, val_labels), matthews_corrcoef(ensemble.predict(val_probs), val_labels)

In [None]:
matthews_corrcoef(np.argmax(test_model.predict(val_x), axis=-1), val_labels)

In [None]:
# Indices of the validation samples the model misclassifies. The predictions
# are computed once and reused instead of running the model a second time.
preds = model.predict(val_x)
inds_lstm = np.where(preds != val_labels)

In [None]:
for i in inds_lstm[0][:10]:
  print(val_labels[i], test_texts[i])

In [None]:
from keras import backend as K

inp = test_model.input                                           # input placeholder
outputs = [layer.output for layer in test_model.layers]          # all layer outputs
functors = [K.function([inp], [out]) for out in outputs]    # evaluation functions

# Testing
test = np.array([val_x[0]])
layer_outs = [func([test]) for func in functors]

In [None]:
train_lstm_emb = []
for s in train_x:
  test = np.array([s])
  layer_outs = [func([test]) for func in functors]
  emb = layer_outs[-2][0][0]
  train_lstm_emb.append(emb)

In [None]:
layer_outs = [func([train_x]) for func in functors]
train_lstm_emb = layer_outs[-2][0]

In [None]:
layer_outs = [func([val_x]) for func in functors]
val_lstm_emb = layer_outs[-2][0]

In [None]:
print(train_lstm_emb.shape, len(train_labels))
train_both_emb = np.concatenate([train_lstm_emb, train_bert], axis=-1)
val_both_emb = np.concatenate([val_lstm_emb, test_bert], axis=-1)
temp =MLPClassifier(hidden_layer_sizes=(300,), alpha=0.0005, random_state=0, max_iter=100).fit(train_both_emb, train_labels)
temp.score(val_both_emb, val_labels), matthews_corrcoef(temp.predict(val_both_emb), val_labels)

In [None]:
train_probs = np.concatenate([clf.predict_proba(train_bert), model.model.predict(train_x)], axis=-1) #(#data, 2+2+1)
from sklearn.tree import DecisionTreeClassifier
#ensemble = DecisionTreeClassifier(random_state=0).fit(train_probs, train_labels)
ensemble = MLPClassifier(hidden_layer_sizes=(5,), alpha=0.0001, random_state=0).fit(train_probs, train_labels)

In [None]:
val_probs = np.concatenate([clf.predict_proba(test_bert), model.model.predict(val_x)], axis=-1)
ensemble.score(val_probs, val_labels), matthews_corrcoef(ensemble.predict(val_probs), val_labels)

In [None]:
model = BaseBiLSTM(
    vocabulary_size=len(index_dict) + 1, max_sentence_length=maxlen, 
    embedding_weights = embedding_weights, labels=senti_label, embedding_size=300)
# model = BaseBiLSTM(
#     vocabulary_size=0, max_sentence_length=maxlen, 
#     embedding_weights = 0, labels=senti_label, embedding_size=0)
model.build()

In [None]:
from tensorflow import keras
test_model = keras.models.load_model(root+'best_bilstm_03')

In [None]:
# Keep a second reference to the current `model` object so it stays reachable
# after `model` is rebound by later cells.
# NOTE(review): this aliases whatever `model` is when the cell runs. In
# top-to-bottom order that is the BaseBiLSTM re-created and built (but not
# fitted) two cells earlier - confirm that a trained model is intended here.
good_model = model

In [None]:
import pandas as pd

# Profile columns removed from every CSV before modelling.
DROPPED_COLUMNS = [
    'user/location',
    'user/created_at_month',
    'user/profile_background_tile',
    'user/profile_use_background_image',
    'user/has_extended_profile',
    'user/default_profile',
    'user/default_profile_image',
]


def _load_user_features(filename):
    """Read `root + filename` as CSV and drop the unused profile columns."""
    return pd.read_csv(root + filename).drop(DROPPED_COLUMNS, axis=1)


c_csv = _load_user_features("corona_data.csv")
o_csv = _load_user_features("other_conspiracy_data.csv")
n_csv = _load_user_features("non_conspiracy_data.csv")
c_csv['class'] = 0
n_csv['class'] = 1
# o_csv_three = o_csv.copy()
# o_csv_three['class'] = 2
o_csv['class'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order and likewise keeps each frame's own index.
all_csv = pd.concat([c_csv, n_csv, o_csv])
#all_three_csv = c_csv.append(n_csv).append(o_csv_three)
#cn_csv = c_csv.append(n_csv)
# Labels by position: 0 for the corona rows, 1 for all but the last 700 of the
# remaining rows, 2 for the last 700.
# NOTE(review): 700 is presumably the size of the group stored last - confirm.
ys = np.concatenate([np.zeros(len(corona_texts)), np.ones(len(other_texts)-700), np.ones(700)*2])

In [None]:
data = all_csv.to_numpy()[:,:-1]
train_csv, test_csv, train_csv_y, test_csv_y = train_test_split(data, ys, test_size=0.2, random_state=0)

In [None]:
count = [0,0,0]
new_x, new_y = [], []
threshold = 600
for i, x in enumerate(train_csv):
  if count[int(train_csv_y[i])] < threshold:
    count[int(train_csv_y[i])] += 1
    new_x.append(x)
    new_y.append(train_csv_y[i])

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the subsampled training rows only, then apply the same
# transform to both the training rows and the held-out rows.
scaler = StandardScaler().fit(new_x)
new_x = scaler.transform(new_x)
# NOTE(review): `new_x` and `test_csv` are overwritten with their scaled
# versions, so running this cell twice scales already-scaled data.
test_csv = scaler.transform(test_csv)

In [None]:
# MLP on the scaled user-metadata features.
# NOTE(review): this rebinds `clf`, a name other cells also use for the
# embedding classifier - results there depend on execution order.
clf = MLPClassifier(hidden_layer_sizes=(30,30,30), learning_rate_init=0.001, random_state=0, max_iter=500).fit(new_x, new_y)
# Score against the labels that came out of the same split as `test_csv`.
clf.score(test_csv, test_csv_y)

In [None]:
matthews_corrcoef(clf.predict(test_csv), test_csv_y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same scaled user-metadata features.
clf = RandomForestClassifier(n_estimators=40, max_depth=5, random_state=0)
clf.fit(new_x, new_y)
# Accuracy and MCC against the labels from the same split as `test_csv`.
clf.score(test_csv, test_csv_y), matthews_corrcoef(test_csv_y, clf.predict(test_csv))

In [None]:
def get_hashtag(tokens):
    """Return the tokens that start with '#', in their original order.

    Empty tokens are skipped instead of raising IndexError.

    Args:
        tokens: iterable of string tokens.

    Returns:
        List of the hashtag tokens.
    """
    return [w for w in tokens if w.startswith('#')]
tags_data = [get_hashtag(sen) for sen in train_tokens]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', tokenizer=lambda _: _, preprocessor=lambda _: _, token_pattern=None, ngram_range=(1,1), min_df = 1, stop_words = 'english')
X = np.asarray(tfidf.fit_transform(tags_data).todense())

In [None]:
train_tags, val_tags, train_tags_y, val_tags_y = train_test_split(X, train_labels, test_size=0.2, random_state=0)
tags_clf = MLPClassifier(hidden_layer_sizes=(10,), learning_rate_init=0.001, random_state=0, max_iter=50).fit(train_tags, train_tags_y)
tags_clf.score(val_tags, val_tags_y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
tags_clf = RandomForestClassifier(n_estimators=20, max_depth=3, random_state=0, class_weight={0:0.8,1:0.2})
tags_clf.fit(train_tags, train_tags_y, )
tags_clf.score(val_tags, val_tags_y)

In [None]:
matthews_corrcoef(tags_clf.predict(val_tags), val_tags_y)

In [None]:
# Select the padded index sequences whose integer label is 0 or 1; rows with
# any other label end up in neither list.
# NOTE(review): `train_x` is shuffled in an earlier cell without `train_labels`;
# confirm that row i of `train_x` still matches `train_labels[i]` here.
train_corona_tokens = [train_x[i] for i in range(len(train_tokens)) if train_labels[i] == 0]
train_other_tokens = [train_x[i] for i in range(len(train_tokens)) if train_labels[i] == 1]
val_corona_tokens = [val_x[i] for i in range(len(val_tokens)) if val_labels[i] == 0]
val_other_tokens = [val_x[i] for i in range(len(val_tokens)) if val_labels[i] == 1]

In [None]:
np.random.seed(0)
ref_size = 30
# The last `ref_size` label-1 training sequences become the shared reference
# pool and are removed from the training rows.
# NOTE(review): `train_other_tokens` is truncated in place, so re-running this
# cell removes another `ref_size` rows.
ref_tokens = train_other_tokens[-ref_size:]
train_other_tokens = train_other_tokens[:-ref_size]


def make_pairs(token_rows, references, per_row):
    """Pair every row with `per_row` randomly drawn reference rows.

    Draws come from the global NumPy RNG, one `randint` per pair, in row order.

    Args:
        token_rows: sequence of equal-length index arrays.
        references: sequence of reference index arrays of the same length.
        per_row: number of pairs to create for each row.

    Returns:
        List of arrays, each stacking [row, reference] along a new first axis.
    """
    pairs = []
    for row in token_rows:
        for _ in range(per_row):
            ref = references[np.random.randint(0, len(references))]
            pairs.append(np.stack([row, ref]))
    return pairs


# Label 0: a label-0 row paired with a reference; label 1: a label-1 row paired
# with a reference. The call order below fixes the RNG draw order.
negatives_train = make_pairs(train_corona_tokens, ref_tokens, 10)
positives_train = make_pairs(train_other_tokens, ref_tokens, 2)
train_labels_pair = [0] * len(negatives_train) + [1] * len(positives_train)
train_x_pair = np.concatenate([negatives_train, positives_train])
train_y_pair = np_utils.to_categorical(train_labels_pair)

negatives_val = make_pairs(val_corona_tokens, ref_tokens, 5)
positives_val = make_pairs(val_other_tokens, ref_tokens, 5)
val_labels_pair = [0] * len(negatives_val) + [1] * len(positives_val)
val_x_pair = np.concatenate([negatives_val, positives_val])
val_y_pair = np_utils.to_categorical(val_labels_pair)

In [None]:
len(train_x_pair)

In [None]:
class PairBiLSTM(object):
    """Two-tower BiLSTM classifier over a pair of padded index sequences.

    The input is the two sequences concatenated into one row of length
    `2 * max_sentence_length`. It is split in half, each half goes through
    its own embedding/BiLSTM/dense tower, and the two tower outputs are
    concatenated before the softmax layer.

    Args:
        vocabulary_size: number of rows of the embedding matrix.
        max_sentence_length: length of each of the two padded sequences.
        labels: the class labels (stored, not otherwise used by the class).
        embedding_weights: initial weights for the embedding layers.
        embedding_size: width of each embedding vector.
    """

    def __init__(self, vocabulary_size, max_sentence_length, labels,
                 embedding_weights, embedding_size=100):
        self.model = None
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.max_sentence_length = max_sentence_length
        self.embedding_weights = embedding_weights
        self.labels = labels
        self.n_labels = 2

    def add_input_layer(self):
        """Input tensor holding both sequences side by side."""
        return Input(shape=(2*self.max_sentence_length, ))

    def add_embedding_layer(self, layers):
        """Embedding lookup initialised from `embedding_weights`."""
        layers = Embedding(
            input_dim=self.vocabulary_size,
            output_dim=self.embedding_size,
            weights = [self.embedding_weights],
            input_length = self.max_sentence_length)(layers)
        return layers

    def add_recurrent_layer(self, layers):
        """Bidirectional LSTM that returns the full output sequence."""
        layers = Bidirectional(
            LSTM(units=256, return_sequences=True,
                 recurrent_dropout=0.3))(layers)
        return layers

    def add_output_layer(self, layers):
        """Softmax layer with one unit per class."""
        layers = Dense(self.n_labels, activation='softmax')(layers)
        return layers

    def _encode_branch(self, branch_input):
        """Build one tower and return its 16-unit output.

        Every call creates new layer objects, so the two towers do not
        share weights.
        """
        layers = self.add_embedding_layer(branch_input)
        layers = Dropout(0.5)(layers)
        layers = self.add_recurrent_layer(layers)
        layers = Dropout(0.5)(layers)
        layers = Dense(32, activation='relu')(layers)
        layers = Flatten()(layers)
        layers = Dense(32, activation='relu')(layers)
        layers = Dropout(0.5)(layers)
        return Dense(16, activation='relu')(layers)

    def build(self):
        """Assemble and compile the two-tower network into `self.model`."""
        # split here
        inputs = self.add_input_layer()
        split = Lambda(lambda x: tf.split(x,num_or_size_splits=2, axis=1))(inputs)
        # The second half of the input feeds the first tower and the first
        # half feeds the second tower (same order as before the refactor).
        layers0 = self._encode_branch(split[1])
        layers1 = self._encode_branch(split[0])

        layers = Concatenate()([layers0, layers1])
        outputs = self.add_output_layer(layers)

        self.model = Model(inputs=inputs, outputs=outputs)
        self.model.compile(optimizer='adam', loss='categorical_crossentropy',
                           metrics=['accuracy'])

    def fit(self, X_train, y_train, epochs, batch_size=128, validation_split=0.2):
        """Train the model (building it first if needed) and return the Keras fit result.

        Uses early stopping on 'val_accuracy' with patience 5.
        """
        if self.model is None:
            self.build()

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5,verbose = 2)
        return self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                              validation_split=validation_split, callbacks=[early_stopping], shuffle=True)

    def predict(self, X_test):
        """Return the index of the most probable class for every sample."""
        return np.argmax(self.model.predict(X_test), axis=-1)

    def evaluate(self, X_test, y_test, cm=False):
        """Print a classification report; optionally plot the confusion matrix.

        Args:
            X_test: concatenated pairs of padded index sequences.
            y_test: one-hot encoded true labels.
            cm: when True, also draw the confusion matrix.
        """
        predictions = self.predict(X_test).flatten()
        true_labels = np.argmax(y_test, axis=-1).flatten()
        print(classification_report(true_labels, predictions))
        if cm:
            # Imported locally: these names are not available at module level
            # at this point of the notebook.
            from sklearn.metrics import confusion_matrix
            import matplotlib.pyplot as plt
            # One row/column per class of this model.
            matrix = confusion_matrix(true_labels, predictions,
                                      labels=list(range(self.n_labels)))
            plt.matshow(matrix)
            plt.colorbar()


senti_label = np.array([0,1])
model = PairBiLSTM(
    vocabulary_size=len(index_dict) + 1, max_sentence_length=maxlen, 
    embedding_weights = embedding_weights, labels=senti_label, embedding_size=300)
model.build()
model.model.summary()


model.fit(X_train=np.array([np.concatenate([x[0], x[1]]) for x in train_x_pair]), y_train=train_y_pair, epochs=1)

In [None]:
model.fit(X_train=np.array([np.concatenate([x[0], x[1]]) for x in train_x_pair]), y_train=train_y_pair, epochs=1)

In [None]:
model.evaluate(np.array([np.concatenate([x[0], x[1]]) for x in val_x_pair]), val_y_pair)
import sklearn
sklearn.metrics.roc_auc_score(y_true=val_y_pair, y_score=model.model.predict(np.array([np.concatenate([x[0], x[1]]) for x in val_x_pair])), multi_class='ovo')

In [None]:
matthews_corrcoef(model.predict(np.array([np.concatenate([x[0], x[1]]) for x in val_x_pair])), val_labels_pair)

In [None]:
y_score = model.model.predict(np.array([np.concatenate([x[0], x[1]]) for x in val_x_pair]))

In [None]:
sklearn.metrics.roc_auc_score(y_true=[val_y_pair[5*i] for i in range(400)], y_score=[y_score[5*i] for i in range(400)], multi_class='ovo')
#sklearn.metrics.roc_auc_score(y_true=[val_y_pair[5*i] for i in range(400)], y_score=[np.mean(y_score[5*i:5*(i+1)], axis=0) for i in range(400)], multi_class='ovo')

In [None]:
# Per-sample decision: the validation pairs are built 5 per sample, in
# consecutive rows, so the 5 score rows of each sample are averaged and the
# argmax is compared with the label of the sample's first pair.
# NOTE(review): despite its name, `correct` is non-zero where the prediction
# differs from the label, so the value displayed below is the error rate. The
# literal 400 assumes 400 validation samples - confirm.
correct = np.argmax([val_y_pair[5*i] for i in range(400)], axis=-1) - np.argmax([np.mean(y_score[5*i:5*(i+1)], axis=0) for i in range(400)], axis=-1)
np.count_nonzero(correct) / 400

In [None]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
embedding = TransformerDocumentEmbeddings('roberta-base')
#embedding = TransformerDocumentEmbeddings('xlnet-base-cased')

In [None]:
def get_emb(sen):
  """Embed one text with the module-level `embedding` model.

  Args:
    sen: the raw text.

  Returns:
    The document embedding, moved to CPU, detached and converted to NumPy.
  """
  doc = Sentence(sen)
  embedding.embed(doc)
  vector = doc.get_embedding().cpu().detach().numpy()
  return vector

In [None]:
corona_emb = [get_emb(sen) for sen in corona_texts]
other_emb = [get_emb(sen) for sen in tqdm(other_texts)]
#non_emb = [get_emb(sen) for sen in non_texts]

In [None]:
corona_bert = np.array(corona_emb).reshape((len(corona_texts), 768))
#non_bert = np.array(non_emb).reshape((len(non_texts), 768))
other_bert = np.array(other_emb).reshape((len(other_texts), 768))

In [None]:
other_bert.shape

In [None]:
# Stack the document embeddings: corona rows first, then the other_bert rows.
data = np.concatenate([corona_bert, other_bert])
# Labels by position: 0 for corona, 1 for all but the last 700 rows of
# other_bert, 2 for the last 700.
# NOTE(review): 700 is presumably the size of the group stored last in
# `other_texts` - confirm.
ys = np.concatenate([np.zeros(len(corona_bert)), np.ones(len(other_bert)-700), np.ones(700)*2])
# Embeddings, labels and raw texts are split in one call so they stay aligned.
# NOTE(review): this rebinds `data`, `ys`, `train_ys`, `test_ys`, `train_texts`
# and `test_texts`, which earlier cells also assign; cells that read them
# depend on execution order.
train_bert, test_bert, train_ys, test_ys, train_texts, test_texts = train_test_split(data, ys, np.concatenate([corona_texts, other_texts]), test_size=0.2, random_state=0)

In [None]:
# Best bert:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(200,200), learning_rate_init=0.0005, random_state=4, max_iter=100).fit(train_bert, train_ys)
clf.score(test_bert, test_ys)

In [None]:
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(clf.predict(test_bert), test_ys)

In [None]:
test_emb = [get_emb(sen) for sen in tqdm(test_text)]

In [None]:
test_preds = clf.predict(test_emb)

In [None]:
# Translate predicted labels into output codes (0 -> 1, 2 -> 2, 1 -> 3);
# any other value is skipped.
label_to_code = {0: 1, 2: 2, 1: 3}
final = [label_to_code[x] for x in test_preds if x in label_to_code]
len(final)

In [None]:
test_data[0]['id']

In [None]:
np.unique(test_preds, return_counts=True)

In [None]:
for i in range(20):
  print(test_preds[i], test_data[i]['full_text'])

In [None]:
# Write one "<id_str>,<code>" line per test record.
test_preds = final
with open(root+"roberta-3.txt", "w") as text_file:
  for i, tweet in enumerate(test_data):
    text_file.write("".join([tweet['id_str'], ',', str(int(test_preds[i])), '\n']))
# The `with` block has already closed the file; no explicit close() is needed.

In [None]:
train_both_emb = np.concatenate([train_lstm_emb, train_bert], axis=-1)
val_both_emb = np.concatenate([val_lstm_emb, test_bert], axis=-1)
temp =MLPClassifier(hidden_layer_sizes=(300,), alpha=0.0005, random_state=0, max_iter=100).fit(train_both_emb, train_labels)
temp.score(val_both_emb, val_labels), matthews_corrcoef(temp.predict(val_both_emb), val_labels)

In [None]:
# Load the test records; `with` closes the file once it has been parsed.
with open(root+'test_tweets.json') as f:
    test_data = json.load(f)
test_text = np.array([tweet['full_text'] for tweet in test_data])
tokenize = twokenize.tokenizeRawTweetText
# Same tokenise / lower-case / lemmatise pipeline as the training data.
test_token = [preprocess(sen, tokenize) for sen in test_text]


In [None]:
len(test_text)

In [None]:
# Encode the test tokens like the train/validation data: keep purely
# alphabetic tokens that have an embedding index, drop everything else.
test_x = [[index_dict[w] for w in words if w.isalpha() and w in index_dict]
          for words in test_token]
# Pad to `maxlen`, the input width the BiLSTM models are built with, instead
# of a hard-coded 55.
test_x = sequence.pad_sequences(test_x, maxlen=maxlen)

In [None]:
test_pred = np.argmax(good_model.model.predict(test_x), axis=-1)

In [None]:
for i in range(10):
  print(test_pred[i], test_text[i])

In [None]:
np.unique(test_pred, return_counts=True)

In [None]:
test_model

In [None]:
import matplotlib.pyplot as plt
# Histogram of the flattened class probabilities clf assigns to the rows selected by inds.
# NOTE(review): the nearest assignment of inds is in a later cell, so this cell
# appears to depend on out-of-order execution — confirm.
plt.hist(clf.predict_proba(test_bert)[inds].flatten())

In [None]:
# Same histogram as the previous cell (flattened probabilities of the rows in inds).
plt.hist(clf.predict_proba(test_bert)[inds].flatten())

In [None]:
# List the rows where clf's prediction differs from test_ys, printing the
# test_ys entry, the predicted class probabilities and the text.
mismatch = clf.predict(test_bert) - test_ys != 0
inds = np.nonzero(mismatch)
probs = clf.predict_proba(test_bert)
for idx in inds[0]:
  print(test_ys[idx],  probs[idx], test_texts[idx])

In [None]:
# Show the first ten entries of corona_texts.
corona_texts[0:10]

In [None]:
from nltk.corpus import stopwords

In [None]:
# Keep the first 200 entries of corona_texts under the name ref.
ref = corona_texts[:200]

In [None]:
# Display the twokenize tokenizer function object (not called here).
twokenize.tokenizeRawTweetText

In [None]:
# Kmeans cluster
from sklearn.cluster import KMeans
# Fit 10 clusters on train_bert (an earlier variant fitted on corona_bert instead).
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(train_bert)

# Assign every corona_bert row to a cluster and show the cluster sizes.
corona_clusters = kmeans.predict(corona_bert)
np.unique(corona_clusters, return_counts=True)

In [None]:
# Train an MLP to reproduce the k-means cluster ids from the corona_bert rows
# and report its score on a 20% hold-out split.
train_b, test_b, train_kmeans, test_kmeans = train_test_split(
    corona_bert, corona_clusters, test_size=0.2, random_state=0)
# NOTE(review): this rebinds clf, the name other cells use for the classifier
# applied to test_bert.
clf = MLPClassifier(hidden_layer_sizes=(400,), learning_rate_init=0.001, random_state=0, max_iter=300)
clf.fit(train_b, train_kmeans)
clf.score(test_b, test_kmeans)

In [None]:
# Keep the 5% "test" side of a seeded split of corona_bert as the reference set
# corona_ref; the remaining 95% is discarded.
_, corona_ref = train_test_split(corona_bert, test_size=0.05, random_state=1)

In [None]:
#train_pairs, test_pairs, train_ys, test_ys = train_test_split(, pair_ys, test_size=0.2, random_state=0)

In [None]:
def _build_ref_pairs(embeddings, labels, references, refs_per_label):
    """Pair each embedding with randomly drawn reference embeddings.

    For sample i, refs_per_label[labels[i]] rows are produced. Each row is the
    sample's embedding concatenated with one reference chosen by
    np.random.randint (references may repeat). The rows of one sample are
    consecutive and all carry that sample's label.

    Returns:
        (pairs, pair_labels) as NumPy arrays.

    Raises:
        KeyError: if a label has no entry in refs_per_label.
    """
    pairs = []
    pair_labels = []
    for i, emb in enumerate(embeddings):
        label = labels[i]
        for _ in range(refs_per_label[label]):
            ref = references[np.random.randint(0, len(references))]
            pairs.append(np.concatenate([emb, ref]))
            pair_labels.append(label)
    return np.array(pairs), np.array(pair_labels)


# Seed once; the train pairs are drawn first and the test pairs second, so the
# random draws come out in the same order on every run.
np.random.seed(0)
ref_size = {0:5, 1:5}  # number of reference pairs generated per sample, keyed by label
pair_Xs, pair_ys = _build_ref_pairs(train_bert, train_ys, corona_ref, ref_size)
pair_Xs_test, pair_ys_test = _build_ref_pairs(test_bert, test_ys, corona_ref, ref_size)
print(pair_Xs.shape, pair_Xs_test.shape)

In [None]:
# Fit a two-hidden-layer MLP on the (sample, reference) pairs and report its
# score on the test pairs.
clf = MLPClassifier(hidden_layer_sizes=(400,400), learning_rate_init=0.0005, random_state=0, max_iter=100)
clf.fit(pair_Xs, pair_ys)
clf.score(pair_Xs_test, pair_ys_test)

In [None]:
# Count how often each class occurs among the first five entries of preds.
# NOTE(review): the nearest assignment of preds is in the following cell, so
# this cell appears to depend on out-of-order execution — confirm.
np.bincount(preds[0:5].astype(int))

In [None]:
# Predict every (sample, reference) test pair, then collapse each sample's
# group of consecutive pairs into a single prediction by majority vote.
refs_per_sample = 5  # group size; must equal the per-sample pair count used when the pairs were built
# The test pairs live in pair_Xs_test; `test_pairs` is never assigned in the
# visible cells (it only appears in a commented-out split).
preds = clf.predict(pair_Xs_test)
n_samples = len(preds) // refs_per_sample
preds_ori = [np.argmax(np.bincount(preds[refs_per_sample*i:refs_per_sample*(i+1)].astype(int)))
             for i in range(n_samples)]
# Take the label of each group's first pair. pair_ys_test is aligned with preds;
# test_ys holds one label per sample, so indexing it with a pair index picks the
# wrong rows and runs past its end.
test_ys_ori = [pair_ys_test[refs_per_sample*i] for i in range(n_samples)]

In [None]:
# Matthews correlation coefficient between the voted predictions and test_ys_ori.
matthews_corrcoef(preds_ori, test_ys_ori)

In [None]:
# Squared Euclidean distance from corona_bert[1] to corona_bert[0] and to corona_bert[12].
np.sum((corona_bert[1] - corona_bert[0])**2), np.sum((corona_bert[1] - corona_bert[12])**2)

In [None]:
# Check clusters: print up to three texts from each of the ten cluster ids.
for cluster_id in range(10):
  members = corona_texts[corona_clusters==cluster_id]
  print(members[:3])
  print('---')

In [None]:
# Print the test_ys entry and text of every row where clf's prediction on
# test_bert differs from test_ys.
# NOTE(review): clf is re-fitted on other inputs in earlier cells — confirm it
# is the intended classifier when this cell runs.
mismatch = clf.predict(test_bert) - test_ys != 0
inds = np.nonzero(mismatch)
for idx in inds[0]:
  print(test_ys[idx], test_texts[idx])

In [None]:
# Distinct classes clf predicts for corona_bert and how often each one occurs.
np.unique(clf.predict(corona_bert), return_counts=True)

In [None]:
# Empty accumulator list; nothing in the visible cells fills it yet.
# (A bare, truncated identifier `corona_` used to precede this line. Evaluating
# it raises NameError and stops a Run All, so it is dropped.)
agree = []

In [None]:
from transformers import AutoModel, BertTokenizerFast
import transformers

In [None]:
# Load the pretrained 'bert-base-uncased' model and its fast tokenizer.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize both text collections into fixed-length (60-token) BERT inputs.
# padding='max_length' is the current spelling of the deprecated
# pad_to_max_length=True. Sharing one kwargs dict keeps both calls in sync.
bert_encode_kwargs = dict(
    max_length = 60,
    padding='max_length',
    truncation=True
)
tokens_bert_corona = tokenizer.batch_encode_plus(corona_texts.tolist(), **bert_encode_kwargs)
tokens_bert_non = tokenizer.batch_encode_plus(non_texts.tolist(), **bert_encode_kwargs)

In [None]:
# List the fields returned by the tokenizer.
tokens_bert_non.keys()

In [None]:
import torch

In [None]:
# Run the first encoded entry of tokens_bert_non through BERT as a probe.
# unsqueeze(0) adds the leading batch dimension the model expects.
sent_id = torch.tensor(tokens_bert_non['input_ids'][0]).unsqueeze(0)
mask = torch.tensor(tokens_bert_non['attention_mask'][0]).unsqueeze(0)
# Inference only: disable autograd so no computation graph is built and kept alive.
with torch.no_grad():
    r = bert(sent_id, attention_mask=mask)

In [None]:
# Shape of the first element of the model output (presumably the last hidden state — confirm).
r[0].size()

In [None]:
# Clone the bert-sklearn repository (master branch) and install it from source.
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .
import os
# NOTE(review): this changes the kernel's working directory, so relative paths
# used by later cells resolve inside bert-sklearn/.
os.chdir("bert-sklearn")
print(os.listdir())

In [None]:
# Estimators and the model loader provided by the bert-sklearn package.
from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
# NOTE(review): this rebinds the name load_model, which the notebook's main
# import cell takes from keras.models.
from bert_sklearn import load_model

In [None]:
# Quick check of jieba's word segmentation on a short string.
# NOTE(review): no import of jieba is visible in this part of the notebook — confirm it is imported.
jieba.lcut("啊啊")

In [None]:
# Encode a few hand-written samples as padded index sequences for the model.
test = ["2333","哈哈哈","哈哈哈哈", "哈哈哈哈哈","哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈"]
# Longest segmented sample seen so far (informational only). Named max_words so
# the builtin max() is not shadowed for the rest of the session.
max_words = 0
x = []
for s in test:
    words = jieba.lcut(s)
    print(words)
    if len(words) > max_words:
        max_words = len(words)

    # Words missing from index_dict map to index 0. dict.get replaces the bare
    # `except`, which would also have hidden unrelated errors.
    x.append([index_dict.get(w, 0) for w in words])
# pad_sequences takes the list of lists directly. Wrapping the ragged list in
# np.array() first raises ValueError on NumPy >= 1.24.
x = sequence.pad_sequences(x, maxlen = 25)

In [None]:
# Look up the entry index_dict stores for the token 'ohhh'.
index_dict['ohhh']

In [None]:
# Run the padded sample sequences through the underlying model object.
model.model.predict(x)

In [None]:
# Save model.model to an .h5 file (relative to the current working directory)
# and trigger a browser download through the Colab helper.
model.model.save("review_emotion_classifier.h5")
from google.colab import files
files.download('review_emotion_classifier.h5')

In [None]:
# Dump index_dict (presumably the word -> index vocabulary — confirm) to JSON
# and download it alongside the model.
with open('index_dict.json', 'w') as f:
  json.dump(index_dict, f)
files.download('index_dict.json')

In [None]:
# Reload the saved Keras classifier from its .h5 file.
# Use keras.models.load_model explicitly: the bare name load_model is rebound by
# `from bert_sklearn import load_model` in an earlier cell, so calling it here
# would hand the Keras file to the wrong loader.
emo_model = keras.models.load_model("review_emotion_classifier.h5")

In [None]:
# Encode a few hand-written samples as padded index sequences for emo_model.
test = ["我笑死了","死的很惨","你妈死了", "玩游戏不如跳舞","我擦了，肾没了！！！", "这个故事很悲催", "这故事没有特定主角"]
# Longest segmented sample seen so far (informational only). Named max_words so
# the builtin max() is not shadowed for the rest of the session.
max_words = 0
x = []
for s in test:
    words = jieba.lcut(s)
    if len(words) > max_words:
        max_words = len(words)

    # Words missing from index_dict map to index 0. dict.get replaces the bare
    # `except`, which would also have hidden unrelated errors.
    x.append([index_dict.get(w, 0) for w in words])
# pad_sequences takes the list of lists directly. Wrapping the ragged list in
# np.array() first raises ValueError on NumPy >= 1.24.
x = sequence.pad_sequences(x, maxlen = 139)

In [None]:
# Run the reloaded model on the padded sample sequences.
emo_model.predict(x)

In [None]:
# Save the reloaded model under a new file name and download it through Colab.
emo_model.save("emotion_classifier.h5")
files.download("emotion_classifier.h5")