In [1]:
# Name of this evaluation run; the results cell writes its report to
# "models/<run>.txt".
run = 'stsb-xlm-r-multilingual-64'

In [2]:
import pandas as pd
import numpy as np
import joblib
from tqdm.auto import tqdm
import json
import glob
import os
from io import StringIO 
import sys
import torch

class Capturing(list):
    """List that fills itself with the lines printed inside its ``with`` block.

    On entry ``sys.stdout`` is swapped for an in-memory buffer; on exit the
    buffered text is split into lines, appended to this list, and the
    previous ``sys.stdout`` is put back.
    """

    def __enter__(self):
        # Remember the active stream so it can be restored on exit.
        self._stdout = sys.stdout
        self._stringio = StringIO()
        sys.stdout = self._stringio
        return self

    def __exit__(self, *args):
        captured_text = self._stringio.getvalue()
        self.extend(captured_text.splitlines())
        # Drop the buffer so its memory is released once the block ends.
        del self._stringio
        sys.stdout = self._stdout

# If the current working directory has no '.gitignore' entry, step up one
# directory. Presumably this moves from a notebook subfolder to the repository
# root so the relative 'models/' and 'data/' paths and the `src` import
# resolve — TODO confirm.
if '.gitignore' not in os.listdir():
    os.chdir('..')
    
from src.utils.emb_clf_setup_utils import clean_for_content, test_model

In [3]:
# Load the two serialized objects used for evaluation: `clf` is handed to
# test_model, and `emb` is used through its .encode() method on raw text.
# NOTE(review): torch.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which may reject fully pickled model objects — confirm
# against the pinned torch version.
clf = torch.load('models/clf.pkl')
emb = torch.load('models/emb.pkl')

In [4]:
def get_test_set():
    """Return the held-out test rows together with their stored embeddings.

    Reads columns 0 and 5 of the headerless training CSV as ``label`` and
    ``text``, maps label 0 to 0 and every other label to 1, tags each row with
    ``lang='en'``, cleans the text with ``clean_for_content`` and drops rows
    whose cleaned text is empty. The row ids in ``test_ids.txt`` then select
    both the rows of that frame and the matching rows of ``embeddings.npy``.

    Returns:
        tuple: ``(test_df, test_embeddings)`` — the selected DataFrame rows and
        the selected rows of the embeddings array.
    """
    tweets = pd.read_csv(
        'data/labeled_data/training_1600000_processed_noemoticon.csv',
        encoding='latin',
        header=None,
        usecols=[0, 5],
    )
    tweets.columns = ['label', 'text']
    tweets['label'] = [1 if raw_label != 0 else 0 for raw_label in tweets['label']]
    tweets['lang'] = 'en'
    tweets['text'] = list(map(clean_for_content, tweets['text'], tweets['lang']))

    # Re-number after dropping empty texts so the stored ids line up with the
    # frame's index labels.
    has_text = tweets['text'] != ''
    tweets = tweets[has_text].reset_index(drop=True)

    all_embeddings = np.load('data/labeled_data/embeddings.npy')

    with open('data/labeled_data/test_ids.txt', 'r') as ids_file:
        test_ids = json.load(ids_file)

    return tweets.loc[test_ids, :], all_embeddings[test_ids, :]

In [5]:
def get_alt_english():
    """Load the alternative English validation set with binary sentiment labels.

    Concatenates ``validation/test.csv`` and ``validation/train.csv``, drops
    rows with missing text, removes the ``selected_text`` column, tags every
    row with ``lang='en'`` and cleans the text with ``clean_for_content``.
    Only rows whose ``sentiment`` is ``'negative'`` (label 0) or ``'positive'``
    (label 1) are kept; neutral and any other sentiment values are discarded.

    Rows whose text is empty after cleaning are dropped as well, which is what
    get_test_set, get_alt_portuguese and get_multilingual already do; without
    this filter empty strings would be embedded and scored.

    Returns:
        pandas.DataFrame: the remaining rows with a fresh 0..n-1 index, a
        ``lang`` column and an integer ``label`` column.
    """
    df = pd.concat([
        pd.read_csv('data/labeled_data/validation/test.csv'),
        pd.read_csv('data/labeled_data/validation/train.csv')
    ])
    df = df[df['text'].notnull()].reset_index(drop=True)
    df = df.drop(columns='selected_text')
    df['lang'] = 'en'

    # Sentiments other than negative/positive map to NaN and are filtered out
    # below, so no placeholder label for "neutral" is needed.
    df['label'] = df['sentiment'].map({'negative': 0, 'positive': 1})
    df['text'] = [clean_for_content(text, lang) for text, lang in zip(df['text'], df['lang'])]

    # Drop texts that cleaning reduced to nothing, consistent with the other loaders.
    keep = df['label'].notna() & (df['text'] != '')
    subset = df[keep].reset_index(drop=True)
    subset['label'] = subset['label'].astype(int)
    return subset

In [6]:
def get_alt_portuguese():
    """Load the Portuguese validation set with binary sentiment labels.

    Reads the tab-separated ``TweetSentBR.txt`` file, names its two columns
    ``text`` and ``label``, tags every row with ``lang='pt'``, keeps only rows
    labelled -1 or 1 and recodes -1 as 0. The text is then cleaned with
    ``clean_for_content`` and rows whose cleaned text is empty are dropped.

    Returns:
        pandas.DataFrame: columns ``text``, ``label`` and ``lang`` with a fresh
        0..n-1 index.
    """
    tweets = pd.read_csv('data/labeled_data/validation/TweetSentBR.txt', sep='\t')
    tweets.columns = ['text', 'label']
    tweets['lang'] = 'pt'

    # Keep only the two polar classes, then recode negative from -1 to 0.
    is_polar = tweets['label'].isin([-1, 1])
    tweets = tweets[is_polar].reset_index(drop=True)
    tweets.loc[tweets['label'] == -1, 'label'] = 0

    tweets['text'] = list(map(clean_for_content, tweets['text'], tweets['lang']))
    has_text = tweets['text'] != ''
    return tweets[has_text].reset_index(drop=True)

In [7]:
def get_multilingual():
    """Load every multilingual validation file, keyed by language name.

    Finds all ``multilingual_twitter_sentiment_<name>.tsv`` files in the
    validation folder. For each one, in sorted order of ``<name>``, it renames
    ``tweet_full`` to ``text``, keeps only rows labelled ``'Positive'`` or
    ``'Negative'`` (recoded to 1 and 0), cleans the text with
    ``clean_for_content`` using each row's own ``lang`` value, and drops rows
    whose cleaned text is empty.

    Returns:
        dict: ``<name>`` -> pandas.DataFrame with a fresh 0..n-1 index.
    """
    prefix = 'multilingual_twitter_sentiment_'
    suffix = '.tsv'
    paths = glob.glob('data/labeled_data/validation/multilingual_twitter_sentiment_*.tsv')

    # Take the language name from the file name, not the full path string, so
    # the result does not depend on which path separator glob returns, and
    # read the matched path itself instead of rebuilding it.
    path_by_lang = {
        os.path.basename(path)[len(prefix):-len(suffix)]: path
        for path in paths
    }

    validation = {}

    for lang in sorted(path_by_lang):
        df = pd.read_csv(path_by_lang[lang], sep='\t')
        df = df.rename(columns={'tweet_full': 'text'})
        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of the frame that was read.
        df = df[df['label'].isin(['Positive', 'Negative'])].copy()
        df['label'] = [0 if x == 'Negative' else 1 for x in df['label']]
        df['text'] = [
            clean_for_content(text, row_lang)
            for text, row_lang in zip(df['text'], df['lang'])
        ]
        df = df[df['text'] != ''].reset_index(drop=True)

        validation[lang] = df
    return validation

In [8]:
# Build every evaluation dataset once so the results cell can reuse them.
# get_test_set also returns the stored embeddings for its rows; the other
# datasets are embedded later with emb.encode.
test_df, test_embeddings = get_test_set()
alt_english = get_alt_english()
alt_portuguese = get_alt_portuguese()
# Dict of DataFrames keyed by language name.
multilingual = get_multilingual()

In [10]:
# Run every evaluation with stdout captured, so the printed report can be
# both shown in the notebook and saved to disk afterwards.
with Capturing() as output:
    print("\n----------\n\nTest Set")
    test_model(clf, test_df, test_embeddings)

    # These sets have no stored embeddings, so their text is encoded first.
    for header, frame in (
        ("\n----------\n\nAlternative English", alt_english),
        ("\n----------\n\nAlternative Portuguese", alt_portuguese),
    ):
        print(header)
        embeddings = emb.encode(frame['text'].values)
        test_model(clf, frame, embeddings)

    print("\n----------\n\nMultilingual")
    for lang, frame in multilingual.items():
        print("\n", lang.upper())
        embeddings = emb.encode(frame['text'].values)
        test_model(clf, frame, embeddings)

# Echo the captured report and persist it under the run name.
report_path = "models/{}.txt".format(run)
with open(report_path, "w") as f:
    for line in output:
        print(line)
        f.write(line + '\n')


----------

Test Set
Testing model...
Got 253188 out of 320000 correct.
Accuracy rate is 0.7912125
Precision is 0.7871810367154644, Recall is 0.7980669442464708

----------

Alternative English
Testing model...
Got 16092 out of 18467 correct.
Accuracy rate is 0.871392213136947
Precision is 0.9122490412812994, Recall is 0.8351058337635519

----------

Alternative Portuguese
Testing model...
Got 8310 out of 11074 correct.
Accuracy rate is 0.7504063572331587
Precision is 0.7718365061590146, Recall is 0.8294223826714802

----------

Multilingual

 ALBANIAN
Testing model...
Got 1349 out of 1866 correct.
Accuracy rate is 0.7229367631296891
Precision is 0.8809338521400778, Recall is 0.7566844919786097

 BOSNIAN
Testing model...
Got 1477 out of 1872 correct.
Accuracy rate is 0.7889957264957265
Precision is 0.781387181738367, Recall is 0.859073359073359

 BULGARIAN
Testing model...
Got 797 out of 1101 correct.
Accuracy rate is 0.7238873751135332
Precision is 0.7008426966292135, Recall is 0.845

# Sandbox