In [207]:
!ls ..

!echo "------------------------------------"

!ls ../logs

!echo "------------------------------------"

!ls ../datasets/twitter/intermediate

bert_runner.py	datasets      notebooks		 statistical_test.py
configurations	logs	      pretrained-models  statistical_tests
datahandlers	ml_runner.py  process.py	 utils
datahelpers	models	      README.md
------------------------------------
 __init__.py			      twitter-evaluation-roberta.json   v1
'twitter-evaluation-(5-CV)-ml.json'   twitter-history-roberta.json
 twitter-evaluation-ml.json	      twitter_stats.json
------------------------------------
twitter_test.csv  twitter_train.csv  twitter_val.csv


In [208]:
import __init__
import json
import re
import random
import pandas as pd
from datahandlers import DataReader
from sklearn.feature_extraction.text import TfidfVectorizer
random.seed(10)

tfidf = TfidfVectorizer(ngram_range=(2,4), sublinear_tf=True, analyzer='char')

def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", tweet.lower()).split())

In [215]:
val = DataReader.load_csv("../datasets/twitter/intermediate/twitter_val.csv")
val['processed_tweets'] = val['tweet'].apply(process_tweet)


test = DataReader.load_csv("../datasets/twitter/intermediate/twitter_test.csv")
test['processed_tweets'] = test['tweet'].apply(process_tweet)

train = DataReader.load_csv("../datasets/twitter/intermediate/twitter_train.csv")
train['processed_tweets'] = train['tweet'].apply(process_tweet)

display(test.head(2))

NgramLSVM_predicts = DataReader.load_json("../logs/twitter-evaluation-ml.json")
RoBERTa_predicts = DataReader.load_json("../logs/twitter-evaluation-roberta.json")

print(f"RoBERTa_predicts keys:{RoBERTa_predicts.keys()}")

Unnamed: 0,tweet,label,processed_tweets
0,@user âmy mom says my smile is captivatingâ...,0,my mom says my smile is captivating i says hap...
1,"in 3 days i will be meeting my sis-n-law, coun...",0,in 3 days i will be meeting my sis n law coune...


RoBERTa_predicts keys:dict_keys(['Test-F1 Macro', 'Test-accuracy', 'Test-classification-report', 'Test-gt', 'Test-predict', 'Val-F1 Macro', 'Val-accuracy', 'Val-classification-report', 'Val-gt', 'Val-predict'])


In [202]:
gold = NgramLSVM_predicts['Test-gt']

clf1_pred_eval = [1 if gt == NgramLSVM_predicts['Test-predict'][i] else 0 for i, gt in enumerate(gold)]

clf2_pred_eval = [1 if gt == RoBERTa_predicts['Test-predict'][i] else 0 for i, gt in enumerate(gold)]

ngramlsvm_correct_roberta_wrong_ids = [i for i, _ in enumerate(clf1_pred_eval)
                                      if clf1_pred_eval[i] == 1 and clf2_pred_eval[i] == 0]

roberta_correct_ngramlsvm_wrong_ids = [i for i, _ in enumerate(clf1_pred_eval)
                                      if clf1_pred_eval[i] == 0 and clf2_pred_eval[i] == 1]

both_models_predict_wrong = [i for i, _ in enumerate(clf1_pred_eval)
                            if clf1_pred_eval[i] == 0 and clf2_pred_eval[i] == 0]

both_models_predict_correctly = [i for i, _ in enumerate(clf1_pred_eval)
                            if clf1_pred_eval[i] == 1 and clf2_pred_eval[i] == 1]


print(f"# of samples NgramLSVM predict correctly but RoBERTa predicts wrongly:"
      f"{len(ngramlsvm_correct_roberta_wrong_ids)}\n")

print(f"# of samples RoBERTa predict correctly but NgramLSVM predicts wrongly:"
      f"{len(roberta_correct_ngramlsvm_wrong_ids)}\n")

print(f"# both models predict wrongly:"
      f"{len(both_models_predict_wrong)}\n")

print(f"# both models predict correctly:"
      f"{len(both_models_predict_correctly)}")

# of samples NgramLSVM predict correctly but RoBERTa predicts wrongly:35

# of samples RoBERTa predict correctly but NgramLSVM predicts wrongly:166

# both models predict wrongly:58

# both models predict correctly:6134


In [225]:
# picking 10 first samples from each category for both classes for investigations
sample_no = 10

def get_random_samples(ids, sample_no):
    sample = random.sample(range(0, len(ids)), k=sample_no)
    #print(len(list(set(sample))))
    return [ids[i] for i in sample if ids[i] < 4000]

def get_samples(data, ids, sample_no):
    select = get_random_samples(ids, sample_no)
    print(select)
#    select = [i for i in select]
    
    #[random.choice(ids) for i in range(sample_no)]
    df_data = {
        "tweet":[data['tweet'].tolist()[i] for i in select],
        "label":[data['label'].tolist()[i] for i in select],
        "processed_tweets":[data['processed_tweets'].tolist()[i] 
                            for i in select]
    }
    return pd.DataFrame(df_data)

ngramlsvm_correct_roberta_wrong_samples = get_samples(test, ngramlsvm_correct_roberta_wrong_ids, sample_no)
roberta_correct_ngramlsvm_wrong_samples = get_samples(test, roberta_correct_ngramlsvm_wrong_ids, sample_no)
both_models_predict_wrong_samples = get_samples(test, both_models_predict_wrong, sample_no)
both_models_predict_correctly_samples = get_samples(test, both_models_predict_correctly, sample_no)

[1783, 452, 3971, 1464, 1985, 121, 2307]
[1768, 3624, 3973, 3023, 2824, 2219, 3184]
[1836, 760, 2969, 1072, 2819, 1873, 516]
[35, 2003, 1143, 1660, 2581, 3122, 2044, 2682]


In [226]:
test.shape

(4795, 3)

In [227]:
both_models_predict_correctly_samples.head(10)

Unnamed: 0,tweet,label,processed_tweets
0,#smallscale #woodland #beekeeping. #bees are ...,0,smallscale woodland beekeeping bees are in the...
1,this will be an amazingly talkshow this thursd...,0,this will be an amazingly talkshow this thursd...
2,happy father's day ! #belgrade #guinea-bissau,0,happy father s day belgrade guinea bissau
3,"@user i had a dream we went skiing, but we ne...",0,i had a dream we went skiing but we never made...
4,"one day, i'll go to a nude beach ðð´ð¾â...",0,one day i ll go to a nude beach hippie life
5,"@user i'm surprised they didn't have a gun, th...",0,i m surprised they didn t have a gun they usua...
6,depressed blvck princ3 âï¸ðâ¹ï¸ð #c...,0,depressed blvck princ3 clubkid dpressed blvck ...
7,**new 190k gym** coming 14th june to your pola...,0,new 190k gym coming 14th june to your poland c...


In [142]:
#tfidf.fit(test[''])

In [131]:

# Output [60, 20, 60]


In [205]:
test = DataReader.load_csv("../datasets/twitter/intermediate/twitter_test.csv")


train = DataReader.load_csv("../datasets/twitter/intermediate/twitter_train.csv")
test.shape, train.shape

((4795, 2), (23091, 2))

In [206]:
len(gold)

6393