In [1]:
from EvoMSA.base import EvoMSA
from microtc.utils import load_model, save_model
from itertools import combinations
import random
import numpy as np
from glob import glob
from joblib import Parallel, delayed
import joblib as joblib
import pandas as pd
from sklearn import metrics
import winsound

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_colwidth', None)

winsound.Beep(3000, 900)


In [2]:
winsound.Beep(4500, 300)

In [3]:
stacked_method = "sklearn.svm.LinearSVC"

# 8 Models Pre-Trained EvoMSA
pre_models_dict = {
    "davincis22": [load_model('pre_models/davincis22_Es.evomsa'), stacked_method],
    "detoxis21_aggressiveness" : [load_model('pre_models/detoxis21_aggressiveness_Es.evomsa'), stacked_method],
    "exist21" : [load_model('pre_models/exist21_Es.evomsa'), stacked_method],
    "haha21" : [load_model('pre_models/haha21_Es.evomsa'), stacked_method],
    "meoffendes21" : [load_model('pre_models/meoffendes21_Es.evomsa'), stacked_method],
    "mexa3t18_aggress" : [load_model('pre_models/mexa3t18_aggress_Es.evomsa'), stacked_method],
    "misogyny_centrogeo" : [load_model('pre_models/misogyny_centrogeo_Es.evomsa'), stacked_method],
    "metwo22" : [load_model('pre_models/metwo22_Es.evomsa'), stacked_method]
}  

print(len(pre_models_dict.keys()), pre_models_dict.keys())

8 dict_keys(['davincis22', 'detoxis21_aggressiveness', 'exist21', 'haha21', 'meoffendes21', 'mexa3t18_aggress', 'misogyny_centrogeo', 'metwo22'])


In [4]:
combinaciones = []
items = pre_models_dict.keys()
for i in range(1, len(items)):
    combs = combinations(items, i+1)
    n_c = sum(1 for e in combs)
    print(i, n_c, combs)
    combs = combinations(items, i+1)
    for c in combs:
        #print(c)
        combinaciones.append(c)
len(combinaciones)     ### 247 combinaciones de 2 a 8 elementos

1 28 <itertools.combinations object at 0x000002903EBCB770>
2 56 <itertools.combinations object at 0x000002903EBCB770>
3 70 <itertools.combinations object at 0x000002903EBCB770>
4 56 <itertools.combinations object at 0x000002903EBCB770>
5 28 <itertools.combinations object at 0x000002903EBCB770>
6 8 <itertools.combinations object at 0x000002903EBCB770>
7 1 <itertools.combinations object at 0x000002903EBCB770>


247

In [5]:
# tomar random 60 
samples = random.sample(combinaciones, 80)

# agregar los single models para garantizar que existan    
for p in pre_models_dict:
    samples.append((p, np.nan))

# agegar conjunto vacio
samples.append(()) 

len(samples), samples

(89,
 [('meoffendes21', 'mexa3t18_aggress'),
  ('davincis22',
   'haha21',
   'mexa3t18_aggress',
   'misogyny_centrogeo',
   'metwo22'),
  ('exist21', 'haha21', 'meoffendes21', 'mexa3t18_aggress', 'metwo22'),
  ('haha21', 'metwo22'),
  ('davincis22', 'detoxis21_aggressiveness', 'mexa3t18_aggress'),
  ('exist21', 'haha21', 'mexa3t18_aggress', 'misogyny_centrogeo'),
  ('mexa3t18_aggress', 'misogyny_centrogeo', 'metwo22'),
  ('davincis22', 'exist21', 'haha21', 'mexa3t18_aggress'),
  ('davincis22', 'haha21', 'meoffendes21', 'mexa3t18_aggress', 'metwo22'),
  ('exist21', 'meoffendes21', 'misogyny_centrogeo', 'metwo22'),
  ('detoxis21_aggressiveness',
   'exist21',
   'meoffendes21',
   'misogyny_centrogeo',
   'metwo22'),
  ('davincis22',
   'detoxis21_aggressiveness',
   'meoffendes21',
   'misogyny_centrogeo',
   'metwo22'),
  ('davincis22',
   'detoxis21_aggressiveness',
   'meoffendes21',
   'mexa3t18_aggress',
   'metwo22'),
  ('detoxis21_aggressiveness',
   'haha21',
   'meoffendes21'

In [6]:
df_samples =  pd.DataFrame(samples)
df_samples.to_csv('premodels_combinations_89.csv')

In [7]:
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1200
df_samples

Unnamed: 0,0,1,2,3,4,5,6
0,meoffendes21,mexa3t18_aggress,,,,,
1,davincis22,haha21,mexa3t18_aggress,misogyny_centrogeo,metwo22,,
2,exist21,haha21,meoffendes21,mexa3t18_aggress,metwo22,,
3,haha21,metwo22,,,,,
4,davincis22,detoxis21_aggressiveness,mexa3t18_aggress,,,,
5,exist21,haha21,mexa3t18_aggress,misogyny_centrogeo,,,
6,mexa3t18_aggress,misogyny_centrogeo,metwo22,,,,
7,davincis22,exist21,haha21,mexa3t18_aggress,,,
8,davincis22,haha21,meoffendes21,mexa3t18_aggress,metwo22,,
9,exist21,meoffendes21,misogyny_centrogeo,metwo22,,,


In [8]:
# 17 Datasets
for i, fn in enumerate(glob("dataset_es/*_Es_train.json")):
    train_df = pd.read_json(fn, lines=True)
    test_df = pd.read_json(fn.replace("_train", "_test"), lines=True)
    ds_n = fn.replace("dataset_es\\",'').replace('_Es_train.json','')
    print(i, ds_n, train_df.shape, test_df.shape)  

0 davincis22 (2521, 2) (841, 2)
1 detoxis21_aggressiveness (2770, 2) (693, 2)
2 detoxis21_insult (2770, 2) (693, 2)
3 detoxis21_intolerance (2770, 2) (693, 2)
4 detoxis21_mockery (2770, 2) (693, 2)
5 detoxis21_sarcasm (2770, 2) (693, 2)
6 detoxis21_stereotype (2770, 2) (693, 2)
7 detoxis21_toxicity (2770, 2) (693, 2)
8 exist21 (2655, 2) (886, 2)
9 haha21 (18000, 2) (6000, 2)
10 meoffendes21 (3795, 2) (1265, 2)
11 metwo22 (2880, 2) (720, 2)
12 mexa3t18_aggress (5389, 5) (2311, 5)
13 misoginia (2645, 5) (662, 5)
14 misogyny_centrogeo (5752, 2) (1439, 2)
15 semeval2018_anger (1359, 6) (627, 6)
16 semeval2018_sadness (1350, 6) (641, 6)


In [9]:
# 17 Datasets
d = []
for i, fn in enumerate(glob("dataset_es/*_Es_train.json")):
    train_df = pd.read_json(fn, lines=True)
    test_df = pd.read_json(fn.replace("_train", "_test"), lines=True)
    ds_n = fn.replace("dataset_es\\",'').replace('_Es_train.json','')
    d.append({'dataset': ds_n, 'train_shape': train_df.shape, 'test_shape':test_df.shape})
df2 = pd.DataFrame(d)
df2.to_csv('dataset_shapes_17.csv')
df2

Unnamed: 0,dataset,train_shape,test_shape
0,davincis22,"(2521, 2)","(841, 2)"
1,detoxis21_aggressiveness,"(2770, 2)","(693, 2)"
2,detoxis21_insult,"(2770, 2)","(693, 2)"
3,detoxis21_intolerance,"(2770, 2)","(693, 2)"
4,detoxis21_mockery,"(2770, 2)","(693, 2)"
5,detoxis21_sarcasm,"(2770, 2)","(693, 2)"
6,detoxis21_stereotype,"(2770, 2)","(693, 2)"
7,detoxis21_toxicity,"(2770, 2)","(693, 2)"
8,exist21,"(2655, 2)","(886, 2)"
9,haha21,"(18000, 2)","(6000, 2)"


---

In [13]:
print(pre_models_dict)
fnames = glob("dataset_es/meoffendes21_Es_train.json") ## todos los train datasets
fnames.sort()
for fn in fnames:
    print(fn)
    for i, sample in df_samples[:2].iterrows():
        ds_n = fn.replace("dataset_es\\",'').replace("dataset_es/",'').replace('_Es_train.json','')
        sample_avoid = sample.dropna().values[sample.dropna().values != ds_n]
        print(sample.dropna().values, sample_avoid)

{'davincis22': [<EvoMSA.base.EvoMSA object at 0x0000022E90513E80>, 'sklearn.svm.LinearSVC'], 'detoxis21_aggressiveness': [<EvoMSA.base.EvoMSA object at 0x0000022E905371C0>, 'sklearn.svm.LinearSVC'], 'exist21': [<EvoMSA.base.EvoMSA object at 0x0000022E90537CA0>, 'sklearn.svm.LinearSVC'], 'haha21': [<EvoMSA.base.EvoMSA object at 0x0000022E905374C0>, 'sklearn.svm.LinearSVC'], 'meoffendes21': [<EvoMSA.base.EvoMSA object at 0x0000022E90537280>, 'sklearn.svm.LinearSVC'], 'mexa3t18_aggress': [<EvoMSA.base.EvoMSA object at 0x0000022E90537670>, 'sklearn.svm.LinearSVC'], 'misogyny_centrogeo': [<EvoMSA.base.EvoMSA object at 0x0000022E9587E190>, 'sklearn.svm.LinearSVC'], 'metwo22': [<EvoMSA.base.EvoMSA object at 0x0000022E9587E340>, 'sklearn.svm.LinearSVC']}
dataset_es/meoffendes21_Es_train.json
['haha21' 'meoffendes21'] ['haha21']
['exist21' 'misogyny_centrogeo' 'metwo22'] ['exist21' 'misogyny_centrogeo' 'metwo22']


In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1200
df1 = pd.read_csv('results_kfold.csv')
df2 = pd.read_csv('results_build_table_mms.csv')

df = pd.concat([df1, df2], ignore_index=True)
print(df1.shape, df2.shape, df.shape)

In [None]:
#df['dataset'] = df['dataset'].str.replace(r'dataset_es\\', '')
#df
# df = df.drop(df.columns[0], axis=1)
#df.rename(columns = {df.columns[0]:'Idx'}, inplace = True)
#df.drop('Idx', inplace=True, axis=1)
#df.fillna("", inplace=True)
df

In [None]:
df.to_csv('results_kfolds_mms.csv')

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1200
df1 = pd.read_csv('results_full_54_keep.csv')
df2 = pd.read_csv('results_full_56_keep.csv')
df3 = pd.read_csv('results_full_57_keep.csv')
df4 = pd.read_csv('results_full_58_keep.csv')
df5 = pd.read_csv('results_full_59_keep.csv')
df6 = pd.read_csv('results_full_60_keep.csv')
df7 = pd.read_csv('results_full_61_keep.csv')
df8 = pd.read_csv('results_full_62_keep.csv')
df9 = pd.read_csv('results_full_63_keep.csv')
df10 = pd.read_csv('results_full_64_keep.csv')
df11 = pd.read_csv('results_full_65_keep.csv')
df12 = pd.read_csv('results_full_66_keep.csv')


df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12], ignore_index=True)
print(df1.shape, df2.shape, df.shape)

In [None]:
df.to_csv('results_full_17.csv')

In [None]:
df