In [1]:
from EvoMSA.base import EvoMSA
from microtc.utils import load_model, save_model
from itertools import combinations
import random
import numpy as np
from glob import glob
from joblib import Parallel, delayed
import joblib as joblib
import pandas as pd
from sklearn import metrics
import winsound

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_colwidth', None)

winsound.Beep(3000, 900)


In [2]:
winsound.Beep(4500, 300)

In [3]:
stacked_method = "sklearn.svm.LinearSVC"

# 8 Models Pre-Trained EvoMSA
pre_models_dict = {
    "davincis22": [load_model('pre_models/davincis22_Es.evomsa'), stacked_method],
    "detoxis21_aggressiveness" : [load_model('pre_models/detoxis21_aggressiveness_Es.evomsa'), stacked_method],
    "exist21" : [load_model('pre_models/exist21_Es.evomsa'), stacked_method],
    "haha21" : [load_model('pre_models/haha21_Es.evomsa'), stacked_method],
    "meoffendes21" : [load_model('pre_models/meoffendes21_Es.evomsa'), stacked_method],
    "mexa3t18_aggress" : [load_model('pre_models/mexa3t18_aggress_Es.evomsa'), stacked_method],
    "misogyny_centrogeo" : [load_model('pre_models/misogyny_centrogeo_Es.evomsa'), stacked_method],
    "metwo22" : [load_model('pre_models/metwo22_Es.evomsa'), stacked_method]
}  

print(len(pre_models_dict.keys()), pre_models_dict.keys())

8 dict_keys(['davincis22', 'detoxis21_aggressiveness', 'exist21', 'haha21', 'meoffendes21', 'mexa3t18_aggress', 'misogyny_centrogeo', 'metwo22'])


In [4]:
combinaciones = []
items = pre_models_dict.keys()
for i in range(1, len(items)):
    combs = combinations(items, i+1)
    n_c = sum(1 for e in combs)
    print(i, n_c, combs)
    combs = combinations(items, i+1)
    for c in combs:
        #print(c)
        combinaciones.append(c)
len(combinaciones)     ### 247 combinaciones de 2 a 8 elementos

1 28 <itertools.combinations object at 0x00000167A99CE540>
2 56 <itertools.combinations object at 0x00000167A99CE540>
3 70 <itertools.combinations object at 0x00000167A99CE540>
4 56 <itertools.combinations object at 0x00000167A99CE540>
5 28 <itertools.combinations object at 0x00000167A99CE540>
6 8 <itertools.combinations object at 0x00000167A99CE540>
7 1 <itertools.combinations object at 0x00000167A99CE540>


247

In [5]:
# tomar random 60 
samples = random.sample(combinaciones, 53)

# agregar los single models para garantizar que existan    
for p in pre_models_dict:
    samples.append((p, np.nan))

# agegar conjunto vacio
samples.append(()) 

len(samples), samples

(62,
 [('davincis22',
   'meoffendes21',
   'mexa3t18_aggress',
   'misogyny_centrogeo',
   'metwo22'),
  ('davincis22', 'exist21', 'metwo22'),
  ('detoxis21_aggressiveness',
   'exist21',
   'meoffendes21',
   'misogyny_centrogeo'),
  ('davincis22',
   'detoxis21_aggressiveness',
   'haha21',
   'meoffendes21',
   'misogyny_centrogeo',
   'metwo22'),
  ('exist21', 'haha21', 'meoffendes21', 'misogyny_centrogeo'),
  ('davincis22', 'meoffendes21', 'mexa3t18_aggress', 'misogyny_centrogeo'),
  ('davincis22', 'detoxis21_aggressiveness', 'misogyny_centrogeo', 'metwo22'),
  ('detoxis21_aggressiveness', 'meoffendes21', 'misogyny_centrogeo'),
  ('davincis22', 'exist21', 'haha21', 'mexa3t18_aggress'),
  ('davincis22', 'exist21', 'meoffendes21', 'mexa3t18_aggress', 'metwo22'),
  ('davincis22',
   'detoxis21_aggressiveness',
   'haha21',
   'meoffendes21',
   'mexa3t18_aggress'),
  ('exist21', 'meoffendes21', 'misogyny_centrogeo', 'metwo22'),
  ('davincis22', 'metwo22'),
  ('detoxis21_aggressivene

In [6]:
df_samples =  pd.DataFrame(samples)
df_samples.to_csv('premodels_combinations_63.csv')

In [7]:
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1200
df_samples

Unnamed: 0,0,1,2,3,4,5
0,davincis22,meoffendes21,mexa3t18_aggress,misogyny_centrogeo,metwo22,
1,davincis22,exist21,metwo22,,,
2,detoxis21_aggressiveness,exist21,meoffendes21,misogyny_centrogeo,,
3,davincis22,detoxis21_aggressiveness,haha21,meoffendes21,misogyny_centrogeo,metwo22
4,exist21,haha21,meoffendes21,misogyny_centrogeo,,
5,davincis22,meoffendes21,mexa3t18_aggress,misogyny_centrogeo,,
6,davincis22,detoxis21_aggressiveness,misogyny_centrogeo,metwo22,,
7,detoxis21_aggressiveness,meoffendes21,misogyny_centrogeo,,,
8,davincis22,exist21,haha21,mexa3t18_aggress,,
9,davincis22,exist21,meoffendes21,mexa3t18_aggress,metwo22,


In [8]:
# 17 Datasets
for i, fn in enumerate(glob("../dataset_es/*_Es_train.json")):
    train_df = pd.read_json(fn, lines=True)
    test_df = pd.read_json(fn.replace("_train", "_test"), lines=True)
    ds_n = fn.replace("..\\dataset_es\\",'').replace("../dataset_es\\",'').replace('_Es_train.json','')
    print(i, ds_n, train_df.shape, test_df.shape)  

0 davincis22 (2521, 2) (841, 2)
1 detoxis21_aggressiveness (2770, 2) (693, 2)
2 detoxis21_insult (2770, 2) (693, 2)
3 detoxis21_intolerance (2770, 2) (693, 2)
4 detoxis21_mockery (2770, 2) (693, 2)
5 detoxis21_sarcasm (2770, 2) (693, 2)
6 detoxis21_stereotype (2770, 2) (693, 2)
7 detoxis21_toxicity (2770, 2) (693, 2)
8 exist21 (2655, 2) (886, 2)
9 haha21 (18000, 2) (6000, 2)
10 meoffendes21 (3795, 2) (1265, 2)
11 metwo22 (2880, 2) (720, 2)
12 mexa3t18_aggress (5389, 5) (2311, 5)
13 misoginia (2645, 5) (662, 5)
14 misogyny_centrogeo (5752, 2) (1439, 2)
15 semeval2018_anger (1359, 6) (627, 6)
16 semeval2018_sadness (1350, 6) (641, 6)


In [9]:
# 17 Datasets
d = []
for i, fn in enumerate(glob("../dataset_es/*_Es_train.json")):
    train_df = pd.read_json(fn, lines=True)
    test_df = pd.read_json(fn.replace("_train", "_test"), lines=True)
    ds_n = fn.replace("..\\dataset_es\\",'').replace("../dataset_es\\",'').replace('_Es_train.json','')
    d.append({'dataset': ds_n, 'train_shape': train_df.shape, 'test_shape':test_df.shape})
df2 = pd.DataFrame(d)
df2.to_csv('dataset_shapes_17.csv')
df2

Unnamed: 0,dataset,train_shape,test_shape
0,davincis22,"(2521, 2)","(841, 2)"
1,detoxis21_aggressiveness,"(2770, 2)","(693, 2)"
2,detoxis21_insult,"(2770, 2)","(693, 2)"
3,detoxis21_intolerance,"(2770, 2)","(693, 2)"
4,detoxis21_mockery,"(2770, 2)","(693, 2)"
5,detoxis21_sarcasm,"(2770, 2)","(693, 2)"
6,detoxis21_stereotype,"(2770, 2)","(693, 2)"
7,detoxis21_toxicity,"(2770, 2)","(693, 2)"
8,exist21,"(2655, 2)","(886, 2)"
9,haha21,"(18000, 2)","(6000, 2)"


---