In [1]:
import pandas as pd
from glob import glob
from microtc.utils import load_model
from EvoMSA.base import EvoMSA
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import numpy as np
from joblib import Parallel, delayed
import joblib as joblib
from timeit import default_timer as timer

import warnings
warnings.filterwarnings("ignore")

import winsound
winsound.Beep(3000, 900)  # audible cue that the import cell has finished


# Display settings: never truncate cell text, allow long tables to be shown.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1200)

# Classifier name handed to EvoMSA and stored alongside every pre-trained model.
stacked_method = "sklearn.svm.LinearSVC"

# Names of the 9 pre-trained EvoMSA models; each one is stored on disk as
# pre_models/<name>_Es.evomsa.
PRE_MODEL_NAMES = (
    "davincis22",
    "detoxis21_aggressiveness",
    "exist21",
    "haha21",
    "meoffendes21",
    "mexa3t18_aggress",
    "misogyny_centrogeo",
    "metwo22",
    "haha18",
)

# name -> [loaded model, stacked method]
pre_models_dict = {
    model_name: [load_model(f'pre_models/{model_name}_Es.evomsa'), stacked_method]
    for model_name in PRE_MODEL_NAMES
}

print(len(pre_models_dict), pre_models_dict.keys())

# Each row of the CSV is one combination of pre-model names (NaN-padded).
# The first CSV column is only the saved row index, so it is discarded.
df_samples = pd.read_csv('premodels_combinations_50.csv')
df_samples = df_samples.drop(columns=df_samples.columns[0])
df_samples

9 dict_keys(['davincis22', 'detoxis21_aggressiveness', 'exist21', 'haha21', 'meoffendes21', 'mexa3t18_aggress', 'misogyny_centrogeo', 'metwo22', 'haha18'])


Unnamed: 0,0,1,2,3,4,5,6
0,detoxis21_aggressiveness,haha21,mexa3t18_aggress,metwo22,haha18,,
1,davincis22,meoffendes21,mexa3t18_aggress,haha18,,,
2,detoxis21_aggressiveness,haha21,meoffendes21,misogyny_centrogeo,,,
3,detoxis21_aggressiveness,exist21,haha21,metwo22,,,
4,davincis22,detoxis21_aggressiveness,exist21,haha21,meoffendes21,mexa3t18_aggress,metwo22
5,davincis22,detoxis21_aggressiveness,exist21,haha21,meoffendes21,,
6,davincis22,exist21,haha21,mexa3t18_aggress,metwo22,haha18,
7,davincis22,meoffendes21,mexa3t18_aggress,misogyny_centrogeo,haha18,,
8,davincis22,exist21,haha21,mexa3t18_aggress,misogyny_centrogeo,haha18,
9,davincis22,exist21,meoffendes21,,,,


In [2]:
def process(fn):
    """Evaluate every pre-model combination in ``df_samples`` on one dataset.

    For each row of ``df_samples`` an EvoMSA model is trained on the train
    split and scored on the test split (macro F1 and macro recall), and is then
    scored again with a 5-fold stratified cross-validation over the train split
    (macro F1 per fold).

    Parameters
    ----------
    fn : str
        Path to a ``*_Es_train.json`` file (JSON lines with ``text`` and
        ``klass`` columns). The test file is located by replacing ``_train``
        with ``_test`` in the path.

    Returns
    -------
    list of list
        One row per successfully evaluated combination:
        ``[dataset, combination idx, train shape, test shape, f1, recall,
        pre-model names, combination idx, train shape, mean, std, min, max,
        per-fold f1 scores]``.
        Rows are also appended to ``debug_build.txt``.

    Notes
    -----
    * The rows are collected in a list local to this call. The previous
      version appended to a module-level ``results`` list, which fails when
      that global does not exist and otherwise mixes rows from different calls.
      Callers must use the return value.
    * A failing combination is logged to ``debug_build.txt`` and skipped; it
      does not abort the remaining combinations.
    """
    train_df = pd.read_json(fn, lines=True)
    test_df = pd.read_json(fn.replace("_train", "_test"), lines=True)

    # Optional truncation for quick smoke runs:
    #train_df = train_df.loc[0:89, ['text', 'klass']]
    #test_df = test_df.loc[0:16, ['text', 'klass']]

    X_train, y_train = train_df['text'], train_df['klass']
    X_test,  y_test  = test_df['text'],  test_df['klass']

    # Dataset name = file name without directory prefix and '_Es_train.json'.
    ds_n = fn.replace("dataset_es\\",'').replace("dataset_es/",'').replace('_Es_train.json','')

    print(ds_n, train_df.shape, test_df.shape)

    X_F, y_F = train_df['text'], train_df['klass']  # data used for the K-fold evaluation

    results = []  # rows produced by THIS call only

    for i, sample in df_samples.iterrows():
        try:
            # Drop the pre-model that was trained on the dataset under
            # evaluation, so a model is never stacked on its own dataset.
            candidate_names = sample.dropna().values
            sample_avoid = candidate_names[candidate_names != ds_n]
            # Look up the loaded pre-model objects.
            pre_models = [pre_models_dict[key] for key in sample_avoid]

            ### train & test ###
            evo = EvoMSA(TR=True, B4MSA=True, lang='es', Emo=True, HA=True, stacked_method=stacked_method, models = pre_models)
            evo.fit(X_train, y_train)
            pred = evo.predict(X_test)
            recall_score = metrics.recall_score(y_test, pred, average="macro")
            f1_score = metrics.f1_score(y_test, pred, average="macro")

            winsound.Beep(4500, 300)

            ### K-Fold / train ###
            scores_kfold = []
            skf = StratifiedKFold(n_splits=5)
            for train_index, test_index in skf.split(X_F, y_F):
                # split() yields positional indices, so select with .iloc
                # rather than relying on the Series index labels.
                X_train_F, X_test_F = X_F.iloc[train_index], X_F.iloc[test_index]
                y_train_F, y_test_F = y_F.iloc[train_index], y_F.iloc[test_index]

                evo = EvoMSA(TR=True, B4MSA=True, lang='es', Emo=True, HA=True, stacked_method=stacked_method, models = pre_models)
                evo.fit(X_train_F, y_train_F)
                pred_F = evo.predict(X_test_F)
                # Only the fold F1 is reported. The fold recall is deliberately
                # not stored in `recall_score`, which must keep the hold-out
                # (test split) recall computed above.
                f1_score_F = metrics.f1_score(y_test_F, pred_F, average="macro")

                scores_kfold.append(f1_score_F)

                winsound.Beep(4500, 300)

            row = [ds_n, i, train_df.shape, test_df.shape, f1_score, recall_score,
                   sample_avoid.tolist(),
                   i, train_df.shape, np.mean(scores_kfold, axis=0), np.std(scores_kfold, axis=0), np.min(scores_kfold), np.max(scores_kfold), scores_kfold]

            with open('debug_build.txt', 'a', encoding='utf-8') as the_file:
                txt = ', '.join([str(r) for r in row])
                the_file.write(txt)
                the_file.write('\n')
            print(row)

            results.append(row)

        except Exception as ex:
            # Best-effort: signal the failure, log it and go on with the next
            # combination.
            winsound.Beep(500, 100)
            winsound.Beep(500, 100)
            winsound.Beep(500, 100)
            winsound.Beep(500, 100)
            print('>>', ex)
            with open('debug_build.txt', 'a', encoding='utf-8') as the_file:
                the_file.write('\n')
                the_file.write('=======================================================\n')
                the_file.write('Exception: ' + ds_n + ' > ' + str(i) + ' > ' + '\n')
                # An exception object cannot be concatenated to a str;
                # convert it explicitly so the handler itself cannot fail.
                the_file.write(str(ex) + '\n')
        finally:
            winsound.Beep(3000, 900)

    return results

In [3]:
# Serial run over a subset of the train datasets, kept for debugging.
# Disabled on purpose; the parallel run in the cells below is the one in use.
if False:
    results = []
    fnames = sorted(glob("dataset_es/meoff*_train.json"))  # matching train datasets, in name order
    for fn in fnames:
        print(fn)
        process(fn)

In [1]:
# Collect the train datasets (in name order) and prepare one delayed
# process() call per file; nothing is executed in this cell.
fnames = sorted(glob("dataset_es/m*_train.json"))
results = []
parallel_pool = Parallel(n_jobs=7)
delayed_process = delayed(process)
delayed_funcs = [delayed_process(fn) for fn in fnames]

NameError: name 'glob' is not defined

In [5]:
# Execute the prepared process() calls on the joblib pool and report the
# elapsed wall-clock time in seconds. `results` ends up holding one entry per
# dataset file: whatever process() returned for that file.
start = timer()
results = parallel_pool(delayed_funcs)
end = timer()
print(end - start)

# Audible cue that the whole run has finished.
winsound.Beep(500, 900)

924.2143143999999


In [6]:
# `results` holds one list of rows per dataset; flatten it into a single list
# of rows, keeping the original order.
clean_results = [row for dataset_rows in results for row in dataset_rows]

# Column labels, in the same order as the fields of each row built by process().
result_columns = ['dataset', 'combina_1', 'train.shape', 'test.shape', 'f1', 'recall',
                  'combinations', 'combina_2', 'train.shape', 'mean', 'std', 'min', 'max', 'scores_kfold']

results_df = pd.DataFrame(data=clean_results, columns=result_columns)
results_df.to_csv('results_full_03.csv')
#results_df

print('--- done ---')

--- done ---
