In [1]:
import  warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, KFold, ShuffleSplit
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
import torchvision
from torch import optim
from torch import cuda
from torch.optim.lr_scheduler import CosineAnnealingLR

import time, gc, random
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns

from stop_words import get_stop_words
import nltk, string
from nltk.stem.porter import PorterStemmer

import lightgbm as lgb

# Seed every random number generator the notebook relies on, so that a
# Restart Kernel -> Run All reproduces the same sampling and model init.
SEED = 2020
random.seed(SEED)
np.random.seed(SEED)
# torch.manual_seed seeds the CPU generator; the original cell only seeded
# the CUDA generators, leaving CPU-side torch randomness unseeded.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Re-apply the filter after the heavy imports above, in case one of them
# modified the warning filters while being imported.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
from Preprocessing import make_tfidf_df
from one_target import Train_Predict as Onetarget_Train_Predict
from all_target import Train_Predict as Alltarget_Train_Predict

In [32]:
def main(path='../', n_train_samples=400, epoch_num=1, languages=None,
         labels=(1, 2, 3, 4), hidden_layers=(300, 100), lr=0.0006):
    """Train the all-target and one-target models per language and collect test predictions.

    For every text column in ``languages`` the combined train/test frame is
    passed to ``make_tfidf_df``; an ``Alltarget_Train_Predict`` model and an
    ``Onetarget_Train_Predict`` model (once per entry of ``labels``) are then
    asked for test predictions via ``predict_test_df``.

    Parameters
    ----------
    path : str
        Directory containing ``train_translated.csv`` and ``test_translated.csv``.
    n_train_samples : int or None
        Number of training rows to draw with ``DataFrame.sample``. ``None``
        uses the whole training file. The default of 400 matches the original
        quick-run setting.
    epoch_num : int
        Forwarded to ``predict_test_df``.
    languages : sequence of str or None
        Text columns to process; ``None`` selects the description column and
        its four translations.
    labels : sequence
        Labels passed one at a time to the one-target predictor.
    hidden_layers : sequence of int
        Forwarded (as a list) to both ``Train_Predict`` constructors.
    lr : float
        Forwarded to both ``Train_Predict`` constructors.

    Returns
    -------
    (test_dic, loss_dic) : tuple of dict
        ``test_dic[language]['all']`` / ``['one']`` hold the prediction columns
        (names containing ``'p_'``) prefixed with the language name.
        ``loss_dic[language]['all']`` and ``loss_dic[language][str(label)]``
        hold the second value returned by ``predict_test_df``.
    """
    import os  # function-scope import keeps this cell self-contained

    if languages is None:
        languages = ['description', 'translate_de', 'translate_es', 'translate_fr', 'translate_ja']

    train_raw = pd.read_csv(os.path.join(path, 'train_translated.csv'))
    if n_train_samples is not None:
        # Sub-sample for a fast run; relies on the global numpy seed for repeatability.
        train_raw = train_raw.sample(n_train_samples)
    test_raw = pd.read_csv(os.path.join(path, 'test_translated.csv'))

    combined = pd.concat([train_raw, test_raw], axis=0, ignore_index=True)
    # Move 'id' to a trailing 'text_id' column (same column order as before).
    combined = combined.assign(text_id=combined['id']).drop(columns=['id'])

    test_dic = {}
    loss_dic = {}
    for language in languages:
        test_dic[language] = {}
        loss_dic[language] = {}

        lang_train, lang_test, mlp_feature = make_tfidf_df(combined, language)

        # One model predicting all targets at once.
        all_model = Alltarget_Train_Predict(
            lang_train, lang_test, mlp_feature, hidden_layers=list(hidden_layers), lr=lr)
        all_test_df, all_trn_loss_list = all_model.predict_test_df(epoch_num)
        test_dic[language]['all'] = _prefixed_prediction_columns(all_test_df, language)
        loss_dic[language]['all'] = all_trn_loss_list

        # One predictor instance, queried separately for each label.
        one_model = Onetarget_Train_Predict(
            lang_train, lang_test, mlp_feature, hidden_layers=list(hidden_layers), lr=lr)
        per_label_frames = []
        for label in labels:
            one_test_df, one_trn_loss_list = one_model.predict_test_df(label, epoch_num)
            per_label_frames.append(_prefixed_prediction_columns(one_test_df, language))
            loss_dic[language][f'{label}'] = one_trn_loss_list

        test_dic[language]['one'] = pd.concat(per_label_frames, axis=1)

    return test_dic, loss_dic


def _prefixed_prediction_columns(frame, language):
    """Return the columns of ``frame`` whose name contains 'p_', renamed to '<language>_<col>'."""
    # Substring match, exactly as in the original selection logic.
    preds_cols = [col for col in frame.columns if 'p_' in col]
    return frame[preds_cols].rename(columns={col: f'{language}_{col}' for col in preds_cols})

In [33]:
# Run the whole pipeline once. `test_dic` (per-language prediction frames) is
# inspected in the cells below; `loss_dic` keeps the training-loss lists
# returned alongside the predictions.
test_dic, loss_dic = main()

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [37]:
# Predictions of the all-target model for the 'description' text column.
# NOTE(review): in the saved output every row is ~0.24-0.26 in all four
# columns, i.e. close to uniform; presumably a consequence of the 400-row
# sample and single epoch used in main() - confirm before relying on these.
test_dic['description']['all']

Unnamed: 0,description_p_1_0,description_p_2_0,description_p_3_0,description_p_4_0
0,0.244496,0.255565,0.261080,0.238858
1,0.244159,0.255949,0.261043,0.238849
2,0.244618,0.255808,0.261189,0.238385
3,0.244628,0.255889,0.260789,0.238693
4,0.244677,0.255797,0.260837,0.238689
5,0.244386,0.255616,0.261373,0.238625
6,0.244739,0.255374,0.260983,0.238904
7,0.244327,0.255640,0.261436,0.238597
8,0.244659,0.255451,0.261170,0.238720
9,0.244510,0.255725,0.260980,0.238786


In [38]:
# Predictions of the one-target models (one column per label) for the
# 'description' text column.
# NOTE(review): in the saved output all values sit around 0.48-0.53, so the
# per-label models barely separate rows yet; presumably due to the short
# quick-run training in main() - confirm before relying on these.
test_dic['description']['one']

Unnamed: 0,description_p_1_0,description_p_2_0,description_p_3_0,description_p_4_0
0,0.516021,0.480197,0.531566,0.508467
1,0.515684,0.480303,0.531698,0.509000
2,0.515776,0.480801,0.532319,0.509405
3,0.515636,0.480718,0.531761,0.508299
4,0.515525,0.480276,0.531729,0.508673
5,0.516062,0.480884,0.531741,0.508001
6,0.515823,0.479711,0.531600,0.508857
7,0.516256,0.481287,0.532184,0.508879
8,0.515530,0.479894,0.531687,0.508411
9,0.516231,0.480511,0.531873,0.509062
