In [8]:
import os , sys
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from stop_words import get_stop_words
import nltk, string
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier


import lightgbm as lgb

from tqdm import tqdm_notebook as tqdm

import random

import optuna

random.seed(2020)
np.random.seed(2020)

warnings.filterwarnings('ignore')
%matplotlib inline

In [9]:
class Preprocessing:
    def __init__(self):
        self.porter = PorterStemmer()
        self.stop_words = get_stop_words('en')
        self.stop_words.append(' ')
        self.stop_words.append('')
    
    def pipeline(self, df):
        for lang in ['description']:
            #, 'translate_es', 'translate_fr', 'translate_de', 'translate_ja']:
            df[lang] = df[lang].apply(lambda x: self.change_text(x))
        return df

    def change_text(self, text):
        text = text.lower()
        text = text.replace('ml', 'machine learning')
        text = text.replace('machine learning', 'machinelearning')
        text = "".join([char if char not in string.punctuation else ' ' for char in text])
        text = " ".join([self.porter.stem(char) for char in text.split(' ') if char not in self.stop_words])
        return text
    
    def vectorize_tfidf(self, df):
        vec_tfidf = TfidfVectorizer()
        X = vec_tfidf.fit_transform(df.description.values)
        X = pd.DataFrame(X.toarray(), columns=vec_tfidf.get_feature_names())
        return X
    
    def vectorize_cnt(self, df):
        vec_cnt = CountVectorizer()
        X = vec_cnt.fit_transform(df.description.values)
        X = pd.DataFrame(X.toarray(), columns=vec_cnt.get_feature_names())
        return X


class Optimize_by_Optuna:
    def __init__(self, data, features, target_colname, target_name_2=None, _objective=None):
        self.data = data
        self.features = features
        self.target = target_colname
        if not target_colname:
            self.target_2 = target_colname
        else:
            self.target_2 = target_name_2
        self._objective = _objective
        
    
    def make_score(self, y, preds):
        s_1=1 - metrics.accuracy_score(y, preds)
        s_2=list(self.model.best_score['valid_1'].values())[0]

        return (s_1+s_2)/2

    def objective(self, trial):
                        
        PARAMS = {#'boosting_type': 'gbdt', 'metric': 'rmse',
            'boosting_type': 'gbdt',
            
            #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
            
            'objective': 'tweedie','metric': 'tweedie',
            
            'n_estimators': 1400,
            'boost_from_average': False,'verbose': -1,'random_state':2020,
        

            'tweedie_variance_power': trial.suggest_uniform('tweedie_variance_power', 1.01, 1.8),


            'max_bin': trial.suggest_int('max_bin', 50, 300),
            'subsample': trial.suggest_uniform('subsample', 0.4, 0.9),
            'subsample_freq': trial.suggest_uniform('subsample_freq', 0.4, 0.9),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.03, 0.5),
            'num_leaves': trial.suggest_int('num_leaves', 4, 2*5),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.0001, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.0001, 10.0),
        }
        
        score = 0
        k = StratifiedKFold(n_splits=5)
        for trn, val in k.split(self.data, self.data[self.target_2]):
            train_df = self.data.iloc[trn,:]
            val_df = self.data.iloc[val,:]
            train_set= lgb.Dataset(train_df[self.features],  train_df[self.target])
            val_set = lgb.Dataset(val_df[self.features],  val_df[self.target])   
            
            self.model = lgb.train(
                train_set=train_set, valid_sets=[train_set, val_set], params=PARAMS, num_boost_round=3000, 
                early_stopping_rounds=200, verbose_eval=500
                )
                
            preds = self.model.predict(val_df[self.features])
            preds = np.round(preds)
            y = val_df[self.target]
            s = self.make_score(y, preds)
            score+=s/5
            
        return score


class Null_Importance:
    def __init__(self, train_X, train_y, PARAMS, y_2=None):
        self.train_X = train_X
        self.train_y = train_y
        self.y_2= y_2
        self.PARAMS = PARAMS

    def make_null_importance_df(self):
        null_importance=pd.DataFrame()
        null_importance['col'] = self.train_X.columns.tolist()
        try:
            for i in range(50):
                tmp_null_importance=[]
                
                _train_y = self.train_y.apply(lambda x: random.choice([0,1]))
                _train_y_2 = self.y_2.sample(frac=1).values
                
                print(f"""
                
                Train Null Importance   {i+1}
                
                """ )
                k = StratifiedKFold(n_splits=5)
                for trn, val in k.split(self.train_X, _train_y_2):
                    trn_X, val_X = self.train_X.iloc[trn,:], self.train_X.iloc[val,:]
                    trn_y, val_y = _train_y.iloc[trn].astype(int), _train_y.iloc[val].astype(int)
                    train_set = lgb.Dataset(trn_X, trn_y)
                    val_set = lgb.Dataset(val_X, val_y)

                    model = lgb.train(params=self.PARAMS,
                                      train_set=train_set, 
                                      valid_sets=[train_set, val_set],
                                    num_boost_round=3000, early_stopping_rounds=200, verbose_eval=500)
                    
                    preds = model.predict(val_X)
                    tmp_null_importance.append(model.feature_importance('gain'))
                null_importance[f'null_importance_{i+1}'] = np.mean(tmp_null_importance, axis=0)
            return null_importance
        except:
            return null_importance

    def calu_importance(self, importance_df, null_importance_df):
        importance_df = pd.merge(
            importance_df, null_importance_df, on='col'
            )
        null_importance_col = [col for col in importance_df.columns if 'null' in col]
        null_importance=pd.DataFrame()
        for idx, row in importance_df.iterrows():
            acc_v = 1e-10+row['true_importance']
            null_v = 1+np.percentile(row[null_importance_col], 75)
            null_importance[row['col']] = [np.log(acc_v/null_v)]
        null_importance = null_importance.T
        return null_importance

    def all_flow(self):
        k = StratifiedKFold(n_splits=5)
        score=[]
        importance=[]

        importance_df=pd.DataFrame()
        importance_df['col'] = self.train_X.columns
        print("""
        
        Train True Importance
        
        """ )
        for trn, val in k.split(self.train_X, self.y_2):
            trn_X, val_X = self.train_X.iloc[trn,:], self.train_X.iloc[val,:]
            trn_y, val_y = self.train_y.iloc[trn].astype(int), self.train_y.iloc[val].astype(int)
            train_set = lgb.Dataset(trn_X, trn_y)
            val_set = lgb.Dataset(val_X, val_y)
            
            PARAMS['random_state']+=1
            model = lgb.train(params=self.PARAMS, train_set=train_set, valid_sets=[train_set, val_set],
                            num_boost_round=3000, early_stopping_rounds=200, verbose_eval=500)
            preds = model.predict(val_X)
            importance.append(model.feature_importance('gain'))
        importance_df['true_importance'] = np.mean(importance, axis=0)
        
        print("""
        
        Train Null Importance
        
        """ )
        try:
            null_importance_df = self.make_null_importance_df()
        except:
            pass
        print("""
        
        Calulate null_null_importance
        
        """ )
        null_importance = self.calu_importance(importance_df, null_importance_df)
        null_importance = null_importance.reset_index()
        null_importance.columns = ['col', 'score']
        null_importance = null_importance.sort_values('score', ascending=False)
        return null_importance

In [14]:
def make_offdf(train_df, test_df, feature, params_list):
    k = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
    
    y_1 = train_df.jobflag.apply(lambda x: 1 if x==1 else 0)
    y_2 = train_df.jobflag.apply(lambda x: 1 if x==2 else 0)
    y_3 = train_df.jobflag.apply(lambda x: 1 if x==3 else 0)
    y_4 = train_df.jobflag.apply(lambda x: 1 if x==4 else 0)
    
    off_df = []
    for i in range(4):
        test_df[f'lgb_preds_{i+1}']=0
    
    for trn, val in k.split(train_df, train_df.jobflag):
        train_X, val_X = train_df.iloc[trn,:][feature], train_df.iloc[val,:][feature]
        tmp_off_df = train_df.iloc[val,:]
        c=1
        for y, param in zip([y_1, y_2, y_3, y_4], params_list):
            tmp_off_df[f'lgb_preds_{c}']=0
            for _ in range(5):
                train_y, val_y = y.iloc[trn], y.iloc[val]
                train_set= lgb.Dataset(train_X,  train_y)
                val_set = lgb.Dataset(val_X,  val_y)   

                model = lgb.train(
                    train_set=train_set, valid_sets=[train_set, val_set], params=param, num_boost_round=3000, 
                    early_stopping_rounds=200, verbose_eval=500
                )
                tmp_off_df[f'lgb_preds_{c}'] += model.predict(val_X)/5
                param['random_state']+=1
                
                test_df[f'lgb_preds_{c}'] += model.predict(test_df[feature])/5
                
            c+=1
        
        off_df.append(tmp_off_df)
    
    for i in range(4):
        test_df[f'lgb_preds_{i+1}']/=5
    
    off_df = pd.concat(off_df, axis=0)
    return off_df.reset_index(drop=True), test_df

In [15]:
feature = ['abil', 'abl', 'accept', 'access', 'accord', 'account', 'accur', 'accuraci', 'achiev', 'acquisit', 'across', 'act', 'action', 
           'activ', 'ad', 'addit', 'address', 'adher', 'administr', 'advanc', 'advis', 'advisor', 'agil', 'agre', 'ai', 'algorithm', 
           'align', 'analys', 'analysi', 'analyst', 'analyt', 'analyz', 'api', 'appli', 'applic', 'approach', 'appropri', 'approv', 
           'architect', 'architectur', 'area', 'assembl', 'assess', 'assign', 'assist', 'audienc', 'autom', 'avail', 'aw', 'back',
           'backend', 'base', 'basic', 'behavior', 'benefit', 'best', 'board', 'bug', 'build', 'busi', 'call', 'can', 'candid', 'capabl',
           'capac', 'case', 'caus', 'challeng', 'chang', 'clearli', 'client', 'clinic', 'close', 'cloud', 'cluster', 'coach', 'code', 
           'collabor', 'collect', 'commerci', 'commiss', 'commun', 'compani', 'complet', 'complex', 'complianc', 'compon', 'comput', 
           'concept', 'conduct', 'confer', 'configur', 'connect', 'consist', 'construct', 'consult', 'content', 'continu', 'contract',
           'contribut', 'control', 'coordin', 'core', 'corpor', 'correct', 'cost', 'creat', 'creation', 'creativ', 'critic', 'cross', 
           'cultur', 'current', 'custom', 'cycl', 'daili', 'dashboard', 'data', 'databas', 'dataset', 'date', 'deadlin', 'debug', 'decis',
           'deep', 'defect', 'defin', 'definit', 'deliv', 'deliver', 'deliveri', 'demand', 'demonstr', 'depart', 'depend', 'deploy', 'depth',
           'deriv', 'design', 'desir', 'detail', 'detect', 'determin', 'develop', 'devic', 'devop', 'differ', 'digit', 'direct', 'disciplin',
           'discoveri', 'discuss', 'distribut', 'divers', 'document', 'domain', 'draw', 'drive', 'duti', 'dynam', 'edg', 'educ', 'effect',
           'effici', 'effort', 'electron', 'email', 'embed', 'employe', 'enabl', 'end', 'engag', 'engin', 'enhanc', 'ensur', 'enterpris',
           'environ', 'equip', 'erp', 'escal', 'establish', 'estim', 'etc', 'evalu', 'event', 'excel', 'execut', 'exist', 'expand', 'experi',
           'expert', 'expertis', 'explain', 'explor', 'exploratori', 'extern', 'extract', 'face', 'facilit', 'failur', 'featur', 'feder', 
           'field', 'find', 'fix', 'flow', 'focu', 'follow', 'form', 'formul', 'framework', 'front', 'full', 'function', 'futur', 'gain',
           'gap', 'gather', 'gener', 'global', 'go', 'goal', 'good', 'govern', 'group', 'grow', 'growth', 'guid', 'guidanc', 'hand', 
           'hardwar', 'healthcar', 'help', 'high', 'highli', 'hoc', 'idea', 'identifi', 'impact', 'implement', 'improv', 'incid', 'includ',
           'increas', 'independ', 'individu', 'industri', 'influenc', 'inform', 'infrastructur', 'initi', 'innov', 'input', 'insight',
           'inspect', 'instal', 'integr', 'intellig', 'interact', 'interfac', 'intern', 'interpret', 'investig', 'issu', 'iter', 'java',
           'job', 'junior', 'keep', 'key', 'knowledg', 'languag', 'larg', 'latest', 'lead', 'leader', 'leadership', 'learn', 'level', 
           'leverag', 'librari', 'life', 'like', 'limit', 'linux', 'log', 'logic', 'machin', 'machinelearn', 'maintain', 'mainten', 
           'make', 'manag', 'manner', 'manufactur', 'map', 'market', 'materi', 'matter', 'may', 'measur', 'mechan', 'medic', 'meet',
           'member', 'mentor', 'met', 'method', 'methodolog', 'metric', 'microsoft', 'migrat', 'mission', 'mobil', 'model', 'moder',
           'modifi', 'modul', 'monitor', 'multi', 'multipl', 'must', 'necessari', 'need', 'net', 'network', 'new', 'next', 'non', 'novel',
           'object', 'obtain', 'ongo', 'open', 'oper', 'opportun', 'optim', 'order', 'organ', 'organiz', 'orient', 'outcom', 'outsid',
           'overal', 'overse', 'part', 'parti', 'particip', 'partner', 'partnership', 'pattern', 'payrol', 'peer', 'perform', 'person',
           'personnel', 'pipelin', 'plan', 'platform', 'point', 'polici', 'posit', 'post', 'potenti', 'practic', 'pre', 'predict', 'prepar', 
           'present', 'price', 'principl', 'prior', 'priorit', 'proactiv', 'problem', 'procedur', 'process', 'produc', 'product', 
           'profession', 'program', 'progress', 'project', 'promot', 'proof', 'propos', 'prospect', 'protocol', 'prototyp','provid', 
           'purpos', 'python', 'qa', 'qualifi', 'qualiti', 'queri', 'question', 'quickli', 'real', 'recommend', 'referr', 'refin', 'regard',
           'region', 'regul', 'regular', 'regulatori', 'relat', 'relationship', 'releas', 'relev', 'reliabl', 'report','repres', 'request',
           'requir', 'research', 'resid', 'resolut', 'resolv', 'resourc', 'respons', 'result', 'retail', 'review', 'rigor', 'risk', 'roadmap',
           'role', 'root', 'rule', 'run', 'safeti', 'sale', 'scalabl', 'scale', 'schedul', 'scienc', 'scientist', 'scope', 'script', 'scrum',
           'secur', 'segment', 'select', 'self', 'sell', 'senior', 'serv', 'server', 'servic', 'set', 'share', 'show', 'simul', 'site',
           'skill', 'small', 'softwar', 'solut', 'solv', 'sourc', 'specif', 'sql', 'stack', 'staff', 'stakehold', 'standard', 'state',
           'statist', 'statu', 'stay', 'store', 'stori', 'strateg', 'strategi', 'stream', 'strong', 'structur', 'studi', 'subject', 
           'success', 'suggest', 'supplier', 'support', 'system', 'take', 'target', 'task', 'team', 'technic', 'techniqu', 'technolog', 
           'term','test', 'think', 'thought', 'throughout', 'time', 'timelin', 'tool', 'top', 'track', 'train', 'transform', 'translat',
           'travel', 'trend', 'troubleshoot', 'tune', 'understand', 'unit', 'updat', 'upgrad', 'use', 'user', 'util', 'valid',
           'valu', 'variou', 'vehicl', 'vendor', 'verif', 'verifi', 'version', 'via', 'vision', 'visual', 'way',
           'web', 'well', 'wide', 'will', 'window', 'within', 'work', 'workflow','write']

PARAMS_1={
    'boosting_type': 'gbdt',
    
    #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
    
    'objective': 'tweedie','metric': 'tweedie',
    
    'n_estimators': 1400,
    'boost_from_average': False,'verbose': -1,'random_state':2020,
    
   'tweedie_variance_power': 1.349969119190657, 'max_bin': 212, 'subsample': 0.5774043241504451, 'subsample_freq': 0.7045972939301558, 
    'learning_rate': 0.16528226095247364, 'num_leaves': 4, 'feature_fraction': 0.9964784224971625,
    'bagging_freq': 6, 'min_child_samples': 23, 'lambda_l1': 0.016924825494747078, 'lambda_l2': 0.0008031532180312293
}


PARAMS_2={
    'boosting_type': 'gbdt',
    
    #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
    
    'objective': 'tweedie','metric': 'tweedie',
    
    'n_estimators': 1400,
    'boost_from_average': False,'verbose': -1,'random_state':2020,
    
    'tweedie_variance_power': 1.3014991003823067, 'max_bin': 134, 'subsample': 0.8990859498726816, 'subsample_freq': 0.5274951186330312,
    'learning_rate': 0.3937162652059595, 'num_leaves': 5, 'feature_fraction': 0.8861294810479933, 'bagging_freq': 5,
    'min_child_samples': 28, 'lambda_l1': 6.037171725930821, 'lambda_l2': 0.0025254105473444784
}

PARAMS_3={
    'boosting_type': 'gbdt',
    
    #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
    
    #'objective': 'tweedie','metric': 'tweedie',
     
    'objective': 'xentropy','metric': 'xentropy',
    
    'n_estimators': 1400,
    'boost_from_average': False,'verbose': -1,'random_state':2020,
    
    'max_bin': 50, 'subsample': 0.8509082362331666, 'subsample_freq': 0.6958806976511948, 'learning_rate': 0.09406169926162017,
    'num_leaves': 7, 'feature_fraction': 0.7562554580497556, 'bagging_freq': 4, 'min_child_samples': 5, 'lambda_l1': 0.00021420978217365439,
    'lambda_l2': 0.011867471326820044
}

PARAMS_4={
    'boosting_type': 'gbdt',
    
    #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
    
    'objective': 'tweedie','metric': 'tweedie',
    
    'n_estimators': 1400,
    'boost_from_average': False,'verbose': -1,'random_state':2020,
    
    'tweedie_variance_power': 1.3572492826220748, 'max_bin': 169, 'subsample': 0.6874225607452877, 'subsample_freq': 0.5369168449326642,
    'learning_rate': 0.0353671206084155, 'num_leaves': 8, 'feature_fraction': 0.9508830019260512, 
    'bagging_freq': 2, 'min_child_samples': 63, 'lambda_l1': 8.281467382972142, 'lambda_l2': 0.1428656656583413
}

param_list = [PARAMS_1, PARAMS_2, PARAMS_3, PARAMS_4]

In [16]:
train_df = pd.read_csv('../train.csv')
test_df = pd.read_csv('../test.csv')
for i in range(4):    
    train_df = pd.merge(train_df, pd.read_csv(f'../train_df_off{i+1}.csv'), on='id')
    test_df = pd.merge(test_df, pd.read_csv(f'../test_df_off{i+1}.csv').drop(columns=['description', 'jobflag']), on='id')
df = pd.concat([train_df, test_df],axis=0,ignore_index=True)
preprocessing = Preprocessing()
df.description = df.description.apply(lambda x: preprocessing.change_text(x))
cols = ['jobflag','id','bert_pred_1_1', 'bert_pred_2_1', 'bert_pred_3_1',
       'bert_pred_4_1', 'bert_pred_5_1', 'bert_pred_1_2', 'bert_pred_2_2',
       'bert_pred_3_2', 'bert_pred_4_2', 'bert_pred_5_2', 'bert_pred_1_3',
       'bert_pred_2_3', 'bert_pred_3_3', 'bert_pred_4_3', 'bert_pred_5_3',
       'bert_pred_1_4', 'bert_pred_2_4', 'bert_pred_3_4', 'bert_pred_4_4',
       'bert_pred_5_4']

In [None]:
X = preprocessing.vectorize_tfidf(df)
X = pd.concat([df[cols], X], axis=1)
train_df = X[X.jobflag.notnull()].reset_index(drop=True)
test_df = X[X.jobflag.isnull()].drop(columns=['jobflag']).reset_index(drop=True)

off_df_tfidf, test_df2_tfidf = make_offdf(train_df, test_df, feature, param_list)

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[54]	training's tweedie: 1.40681	valid_1's tweedie: 1.43566
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[43]	training's tweedie: 1.41926	valid_1's tweedie: 1.45214
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[81]	training's tweedie: 1.38798	valid_1's tweedie: 1.4528
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[65]	training's tweedie: 1.40081	valid_1's tweedie: 1.43151
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[89]	training's tweedie: 1.38559	valid_1's tweedie: 1.45117
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[26]	training's tweedie: 0.927475	valid_1's tweedie: 0.985683
Training until validation scores don't improve for 200 ro

[49]	training's tweedie: 0.910225	valid_1's tweedie: 0.963441
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[78]	training's tweedie: 0.894676	valid_1's tweedie: 0.96469
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[55]	training's tweedie: 0.907872	valid_1's tweedie: 0.957699
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[57]	training's tweedie: 0.903796	valid_1's tweedie: 0.95649
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[171]	training's cross_entropy: 0.363527	valid_1's cross_entropy: 0.487057
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[205]	training's cross_entropy: 0.341988	valid_1's cross_entropy: 0.485309
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[198]	training's cross

In [46]:
pd.concat([
    pd.get_dummies(off_df.jobflag)[[1,2,3,4]],
    off_df[[ 'lgb_preds_1', 'lgb_preds_2', 'lgb_preds_3', 'lgb_preds_4']]
], axis=1).corr().loc[[1,2,3,4], [ 'lgb_preds_1', 'lgb_preds_2', 'lgb_preds_3', 'lgb_preds_4']]

Unnamed: 0,lgb_preds_1,lgb_preds_2,lgb_preds_3,lgb_preds_4
1,0.52536,0.176473,-0.443558,-0.160037
2,0.125056,0.366627,-0.207003,-0.16921
3,-0.404214,-0.241977,0.586404,-0.096219
4,-0.135513,-0.173602,-0.109832,0.419984


In [47]:
off_df.head()

Unnamed: 0,jobflag,id,bert_pred_1_1,bert_pred_2_1,bert_pred_3_1,bert_pred_4_1,bert_pred_5_1,bert_pred_1_2,bert_pred_2_2,bert_pred_3_2,...,yellow,yet,yield,younger,zeiss,zookeep,lgb_preds_1,lgb_preds_2,lgb_preds_3,lgb_preds_4
0,2.0,0,0.524498,0.257973,0.144118,0.133779,0.07674,0.484906,0.294941,0.220875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.124375,0.046654,0.776604,0.049806
1,3.0,9,0.578211,0.409709,0.23759,0.349191,0.117446,0.445859,0.238172,0.217901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.119715,0.038722,0.906843,0.025833
2,3.0,17,0.394083,0.252006,0.214029,0.16986,0.175528,0.664439,0.778905,0.830853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.118658,0.069503,0.623205,0.242803
3,3.0,18,0.450969,0.387301,0.247143,0.448752,0.165567,0.498336,0.543906,0.383343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.145846,0.09025,0.390933,0.200563
4,3.0,20,0.426481,0.272545,0.192479,0.184106,0.098343,0.473353,0.459011,0.220175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.098323,0.045169,0.565293,0.142971


In [48]:
test_df2.head()

Unnamed: 0,id,bert_pred_1_1,bert_pred_2_1,bert_pred_3_1,bert_pred_4_1,bert_pred_5_1,bert_pred_1_2,bert_pred_2_2,bert_pred_3_2,bert_pred_4_2,...,zeiss,zookeep,preds_1,preds_2,preds_3,preds_4,lgb_preds_1,lgb_preds_2,lgb_preds_3,lgb_preds_4
0,2931,0.304999,0.26189,0.256754,0.299635,0.292701,0.487088,0.482487,0.49964,0.490313,...,0.0,0.0,0.286953,0.300346,1.25597,0.246627,0.14616,0.173406,0.528802,0.161975
1,2932,0.157685,0.186549,0.17939,0.162669,0.161642,0.204091,0.194912,0.18307,0.193481,...,0.0,0.0,0.160164,0.229296,1.148036,0.251646,0.063599,0.112505,0.616123,0.151675
2,2933,0.109045,0.131893,0.112129,0.123854,0.127725,0.169313,0.18029,0.184457,0.18645,...,0.0,0.0,0.301584,0.134988,0.992523,0.2775,0.152356,0.077784,0.50615,0.182886
3,2934,0.914908,0.937084,0.923681,0.940199,0.922476,0.194414,0.175685,0.217253,0.189669,...,0.0,0.0,1.596967,0.167668,0.189846,0.133128,0.766366,0.098118,0.095914,0.079734
4,2935,0.129471,0.158606,0.149187,0.163271,0.166768,0.138063,0.127007,0.146903,0.132465,...,0.0,0.0,0.219979,0.06001,1.269596,0.909977,0.094777,0.034124,0.636871,0.592658


In [None]:
k = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
off_df_2=[]
test_preds = np.zeros(shape=(len(test_df2),4))
bert_pred_cols =['bert_pred_1_1', 'bert_pred_2_1', 'bert_pred_3_1', 'bert_pred_4_1', 'bert_pred_5_1', 'bert_pred_1_2', 'bert_pred_2_2',
       'bert_pred_3_2', 'bert_pred_4_2', 'bert_pred_5_2', 'bert_pred_1_3', 'bert_pred_2_3', 'bert_pred_3_3', 'bert_pred_4_3', 
       'bert_pred_5_3', 'bert_pred_1_4', 'bert_pred_2_4', 'bert_pred_3_4', 'bert_pred_4_4', 'bert_pred_5_4']
for trn, val in k.split(train_df, train_df.jobflag):
    trn_df = off_df.iloc[trn,:]
    val_df  =  off_df.iloc[val,:]
    
    min_value = trn_df.jobflag.value_counts().min()
    
    preds = np.zeros(shape=(len(val_df),4))
    
    for i in tqdm(range(80)):
        tmp_trn_df = pd.concat(
        [trn_df[trn_df.jobflag==1].sample(n=min_value, random_state=i),
         trn_df[trn_df.jobflag==2].sample(n=min_value, random_state=i),
         trn_df[trn_df.jobflag==3].sample(n=min_value, random_state=i),
         trn_df[trn_df.jobflag==4].sample(n=min_value, random_state=i)], axis=0).reset_index(drop=True)
        tmp_trn_X = tmp_trn_df[bert_pred_cols+[ 'lgb_preds_1', 'lgb_preds_2', 'lgb_preds_3', 'lgb_preds_4']]
        tmp_trn_y = tmp_trn_df['jobflag']
        
        
        for penalty  in [ 'l2']:
            for m in range(5):
                logit = LogisticRegression(penalty=penalty, random_state=m)
                logit.fit(tmp_trn_X, tmp_trn_y)

                    #ridge_cls = RidgeClassifier()
                    #ridge_cls.fit(tmp_trn_X, tmp_trn_y)

                    #kncls = KNeighborsClassifier(n_neighbors=4)
                    #kncls.fit(tmp_trn_X, tmp_trn_y)
                preds += logit.predict_proba(val_df[bert_pred_cols+[ 'lgb_preds_1', 'lgb_preds_2', 'lgb_preds_3', 'lgb_preds_4']])
                test_preds += logit.predict_proba(test_df2[bert_pred_cols+[ 'lgb_preds_1', 'lgb_preds_2', 'lgb_preds_3', 'lgb_preds_4']])
                
    val_df[f'preds'] = np.argmax(preds, axis=1)+1
    off_df_2.append(val_df)

test_df2[f'preds'] = np.argmax(test_preds, axis=1)+1
off_df_2 = pd.concat(off_df_2, axis=0).reset_index(drop=True)

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

In [None]:
print(metrics.f1_score(off_df_2.jobflag, off_df_2.preds, average='macro'))
plt.figure(figsize=(10,10))
cnfn_matrix = pd.DataFrame(metrics.confusion_matrix(off_df_2.jobflag, off_df_2.preds))
#cnfn_matrix.index = 
sns.heatmap(cnfn_matrix, annot=True, cmap='coolwarm')
plt.show()
cnfn_matrix

In [None]:
test_df2.preds.value_counts()

In [None]:
sub = pd.read_csv('../submit_sample.csv', header=None)

In [None]:
sub[1] = test_df2.preds

In [None]:
sub

In [53]:
sub.to_csv('/Users/kanoumotoharu/Downloads/sub_14.csv',  index=False, header=False)

In [71]:
test_preds/(80*25)

array([[0.10127369, 0.40086803, 0.27877044, 0.21908784],
       [0.08530234, 0.28073404, 0.39763328, 0.23633033],
       [0.17745063, 0.25154499, 0.33936384, 0.23164054],
       ...,
       [0.25766106, 0.2301475 , 0.22386374, 0.2883277 ],
       [0.06947772, 0.0708203 , 0.41543539, 0.44426659],
       [0.07184275, 0.12031271, 0.59142699, 0.21641754]])