## Customer age group prediction

In [None]:
%%writefile word2vec_1.py

### Imports
import pandas as pd
import numpy as np
import os


# Load the raw train/test tables; both files are read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')

# Column whose values serve as the "words" of each customer's document.
p_level = 'CLAC3_NM'

def oversample(x, n, seed=0):
    """Return a token list made of `n` shuffled copies of the distinct values of `x`.

    Parameters
    ----------
    x : iterable of values belonging to one group.
    n : number of shuffled copies to concatenate; ``0`` returns ``list(x)``
        untouched (duplicates and original order kept).
    seed : value used to reseed NumPy's global RNG on every call, so the
        same input always produces the same output.

    Returns
    -------
    list with ``n * len(np.unique(x))`` elements (empty when ``n`` is negative).
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    np.random.seed(seed)
    # Draw every shuffle first (same RNG call order as before) ...
    shuffles = [np.random.choice(uw, len(uw), replace=False) for _ in range(n)]
    # ... then join them in one pass instead of re-copying a growing array on
    # each iteration.  The leading empty float array keeps the dtype promotion
    # of the former `np.append` chain that started from `np.array([])`.
    return list(np.concatenate([np.array([])] + shuffles))

# One "document" per customer: the customer's p_level values are handed to
# oversample() with n=30, i.e. 30 shuffled copies of the distinct values.
train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 30))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 30))


# Word2Vec settings: num_features is passed as `size` (and is also read by
# EmbeddingVectorizer), min_word_count as `min_count`, context as `window`.
num_features = 3
min_word_count = 1 
context = 3 


from gensim.models import word2vec

# The model is trained on the train corpus only, with a fixed seed and a
# single worker.
# NOTE(review): `size=` and `init_sims` look like gensim 3.x spellings —
# confirm the pinned gensim version before upgrading.
w2v = word2vec.Word2Vec(train_corpus, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)

# Presumably normalises the stored vectors in place — TODO confirm in gensim docs.
w2v.init_sims(replace=True)


class EmbeddingVectorizer(object):
    """Pool per-token embeddings into one fixed-length vector per document.

    Each document (an iterable of tokens) becomes the concatenation of the
    element-wise max, min, mean and standard deviation of the vectors of its
    known tokens.  A document without any known token is pooled from a single
    all-zero vector of length ``dim`` and therefore yields all zeros.
    """

    def __init__(self, word2vec):
        # `word2vec` must support `token in obj` and `obj[token]`; the script
        # passes `w2v.wv`.  Inside this method the parameter shadows the
        # gensim module of the same name.
        self.word2vec = word2vec
        # Length of the fallback zero vector, read from the module-level
        # `num_features` setting.
        self.dim = num_features

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def _pool(self, words):
        """Return the max/min/mean/std pooling of one document's vectors."""
        # Look the embeddings up once and reuse them for all four statistics
        # (previously the same lookup ran four times per document).
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        stacked = np.asarray(vectors or [np.zeros(self.dim)])
        return np.hstack([
            np.max(stacked, axis=0),
            np.min(stacked, axis=0),
            np.mean(stacked, axis=0),
            np.std(stacked, axis=0),
        ])

    def transform(self, X):
        """Return an array with one pooled row per document in `X`."""
        return np.array([self._pool(words) for words in X])

# Pool every customer's token embeddings into one feature row.
train_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus)
)
test_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).transform(test_corpus)
)

# Rename the positional columns 0, 1, ... to v001, v002, ...
for frame in (train_features, test_features):
    frame.columns = ['v'+f'{c+1:03d}' for c in frame.columns]

# Put the sorted customer ids in front of the features and save one CSV per split.
# NOTE(review): relies on the groupby above yielding customers in sorted order — confirm.
for frame, raw, out_path in (
    (train_features, train, 'X_train_w2v_CLAC3_NM.csv'),
    (test_features, test, 'X_test_w2v_CLAC3_NM.csv'),
):
    ids = pd.DataFrame({'CLNT_ID': np.sort(raw['CLNT_ID'].unique())})
    pd.concat([ids, frame], axis=1).to_csv(out_path, index=False)

In [None]:
import os
import subprocess
import sys

# Run the generated script in a fresh interpreter so PYTHONHASHSEED can be
# pinned for that process.  sys.executable keeps the run on the interpreter
# this notebook is using, and check=True raises if the script fails instead
# of silently continuing without the CSV files later cells read.
subprocess.run([sys.executable, 'word2vec_1.py'], env={**os.environ, 'PYTHONHASHSEED': '123'}, check=True)

### CLAC2_NM

In [None]:
%%writefile word2vec_2.py

import pandas as pd
import numpy as np
import os


# Load the raw train/test tables; both files are read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')

# Column whose values serve as the "words" of each customer's document.
p_level = 'CLAC2_NM'


def oversample(x, n, seed=0):
    """Return a token list made of `n` shuffled copies of the distinct values of `x`.

    Parameters
    ----------
    x : iterable of values belonging to one group.
    n : number of shuffled copies to concatenate; ``0`` returns ``list(x)``
        untouched (duplicates and original order kept).
    seed : value used to reseed NumPy's global RNG on every call, so the
        same input always produces the same output.

    Returns
    -------
    list with ``n * len(np.unique(x))`` elements (empty when ``n`` is negative).
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    np.random.seed(seed)
    # Draw every shuffle first (same RNG call order as before) ...
    shuffles = [np.random.choice(uw, len(uw), replace=False) for _ in range(n)]
    # ... then join them in one pass instead of re-copying a growing array on
    # each iteration.  The leading empty float array keeps the dtype promotion
    # of the former `np.append` chain that started from `np.array([])`.
    return list(np.concatenate([np.array([])] + shuffles))

# One "document" per customer: the customer's p_level values are handed to
# oversample() with n=30, i.e. 30 shuffled copies of the distinct values.
train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 30))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 30))

# Word2Vec settings: num_features is passed as `size` (and is also read by
# EmbeddingVectorizer), min_word_count as `min_count`, context as `window`.
num_features = 30 
min_word_count = 1 
context = 3 

from gensim.models import word2vec

# The model is trained on the train corpus only, with a fixed seed and a
# single worker.
# NOTE(review): `size=` and `init_sims` look like gensim 3.x spellings —
# confirm the pinned gensim version before upgrading.
w2v = word2vec.Word2Vec(train_corpus, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)

# Presumably normalises the stored vectors in place — TODO confirm in gensim docs.
w2v.init_sims(replace=True)


class EmbeddingVectorizer(object):
    """Pool per-token embeddings into one fixed-length vector per document.

    Each document (an iterable of tokens) becomes the concatenation of the
    element-wise max, min, mean and standard deviation of the vectors of its
    known tokens.  A document without any known token is pooled from a single
    all-zero vector of length ``dim`` and therefore yields all zeros.
    """

    def __init__(self, word2vec):
        # `word2vec` must support `token in obj` and `obj[token]`; the script
        # passes `w2v.wv`.  Inside this method the parameter shadows the
        # gensim module of the same name.
        self.word2vec = word2vec
        # Length of the fallback zero vector, read from the module-level
        # `num_features` setting.
        self.dim = num_features

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def _pool(self, words):
        """Return the max/min/mean/std pooling of one document's vectors."""
        # Look the embeddings up once and reuse them for all four statistics
        # (previously the same lookup ran four times per document).
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        stacked = np.asarray(vectors or [np.zeros(self.dim)])
        return np.hstack([
            np.max(stacked, axis=0),
            np.min(stacked, axis=0),
            np.mean(stacked, axis=0),
            np.std(stacked, axis=0),
        ])

    def transform(self, X):
        """Return an array with one pooled row per document in `X`."""
        return np.array([self._pool(words) for words in X])

# Pool every customer's token embeddings into one feature row.
train_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus)
)
test_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).transform(test_corpus)
)

# Rename the positional columns 0, 1, ... to v001, v002, ...
for frame in (train_features, test_features):
    frame.columns = ['v'+f'{c+1:03d}' for c in frame.columns]

# Put the sorted customer ids in front of the features and save one CSV per split.
# NOTE(review): relies on the groupby above yielding customers in sorted order — confirm.
for frame, raw, out_path in (
    (train_features, train, 'X_train_w2v_CLAC2_NM.csv'),
    (test_features, test, 'X_test_w2v_CLAC2_NM.csv'),
):
    ids = pd.DataFrame({'CLNT_ID': np.sort(raw['CLNT_ID'].unique())})
    pd.concat([ids, frame], axis=1).to_csv(out_path, index=False)

In [None]:
import sys

# Run the generated script in a fresh interpreter with a pinned PYTHONHASHSEED.
# sys.executable keeps the run on this notebook's interpreter; check=True
# raises if the script fails instead of silently continuing.
subprocess.run([sys.executable, 'word2vec_2.py'], env={**os.environ, 'PYTHONHASHSEED': '123'}, check=True)

### PD_BRA_NM

In [None]:
%%writefile word2vec_3.py

### Imports
import pandas as pd
import numpy as np
import os

# Load the raw train/test tables; both files are read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')


# Column whose values serve as the "words" of each customer's document.
p_level = 'PD_BRA_NM'

def oversample(x, n, seed=0):
    """Return a token list made of `n` shuffled copies of the distinct values of `x`.

    Parameters
    ----------
    x : iterable of values belonging to one group.
    n : number of shuffled copies to concatenate; ``0`` returns ``list(x)``
        untouched (duplicates and original order kept).
    seed : value used to reseed NumPy's global RNG on every call, so the
        same input always produces the same output.

    Returns
    -------
    list with ``n * len(np.unique(x))`` elements (empty when ``n`` is negative).
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    np.random.seed(seed)
    # Draw every shuffle first (same RNG call order as before) ...
    shuffles = [np.random.choice(uw, len(uw), replace=False) for _ in range(n)]
    # ... then join them in one pass instead of re-copying a growing array on
    # each iteration.  The leading empty float array keeps the dtype promotion
    # of the former `np.append` chain that started from `np.array([])`.
    return list(np.concatenate([np.array([])] + shuffles))

# One "document" per customer: the customer's p_level values are handed to
# oversample() with n=30, i.e. 30 shuffled copies of the distinct values.
train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 30))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 30))

# Word2Vec settings: num_features is passed as `size` (and is also read by
# EmbeddingVectorizer), min_word_count as `min_count`, context as `window`.
num_features = 30 
min_word_count = 1 
context = 3


from gensim.models import word2vec

# The model is trained on the train corpus only, with a fixed seed and a
# single worker.
# NOTE(review): `size=` and `init_sims` look like gensim 3.x spellings —
# confirm the pinned gensim version before upgrading.
w2v = word2vec.Word2Vec(train_corpus, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)
# Presumably normalises the stored vectors in place — TODO confirm in gensim docs.
w2v.init_sims(replace=True)

class EmbeddingVectorizer(object):
    """Pool per-token embeddings into one fixed-length vector per document.

    Each document (an iterable of tokens) becomes the concatenation of the
    element-wise max, min, mean and standard deviation of the vectors of its
    known tokens.  A document without any known token is pooled from a single
    all-zero vector of length ``dim`` and therefore yields all zeros.
    """

    def __init__(self, word2vec):
        # `word2vec` must support `token in obj` and `obj[token]`; the script
        # passes `w2v.wv`.  Inside this method the parameter shadows the
        # gensim module of the same name.
        self.word2vec = word2vec
        # Length of the fallback zero vector, read from the module-level
        # `num_features` setting.
        self.dim = num_features

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def _pool(self, words):
        """Return the max/min/mean/std pooling of one document's vectors."""
        # Look the embeddings up once and reuse them for all four statistics
        # (previously the same lookup ran four times per document).
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        stacked = np.asarray(vectors or [np.zeros(self.dim)])
        return np.hstack([
            np.max(stacked, axis=0),
            np.min(stacked, axis=0),
            np.mean(stacked, axis=0),
            np.std(stacked, axis=0),
        ])

    def transform(self, X):
        """Return an array with one pooled row per document in `X`."""
        return np.array([self._pool(words) for words in X])

# Pool every customer's token embeddings into one feature row.
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

# Rename the positional columns 0, 1, ... to v001, v002, ...
train_features.columns = ['v'+f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v'+f'{c+1:03d}' for c in test_features.columns]

# Put the sorted customer ids in front of the features and save one CSV per split.
# NOTE(review): relies on the groupby above yielding customers in sorted order — confirm.
# The test file name previously lacked its opening quote, which made the whole
# script a SyntaxError.
pd.concat([pd.DataFrame({'CLNT_ID': np.sort(train['CLNT_ID'].unique())}), train_features], axis=1).to_csv('X_train_w2v_PD_BRA_NM.csv', index=False)
pd.concat([pd.DataFrame({'CLNT_ID': np.sort(test['CLNT_ID'].unique())}), test_features], axis=1).to_csv('X_test_w2v_PD_BRA_NM.csv', index=False)

In [None]:
import sys

# Run the generated script in a fresh interpreter with a pinned PYTHONHASHSEED.
# sys.executable keeps the run on this notebook's interpreter; check=True
# raises if the script fails instead of silently continuing.
subprocess.run([sys.executable, 'word2vec_3.py'], env={**os.environ, 'PYTHONHASHSEED': '123'}, check=True)

### KWD_NM

In [None]:
%%writefile word2vec_4.py

import pandas as pd
import numpy as np
import os

# Load the raw train/test tables; both files are read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')

# Column whose values serve as the "words" of each customer's document.
p_level = 'KWD_NM'

def oversample(x, n, seed=0):
    """Return a token list made of `n` shuffled copies of the distinct values of `x`.

    Parameters
    ----------
    x : iterable of values belonging to one group.
    n : number of shuffled copies to concatenate; ``0`` returns ``list(x)``
        untouched (duplicates and original order kept).
    seed : value used to reseed NumPy's global RNG on every call, so the
        same input always produces the same output.

    Returns
    -------
    list with ``n * len(np.unique(x))`` elements (empty when ``n`` is negative).
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    np.random.seed(seed)
    # Draw every shuffle first (same RNG call order as before) ...
    shuffles = [np.random.choice(uw, len(uw), replace=False) for _ in range(n)]
    # ... then join them in one pass instead of re-copying a growing array on
    # each iteration.  The leading empty float array keeps the dtype promotion
    # of the former `np.append` chain that started from `np.array([])`.
    return list(np.concatenate([np.array([])] + shuffles))

# One "document" per customer: the customer's p_level values are handed to
# oversample() with n=30, i.e. 30 shuffled copies of the distinct values.
train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 30))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 30))

# Word2Vec settings: num_features is passed as `size` (and is also read by
# EmbeddingVectorizer), min_word_count as `min_count`, context as `window`.
num_features = 30 
min_word_count = 1 
context = 3 

from gensim.models import word2vec

# The model is trained on the train corpus only, with a fixed seed and a
# single worker.
# NOTE(review): `size=` and `init_sims` look like gensim 3.x spellings —
# confirm the pinned gensim version before upgrading.
w2v = word2vec.Word2Vec(train_corpus, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)

# Presumably normalises the stored vectors in place — TODO confirm in gensim docs.
w2v.init_sims(replace=True)


class EmbeddingVectorizer(object):
    """Pool per-token embeddings into one fixed-length vector per document.

    Each document (an iterable of tokens) becomes the concatenation of the
    element-wise max, min, mean and standard deviation of the vectors of its
    known tokens.  A document without any known token is pooled from a single
    all-zero vector of length ``dim`` and therefore yields all zeros.
    """

    def __init__(self, word2vec):
        # `word2vec` must support `token in obj` and `obj[token]`; the script
        # passes `w2v.wv`.  Inside this method the parameter shadows the
        # gensim module of the same name.
        self.word2vec = word2vec
        # Length of the fallback zero vector, read from the module-level
        # `num_features` setting.
        self.dim = num_features

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def _pool(self, words):
        """Return the max/min/mean/std pooling of one document's vectors."""
        # Look the embeddings up once and reuse them for all four statistics
        # (previously the same lookup ran four times per document).
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        stacked = np.asarray(vectors or [np.zeros(self.dim)])
        return np.hstack([
            np.max(stacked, axis=0),
            np.min(stacked, axis=0),
            np.mean(stacked, axis=0),
            np.std(stacked, axis=0),
        ])

    def transform(self, X):
        """Return an array with one pooled row per document in `X`."""
        return np.array([self._pool(words) for words in X])

# Pool every customer's token embeddings into one feature row.
train_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus)
)
test_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).transform(test_corpus)
)

# Rename the positional columns 0, 1, ... to v001, v002, ...
for frame in (train_features, test_features):
    frame.columns = ['v'+f'{c+1:03d}' for c in frame.columns]

# Put the sorted customer ids in front of the features and save one CSV per split.
# NOTE(review): relies on the groupby above yielding customers in sorted order — confirm.
for frame, raw, out_path in (
    (train_features, train, 'X_train_w2v_KWD_NM.csv'),
    (test_features, test, 'X_test_w2v_KWD_NM.csv'),
):
    ids = pd.DataFrame({'CLNT_ID': np.sort(raw['CLNT_ID'].unique())})
    pd.concat([ids, frame], axis=1).to_csv(out_path, index=False)

In [None]:
import sys

# Run the generated script in a fresh interpreter with a pinned PYTHONHASHSEED.
# sys.executable keeps the run on this notebook's interpreter; check=True
# raises if the script fails instead of silently continuing.
subprocess.run([sys.executable, 'word2vec_4.py'], env={**os.environ, 'PYTHONHASHSEED': '123'}, check=True)

### PD_ADD_NM

In [None]:
%%writefile word2vec_5.py

import pandas as pd
import numpy as np
import os

# Load the raw train/test tables; both files are read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')

# Column whose values serve as the "words" of each customer's document.
p_level = 'PD_ADD_NM'

def oversample(x, n, seed=0):
    """Return a token list made of `n` shuffled copies of the distinct values of `x`.

    Parameters
    ----------
    x : iterable of values belonging to one group.
    n : number of shuffled copies to concatenate; ``0`` returns ``list(x)``
        untouched (duplicates and original order kept).
    seed : value used to reseed NumPy's global RNG on every call, so the
        same input always produces the same output.

    Returns
    -------
    list with ``n * len(np.unique(x))`` elements (empty when ``n`` is negative).
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    np.random.seed(seed)
    # Draw every shuffle first (same RNG call order as before) ...
    shuffles = [np.random.choice(uw, len(uw), replace=False) for _ in range(n)]
    # ... then join them in one pass instead of re-copying a growing array on
    # each iteration.  The leading empty float array keeps the dtype promotion
    # of the former `np.append` chain that started from `np.array([])`.
    return list(np.concatenate([np.array([])] + shuffles))

# One "document" per customer: the customer's p_level values are handed to
# oversample() with n=30, i.e. 30 shuffled copies of the distinct values.
train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 30))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 30))

# Word2Vec settings: num_features is passed as `size` (and is also read by
# EmbeddingVectorizer), min_word_count as `min_count`, context as `window`.
num_features = 30 
min_word_count = 1 
context = 3 

from gensim.models import word2vec

# The model is trained on the train corpus only, with a fixed seed and a
# single worker.
# NOTE(review): `size=` and `init_sims` look like gensim 3.x spellings —
# confirm the pinned gensim version before upgrading.
w2v = word2vec.Word2Vec(train_corpus, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=0, workers=1)

# Presumably normalises the stored vectors in place — TODO confirm in gensim docs.
w2v.init_sims(replace=True)


class EmbeddingVectorizer(object):
    """Pool per-token embeddings into one fixed-length vector per document.

    Each document (an iterable of tokens) becomes the concatenation of the
    element-wise max, min, mean and standard deviation of the vectors of its
    known tokens.  A document without any known token is pooled from a single
    all-zero vector of length ``dim`` and therefore yields all zeros.
    """

    def __init__(self, word2vec):
        # `word2vec` must support `token in obj` and `obj[token]`; the script
        # passes `w2v.wv`.  Inside this method the parameter shadows the
        # gensim module of the same name.
        self.word2vec = word2vec
        # Length of the fallback zero vector, read from the module-level
        # `num_features` setting.
        self.dim = num_features

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def _pool(self, words):
        """Return the max/min/mean/std pooling of one document's vectors."""
        # Look the embeddings up once and reuse them for all four statistics
        # (previously the same lookup ran four times per document).
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        stacked = np.asarray(vectors or [np.zeros(self.dim)])
        return np.hstack([
            np.max(stacked, axis=0),
            np.min(stacked, axis=0),
            np.mean(stacked, axis=0),
            np.std(stacked, axis=0),
        ])

    def transform(self, X):
        """Return an array with one pooled row per document in `X`."""
        return np.array([self._pool(words) for words in X])

# Pool every customer's token embeddings into one feature row.
train_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus)
)
test_features = pd.DataFrame(
    EmbeddingVectorizer(w2v.wv).transform(test_corpus)
)

# Rename the positional columns 0, 1, ... to v001, v002, ...
for frame in (train_features, test_features):
    frame.columns = ['v'+f'{c+1:03d}' for c in frame.columns]

# Put the sorted customer ids in front of the features and save one CSV per split.
# NOTE(review): relies on the groupby above yielding customers in sorted order — confirm.
for frame, raw, out_path in (
    (train_features, train, 'X_train_w2v_PD_ADD_NM.csv'),
    (test_features, test, 'X_test_w2v_PD_ADD_NM.csv'),
):
    ids = pd.DataFrame({'CLNT_ID': np.sort(raw['CLNT_ID'].unique())})
    pd.concat([ids, frame], axis=1).to_csv(out_path, index=False)

In [None]:
import sys

# Run the generated script in a fresh interpreter with a pinned PYTHONHASHSEED.
# sys.executable keeps the run on this notebook's interpreter; check=True
# raises if the script fails instead of silently continuing.
subprocess.run([sys.executable, 'word2vec_5.py'], env={**os.environ, 'PYTHONHASHSEED': '123'}, check=True)

### Modeling

In [None]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.base import ClassifierMixin
from gensim.models import word2vec


# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from bayes_opt import BayesianOptimization

### Data Load & Merge

In [None]:
# Raw train/test tables, read as UTF-8.
train = pd.read_csv('../dataset/L.POINT_train.csv', encoding='UTF-8')
test = pd.read_csv('../dataset/L.POINT_test.csv', encoding='UTF-8')

# Target labels: the LABEL column of y_train.csv.
# NOTE(review): later cells pair these rows positionally with the feature rows — confirm the file's row order.
y_target =  pd.read_csv('../dataset/y_train.csv').LABEL

In [None]:
# `word2vec_path` is not defined by any earlier cell in this notebook, so fall
# back to the current directory, which is where the word2vec_*.py scripts
# write their CSV files.  A value defined beforehand is left untouched.
try:
    word2vec_path
except NameError:
    word2vec_path = './'

# Load the per-column word2vec feature tables (CLNT_ID + v001.. columns).
train_w2v_1 = pd.read_csv(word2vec_path +'X_train_w2v_CLAC3_NM.csv', encoding='cp949')
test_w2v_1= pd.read_csv(word2vec_path +'X_test_w2v_CLAC3_NM.csv', encoding='cp949')

train_w2v_2  = pd.read_csv(word2vec_path +'X_train_w2v_CLAC2_NM.csv', encoding='cp949')
test_w2v_2 = pd.read_csv(word2vec_path +'X_test_w2v_CLAC2_NM.csv', encoding='cp949')

train_w2v_3  = pd.read_csv(word2vec_path +'X_train_w2v_PD_BRA_NM.csv', encoding='cp949')
test_w2v_3 = pd.read_csv(word2vec_path +'X_test_w2v_PD_BRA_NM.csv', encoding='cp949')

train_w2v_4  = pd.read_csv(word2vec_path +'X_train_w2v_KWD_NM.csv', encoding='cp949')
test_w2v_4 = pd.read_csv(word2vec_path +'X_test_w2v_KWD_NM.csv', encoding='cp949')

train_w2v_5  = pd.read_csv(word2vec_path +'X_train_w2v_PD_ADD_NM.csv', encoding='cp949')
test_w2v_5 = pd.read_csv(word2vec_path +'X_test_w2v_PD_ADD_NM.csv', encoding='cp949')

In [None]:
# Place the five train feature tables side by side.
# NOTE(review): every table still carries its own CLNT_ID column and the
# v001.. names repeat across tables, so the result has duplicate column labels.
train_w2v = pd.concat([train_w2v_1, train_w2v_2, train_w2v_3, train_w2v_4, train_w2v_5], axis=1)

In [None]:
# Place the five test feature tables side by side.
# NOTE(review): every table still carries its own CLNT_ID column and the
# v001.. names repeat across tables, so the result has duplicate column labels.
test_w2v = pd.concat([test_w2v_1, test_w2v_2, test_w2v_3, test_w2v_4, test_w2v_5], axis=1)

In [None]:
# Display the (rows, columns) of the combined train and test tables.
train_w2v.shape, test_w2v.shape

In [None]:
# Customer ids of the train and test splits, taken from the first feature table.
cust_tr = train_w2v_1.CLNT_ID
cust_te = test_w2v_1.CLNT_ID

In [None]:
# Stack train rows on top of test rows; reset_index() keeps the old row
# labels in a new 'index' column.
features_w2v = pd.concat([train_w2v, test_w2v] ,axis=0).reset_index()

In [None]:
# Remove the 'index' column in place.
features_w2v.drop(columns='index', inplace=True)

In [None]:
# Display the stacked feature table.
features_w2v

### Feature Selection

In [None]:
from sklearn.pipeline import make_pipeline

# Attach the customer ids (train ids followed by test ids) as a leading
# column, then split the stacked table back into train and test features.
# Dropping 'CLNT_ID' removes every column carrying that label.
features_w2v = pd.concat([pd.concat([cust_tr, cust_te]).reset_index(drop=True), features_w2v], axis=1)
X_train = features_w2v.query('CLNT_ID in @cust_tr').drop('CLNT_ID', axis=1)
X_test = features_w2v.query('CLNT_ID in @cust_te').drop('CLNT_ID', axis=1)

model = LogisticRegression(random_state=0)
sfk = StratifiedKFold(n_splits = 5)

# Score each percentile from 1 to 99 with 5-fold CV.  The selector sits inside
# the pipeline so it is refit on each training fold only; fitting it on the
# full training set before cross-validation lets the held-out folds influence
# which features are kept and inflates the scores.
cv_scores = []
for p in tqdm(range(1,100,1)):
    candidate = make_pipeline(SelectPercentile(percentile=p), model)
    cv_score = cross_val_score(candidate, X_train, y_target, scoring='neg_log_loss', cv=sfk).mean()
    cv_scores.append((p,cv_score))

# (percentile, score) pair with the highest negative log loss.
best_score = cv_scores[np.argmax([score for _, score in cv_scores])]
print(best_score)

plt.plot([k for k, _ in cv_scores], [score for _, score in cv_scores])
plt.xlabel('Percent of features')
plt.grid()

In [None]:
# Fit the selector at the winning percentile on the training data, then apply
# the same column selection to both splits.
best_percentile = best_score[0]
fs = SelectPercentile(percentile=best_percentile)
fs.fit(X_train, y_target)
X_train, X_test = fs.transform(X_train), fs.transform(X_test)

print(X_train.shape, X_test.shape)

In [None]:
# Save the data files
# Wrap the selected feature arrays in DataFrames and write them out;
# index=False is not passed here, unlike the other to_csv calls in this file.
X_train = pd.DataFrame(X_train)
X_train.to_csv('train_w2v.csv')

X_test = pd.DataFrame(X_test)
X_test.to_csv('test_w2v.csv')

### Split Data

In [None]:
# Hold out 30% of the training rows as a dev set, stratified on the target,
# with a fixed random_state.  X_train is rebound to the remaining 70%.
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_target, test_size=0.3, stratify = y_target, random_state=42)

In [None]:
# Display the shapes of the train/dev features and targets.
X_train.shape, X_dev.shape, y_train.shape, y_dev.shape

### LGBM Modeling

In [None]:
# 5-fold stratified splitter used by the hyper-parameter search below.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Randomized search over LightGBM hyper-parameters, scored by negative log
# loss with the stratified 5-fold splitter.
model_lgb = LGBMClassifier(random_state=42)
param_grid = {'n_estimators': [100,200,300,400,500],
              'objective' : ['multiclass'],
              'learning_rate': [0.01,0.1],
              'max_depth': [3,4,5,6,10],
              'num_leaves': [16,32,64,128]}

# random_state pins which 10 candidates are sampled, so reruns reproduce the
# same search; without it the chosen parameters vary from run to run.
rcv_lgb = RandomizedSearchCV(model_lgb, param_distributions=param_grid ,cv=skf, scoring='neg_log_loss', n_iter=10, random_state=42)
rcv_lgb.fit(X_train,y_train)

# The scorer is negated log loss, so flip the sign to report plain log loss
# on the dev set and for the best CV result.
print('랜덤서치(skf) 점수', -(rcv_lgb.score(X_dev,y_dev)))
print('최적 파라미터', rcv_lgb.best_params_)   
print('최고 점수', -rcv_lgb.best_score_)    

### Make Submission

In [None]:
# Class-probability predictions for the test features from the tuned search object.
pred = pd.DataFrame(rcv_lgb.predict_proba(X_test))

# Put the test customer ids next to the probabilities (joined by row position).
result = pd.concat([cust_te , pred], axis =1)
# NOTE(review): the hard-coded names assume predict_proba's columns come out
# in the order F20, F30, F40, M20, M30, M40 — confirm against rcv_lgb.classes_.
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']
result

# Write the submission file without the index column.
result.to_csv('submission.csv',index=False)

## END