In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV, ShapleyImportanceEvaluator
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
plt.rc('font', family='malgun gothic')

In [2]:
dr = 'data/'

X_train = pd.read_csv(dr + 'X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv(dr + 'y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv(dr + 'X_test.csv', encoding='cp949')
test_id = X_test.ID
X_test = X_test.drop(columns='ID')

data = pd.concat([X_train, X_test]).reset_index().drop(columns='index')

data.loc[data[data['직종']=='호텔/콘도/리조트'].index, '직종'] = '기타 직종'

# 확정

In [3]:
# 자격증 보기 쉽게 변환
data['자격증'] = data['자격증'].str.replace('無', 'x')
data['자격증'] = data['자격증'].str.replace('有', 'o')

# 어학 시험 전처리
data['어학시험'] = data['어학시험'].fillna('없음')

# 대학 전공 변환
col_list = []
abc = []
for i in data.대학전공:
    if '(' in i:
        for j in range(len(i)):
            if i[j] == '(':
                a = j
            elif i[j] == ')':
                b = j
        last = i[b+1:]
        first = i[:a]
        i = first + last
        abc.append(i)
    col_list.append(i)    
data['대학전공'] = col_list


#
data['대학전공'] = data['대학전공'].str.replace(',',' ')
data['대학전공'] = data['대학전공'].str.replace('/', ' ')
data['대학전공'] = data['대학전공'].str.replace('.',' ')
data['대학전공'] = data['대학전공'].str.strip()
data['대학전공'] = data['대학전공'].str.replace(' ','')


#
col = []
for i in data['대학전공']:
    if i[-1]=='과':
        i = i[:-1]
    col.append(i)    
data['대학전공'] = col


#
col2 = []
for i in data['대학전공']:
    if i[-1]!='학':
        i = i+'학'
    col2.append(i)    
data['대학전공'] = col2

# 세부직종 처리
def full(job):
    value = data[data['세부직종']==job]['직무태그'].value_counts().index[0]
    data.loc[data[data['세부직종']==job]['직무태그'].index, '직무태그'] = data[data['세부직종']==job]['직무태그'].fillna(value)
    
for i in data.세부직종.value_counts().index:
    full(i)
    
# 근무지역

null_list = []
for i in data.근무지역.str.replace(' ',','):
    for j in i.split(','):
        null_list.append(j)

# out = ['필리핀', '인도네시아', '대만', '프랑스', '방글라데시', '해외', '미국', '러시아', '말레이시아', '인도', '일본', '싱가포르', '중국', '홍콩', '캐나다']
# for i in out:
#     data['근무지역'] = data['근무지역'].str.replace(' ',',').str.replace(i, '해외')

a = []    
for i in data['근무지역']:
    a.append(sorted(list(set(i.split(',')))))
    
b = []
for i in a:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    b.append(result)
data['근무지역'] = b

language3 = []
for i in data['어학시험']:
    if i[:5] == 'TOEFL':
        i = '토플'
    language3.append(i) 
data['어학시험'] = language3

# 여기까지는 무조건 필수

# 근무형태 처리 / NOT PCA

In [4]:
# 근무 형태 처리
null_list = data[data['근무경력']==0].index
data['근무형태'][null_list] = data['근무형태'][null_list].fillna('경력없음')
data['근무형태'] = data['근무형태'].fillna('missing')

#
data['근무형태'] = data['근무형태'].str.replace(',', ' ')
data['근무형태'] = data['근무형태'].str.strip()

#
hyung_list = []
for i in data['근무형태']:
    if i[:3] == '정규직' and '해외취업' in i:
        i = '정규직(해외o)'
    elif i[:3] == '정규직' and '해외취업' not in i:
        i = '정규직(해외x)'
    hyung_list.append(i)    
data['근무형태'] = hyung_list

#
hyun_list2 = []
for i in data['근무형태']:
    if i[:3] == '계약직':
        i = '계약직'
    elif i in ['인턴','파견직']:
        i = '계약직'
    elif i in ['해외취업','병역특례']:
        i = '기타'    
    hyun_list2.append(i)   
data['근무형태'] = hyun_list2

# 근무지역 처리 / NOT PCA

In [5]:
# 근무지역

null_list = []
for i in data.근무지역.str.replace(' ',','):
    for j in i.split(','):
        null_list.append(j)

out = ['필리핀', '인도네시아', '대만', '프랑스', '방글라데시', '해외', '미국', '러시아', '말레이시아', '인도', '일본', '싱가포르', '중국', '홍콩', '캐나다']
for i in out:
    data['근무지역'] = data['근무지역'].str.replace(' ',',').str.replace(i, '해외')
   
a = []
for i in data['근무지역']:
    a.append(sorted(list(set(i.split(',')))))
   
b = []
for i in a:
    result = ' '.join(map(str, i))
    result = result.rstrip()
    result = result.lstrip()
    b.append(result)
   
data['근무지역'] = b

# 직무태그 처리

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = data['직무태그']

vectorizer = CountVectorizer()

features = vectorizer.fit_transform(sentences)
vocab = vectorizer.get_feature_names()
new = pd.DataFrame(features.toarray(), columns = vocab)

X_train_New = new[:16570]
X_test_New = new[16570:]

max_d = num_d = new.shape[1]
pca = PCA(n_components=max_d).fit(X_train_New)
cumsum = np.cumsum(pca.explained_variance_ratio_)
num_d = np.argmax(cumsum >= 0.65) + 1
if num_d == 1: num_d = max_d
pca = PCA(n_components=num_d, random_state=0)  
X_train_pca = pca.fit_transform(X_train_New)
X_test_pca = pca.transform(X_test_New)
print(X_train_pca.shape)

train = pd.DataFrame(X_train_pca)
test = pd.DataFrame(X_test_pca)

all_pca = pd.concat([train, test]).reset_index().drop(columns = 'index')
data = pd.concat([data, all_pca], axis=1).drop(columns='직무태그')

(16570, 180)


In [7]:
data.columns = [i for i in range(data.columns.size)]

In [8]:
X_train = data[:16570]
X_test = data[16570:]

X_test = X_test.reset_index()
X_test.drop(columns='index', inplace=True)

numeric_features = data.select_dtypes('number').columns
categorical_features = data.select_dtypes('object').columns

In [9]:
from sklearn.preprocessing import OneHotEncoder

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=True)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer), 
        ("selector", SelectPercentile(percentile=100)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", Ridge(alpha=1.0)), # regressor가 되어야 함 #alpha가 낮을수록 과적합 Viceversa 
    ]
)

set_config(display="diagram")

In [10]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5) 

print("Default LM CV scores: ", np.sqrt(-1*scores))
print("Default LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


Default LM CV scores:  [ 796.39144999  782.02111551  860.72314728  923.95550826 1035.24489241]
Default LM CV mean = 884.54 with std = 410.27


In [11]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10) 

print("Default LM CV scores: ", np.sqrt(-1*scores))
print("Default LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


Default LM CV scores:  [ 779.56237654  796.25188903  781.41147558  776.83103888  851.57028607
  867.22384492  869.05305774  948.88683839  944.68117737 1072.92975993]
Default LM CV mean = 873.63 with std = 408.09


In [12]:
%%time

param_distributions = { 
    "preprocessor__selector__percentile": IntDistribution(70,100,step=5), 
    "classifier__alpha": IntDistribution(1,10),
}

optuna.logging.set_verbosity(optuna.logging.WARNING)
optuna_search = OptunaSearchCV(model, param_distributions, cv=5, 
                               scoring='neg_mean_squared_error', n_trials=20,
                               study=optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction='maximize'))
optuna_search.fit(X_train, y_train)

CPU times: total: 8min 17s
Wall time: 8min 23s


In [13]:
print(f"\nBest params: {optuna_search.best_params_}")
print(f"\nBest score: {np.sqrt(-1*optuna_search.best_score_):.2f}")


Best params: {'preprocessor__selector__percentile': 95, 'classifier__alpha': 3}

Best score: 878.77


In [14]:
model.set_params(**optuna_search.best_params_)

# OOF Prediction
models = cross_validate(model, 
                        X_train, y_train, 
                        cv=10, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nTuned LM CV scores: ", np.sqrt(-1*scores))
print("Tuned LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


Tuned LM CV scores:  [ 773.5268048   789.88658801  764.56722025  772.34341998  839.95981083
  854.83306237  864.60109749  948.67279865  949.87462406 1065.49680222]
Tuned LM CV mean = 867.45 with std = 411.32


In [15]:
# # submission 화일 생성

# LM_VERSION = 7.0

# filename = f'LM_{LM_VERSION}_{np.sqrt(-1*scores.mean()):.2f}_{np.sqrt(scores.std()):.2f}.csv'
# pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)