# 전처리

In [211]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

mpl.style.use('seaborn')
mpl.rcParams["font.family"] = 'Malgun Gothic'
mpl.rcParams["axes.unicode_minus"] = False

In [212]:
import time
def my_time(func):
    def wrapper():
        str_time = time.time()
        func()
        end_time = time.time()
        print(end_time-str_time, '초 소요')
    return wrapper

In [213]:
X_train = pd.read_csv('../data/aug_train.csv')
X_test = pd.read_csv('../data/aug_test.csv')

In [214]:
X_test['target'] = -1

In [215]:
X_train.shape, X_train.columns, X_test.shape, X_test.columns

((19158, 14),
 Index(['enrollee_id', 'city', 'city_development_index', 'gender',
        'relevent_experience', 'enrolled_university', 'education_level',
        'major_discipline', 'experience', 'company_size', 'company_type',
        'last_new_job', 'training_hours', 'target'],
       dtype='object'),
 (2129, 14),
 Index(['enrollee_id', 'city', 'city_development_index', 'gender',
        'relevent_experience', 'enrolled_university', 'education_level',
        'major_discipline', 'experience', 'company_size', 'company_type',
        'last_new_job', 'training_hours', 'target'],
       dtype='object'))

In [216]:
X = pd.concat([X_train, X_test])
# X.drop(columns=['enrollee_id','target'],inplace=True)

In [217]:
X.shape

(21287, 14)

#### 해당 컬럼부터 숫자형으로 변경

In [218]:
numeric_feature = ['city_development_index', 'training_hours',
                   'education_level', 'last_new_job',  'experience', 'enrolled_university']

#### edu_lvl

In [219]:
education_level = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
for i, v in enumerate(education_level):
    X.loc[X['education_level']==v, 'education_level'] = i

#### last_new_job

In [220]:
X.loc[X['last_new_job'] == '>4', 'last_new_job'] = 5
X.loc[X['last_new_job'] == 'never', 'last_new_job'] = 0

X['last_new_job'] = pd.to_numeric(X['last_new_job'], downcast='integer')

#### exp

In [221]:
X.loc[X['experience'] == '>20', 'experience'] = 21
X.loc[X['experience'] == '<1', 'experience'] = 0

X['experience'] = pd.to_numeric(X['experience'], downcast='integer')

#### enroll

In [222]:
enrolls = ['no_enrollment','Part time course','Full time course']
for i,v in enumerate(enrolls):
    X.loc[X['enrolled_university'] == v, 'enrolled_university'] = i

#### gender

In [227]:
X['gender'].fillna('Male', inplace=True)

#### company_size, type

In [228]:
X.loc[X['company_size']=='<10', 'company_size'] = '~10'
X.loc[X['company_size']=='10/49', 'company_size'] = '10~49'

company_siz = ['~10', '10~49', '50-99', '100-500', '500-999','1000-4999','5000-9999','10000+']
for i, v in enumerate(company_siz):
    X.loc[X['company_size']==v, 'company_size'] = i

#### major:  최빈값과 나머지의 합과의 비율 차이가 크므로 최빈값 major로 처리

In [230]:
nomajor_lst = list(X['major_discipline'].value_counts().index)
nomajor_lst.remove('STEM')
print(nomajor_lst)

['Humanities', 'Other', 'Business Degree', 'Arts', 'No Major']


In [231]:
X.loc[(X['major_discipline'] == 'STEM')|(X['major_discipline'].isna()) , 'major_discipline'] = 'major'
X.loc[X['major_discipline'] != 'major', 'major_discipline'] = 'no_major'

### 인코딩 전 현재 위아래로 concat되있어 
### 중복되있으므로 인덱스 순번대로 재지정

In [232]:
X.reset_index(drop=True, inplace=True)

### 라벨링

In [233]:
to_lb = list(X.columns[X[X.columns].dtypes == 'object'])
to_lb

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_size',
 'company_type']

In [234]:
def lbe(x):
    if x.name in to_lb:
        return LabelEncoder().fit_transform(x)
    else:
        return x
    
X_new = X.apply(lbe)
X_new

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,5,0.920,1,0,0,2,0,21.0,8,6,1.0,36,1.0
1,29725,77,0.776,1,1,0,2,0,15.0,2,5,5.0,47,0.0
2,11561,64,0.624,1,1,2,2,0,5.0,8,6,0.0,83,0.0
3,33241,14,0.789,1,1,3,2,1,0.0,8,5,0.0,52,1.0
4,666,50,0.767,1,0,0,3,0,21.0,2,1,4.0,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21282,1289,5,0.920,1,1,0,2,1,16.0,8,4,4.0,15,-1.0
21283,195,30,0.897,1,0,0,3,0,18.0,8,6,2.0,30,-1.0
21284,31762,2,0.887,1,1,0,0,0,3.0,8,5,0.0,18,-1.0
21285,7873,4,0.804,1,0,2,1,0,7.0,3,4,1.0,84,-1.0


#### 남은 결측치는 KNN으로

In [237]:
under_list

['enrolled_university',
 'education_level',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

In [238]:
from sklearn.impute import KNNImputer

under_list = []
for col in X_new.columns:
    if X_new[col].isna().sum():
        under_list.append(col)
        
imputer = KNNImputer(n_neighbors = 1)
X_new[under_list] = imputer.fit_transform(X_new[under_list])

### StandardScaled

In [239]:
from sklearn.preprocessing import StandardScaler

y_tmp = X_new['target']
X_scaled = StandardScaler().fit_transform(X_new)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_new.columns)
X_scaled_df.drop(columns='target',inplace=True)
X_scaled_df = pd.concat([X_scaled_df, y_tmp], axis=1)

#### enrollee_id 제거, 다시 train,test 분리 후 target 분리

In [240]:
X_scaled_df.loc[X_scaled_df.index,'target']

0        1.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
21282   -1.0
21283   -1.0
21284   -1.0
21285   -1.0
21286   -1.0
Name: target, Length: 21287, dtype: float64

In [241]:
X_train = X_scaled_df.loc[X_scaled_df.loc[X_scaled_df.index, 'target'] != -1]
X_test = X_scaled_df.loc[X_scaled_df.loc[X_scaled_df.index, 'target'] == -1]
X_test.drop(columns='target',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


#### 테이블 저장

In [242]:
X_train.to_csv('../data/prepcd_lbe_train.csv', index=False)
X_test.to_csv('../data/prepcd_lbe_test.csv', index=False)