In [1]:
import numpy as np              # arrays
import pandas as pd             # dataframes
import matplotlib.pyplot as plt # graphs
import seaborn as sns           # visualisations
from scipy import stats         # statistics

from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model
%matplotlib inline

### 데이터 읽어오기!

In [2]:
data_raw = pd.read_csv(
    filepath_or_buffer='./input/Speed Dating Data.csv',
    engine='python'
)

In [3]:
data_raw.shape

(8378, 195)

In [4]:
relevant_features = [
    ['iid', 'int16'],
    ['gender', 'bool'],
    ['wave', 'int16'],
    ['position', 'int16'],
    ['order', 'int16'],
    ['pid', 'int16'],
    ['age_o', 'int16'],
    ['pf_o_att', 'int16'],
    ['pf_o_sin', 'int16'],
    ['pf_o_int', 'int16'],
    ['pf_o_fun', 'int16'],
    ['pf_o_amb', 'int16'],
    ['pf_o_sha', 'int16'],
    ['dec_o', 'bool'],
    ['age', 'int16'],
    ['field_cd', 'category'],
    ['goal', 'category'],
    ['date', 'int16'],
    ['go_out', 'int16'],
    ['career_c', 'category'],
    ['sports', 'int16'],
    ['exercise', 'int16'],
    ['dining', 'int16'],
    ['museums', 'int16'],
    ['art', 'int16'],
    ['hiking', 'int16'],
    ['reading', 'int16'],
    ['tv', 'int16'],
    ['theater', 'int16'],
    ['movies', 'int16'],
    ['concerts', 'int16'],
    ['music', 'int16'],
    ['shopping', 'int16'],
    ['exphappy', 'int16'],
    ['attr1_1', 'int16'],
    ['sinc1_1', 'int16'],
    ['intel1_1', 'int16'],
    ['fun1_1', 'int16'],
    ['amb1_1', 'int16'],
    ['shar1_1', 'int16'],
    ['dec', 'bool'],
    ['like', 'int16'],
    ['match', 'bool']
]

In [5]:
# create new dataframe containing relevant features
data = data_raw[[feature[0] for feature in relevant_features]]

# 데이터 타입 바꾸기

In [6]:
data = data.astype({feature: datatype if all(data[feature].notna().values) 
                                    else 'float32' 
                                    if datatype == 'int16' 
                                    else datatype for (feature, datatype) in relevant_features})

In [7]:
partner_accepts = data['dec']
round(partner_accepts[partner_accepts == True].count()/partner_accepts.count(),3)

0.42

In [8]:
missing_samples_proportion = data.isnull().sum()/len(data)
missing_samples_proportion.sort_values(ascending=False).head(10)

missing_half_samples = missing_samples_proportion[missing_samples_proportion > 0.2].index.values
data.drop(columns=missing_half_samples, inplace=True)

In [9]:
data.isnull().sum(axis=1)

0       1
1       1
2       1
3       1
4       1
       ..
8373    0
8374    0
8375    1
8376    0
8377    0
Length: 8378, dtype: int64

# 빈칸 채우기

In [10]:
imputer = IterativeImputer(
    missing_values=np.nan,
    sample_posterior=True,
    n_nearest_features=5,
    min_value=0,
    max_value=100,
    random_state=0
)
imputer.fit(data)
data_imputed = np.around(imputer.transform(data))
data = pd.DataFrame(data_imputed, columns=data.columns)

# 내가 만난 모든 상대방의 가치관 평균 점수

In [11]:
subject_attractiveness_mean = data[['iid', 'pf_o_att']].groupby(['iid']).mean()['pf_o_att']
subject_sincerity_mean = data[['iid', 'pf_o_sin']].groupby(['iid']).mean()['pf_o_sin']
subject_intelligence_mean = data[['iid', 'pf_o_int']].groupby(['iid']).mean()['pf_o_int']
subject_fun_mean = data[['iid', 'pf_o_fun']].groupby(['iid']).mean()['pf_o_fun']
subject_ambition_mean = data[['iid', 'pf_o_amb']].groupby(['iid']).mean()['pf_o_amb']
subject_shared_interest_mean = data[['iid', 'pf_o_sha']].groupby(['iid']).mean()['pf_o_sha']

In [12]:
data = data.merge(
    right=subject_attractiveness_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_att_x': 'pf_o_att',
    'pf_o_att_y': 'subject_attractiveness_mean'
})
data = data.merge(
    right=subject_sincerity_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_sin_x': 'pf_o_sin',
    'pf_o_sin_y': 'subject_sincerity_mean'
})
data = data.merge(
    right=subject_intelligence_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_int_x': 'pf_o_int',
    'pf_o_int_y': 'subject_intelligence_mean'
})
data = data.merge(
    right=subject_fun_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_fun_x': 'pf_o_fun',
    'pf_o_fun_y': 'subject_fun_mean'
})
data = data.merge(
    right=subject_ambition_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_amb_x': 'pf_o_amb',
    'pf_o_amb_y': 'subject_ambition_mean'
})
data = data.merge(
    right=subject_shared_interest_mean,
    how='inner',
    on='iid'
).rename(columns={
    'pf_o_sha_x': 'pf_o_sha',
    'pf_o_sha_y': 'subject_shared_interest_mean'
})

In [13]:
test_unique_id = data.groupby(['iid'])

test_data = test_unique_id.mean()
test_data.head(5)

Unnamed: 0_level_0,gender,wave,position,order,pid,age_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,...,shar1_1,dec,like,match,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,1.0,7.0,5.5,15.5,25.2,44.2,7.9,16.5,16.9,...,15.0,0.8,6.5,0.4,44.2,7.9,16.5,16.9,4.5,9.9
2.0,0.0,1.0,3.0,5.5,15.5,25.2,44.2,7.9,16.5,16.9,...,5.0,0.4,6.6,0.2,44.2,7.9,16.5,16.9,4.5,9.9
3.0,0.0,1.0,9.0,5.5,15.5,25.2,44.2,7.9,16.5,16.9,...,0.0,0.0,8.2,0.0,44.2,7.9,16.5,16.9,4.5,9.9
4.0,0.0,1.0,6.0,5.5,15.5,25.2,44.2,7.9,16.5,16.9,...,10.0,0.3,6.6,0.2,44.2,7.9,16.5,16.9,4.5,9.9
5.0,0.0,1.0,4.0,5.5,15.5,25.2,44.2,7.9,16.5,16.9,...,15.0,0.6,7.2,0.2,44.2,7.9,16.5,16.9,4.5,9.9


In [14]:
test_data

Unnamed: 0_level_0,gender,wave,position,order,pid,age_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,...,shar1_1,dec,like,match,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,1.0,7.0,5.5,15.5,25.200000,44.200000,7.900000,16.500000,16.900000,...,15.0,0.800000,6.500000,0.400000,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
2.0,0.0,1.0,3.0,5.5,15.5,25.200000,44.200000,7.900000,16.500000,16.900000,...,5.0,0.400000,6.600000,0.200000,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
3.0,0.0,1.0,9.0,5.5,15.5,25.200000,44.200000,7.900000,16.500000,16.900000,...,0.0,0.000000,8.200000,0.000000,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
4.0,0.0,1.0,6.0,5.5,15.5,25.200000,44.200000,7.900000,16.500000,16.900000,...,10.0,0.300000,6.600000,0.200000,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
5.0,0.0,1.0,4.0,5.5,15.5,25.200000,44.200000,7.900000,16.500000,16.900000,...,15.0,0.600000,7.200000,0.200000,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548.0,1.0,21.0,11.5,11.5,519.5,25.181818,21.727273,18.181818,21.272727,16.681818,...,20.0,0.409091,5.000000,0.227273,21.727273,18.181818,21.272727,16.681818,8.636364,13.363636
549.0,1.0,21.0,11.5,11.5,519.5,25.136364,21.727273,18.181818,21.272727,16.681818,...,20.0,0.409091,5.363636,0.227273,21.727273,18.181818,21.272727,16.681818,8.636364,13.818182
550.0,1.0,21.0,11.5,11.5,519.5,25.272727,21.727273,18.181818,21.272727,16.681818,...,4.0,0.318182,5.500000,0.181818,21.727273,18.181818,21.272727,16.681818,8.636364,13.909091
551.0,1.0,21.0,11.5,11.5,519.5,25.181818,21.727273,18.181818,21.272727,16.681818,...,0.0,0.363636,6.818182,0.090909,21.727273,18.181818,21.272727,16.681818,8.636364,13.500000


# 학과 클러스터링

In [15]:
data.columns

Index(['iid', 'gender', 'wave', 'position', 'order', 'pid', 'age_o',
       'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha',
       'dec_o', 'age', 'field_cd', 'goal', 'date', 'go_out', 'career_c',
       'sports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'reading',
       'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'exphappy',
       'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'dec',
       'like', 'match', 'subject_attractiveness_mean',
       'subject_sincerity_mean', 'subject_intelligence_mean',
       'subject_fun_mean', 'subject_ambition_mean',
       'subject_shared_interest_mean'],
      dtype='object')

In [16]:
data['field_cd'].value_counts()

8.0     1936
10.0    1003
5.0      871
13.0     712
3.0      701
1.0      665
9.0      635
11.0     480
6.0      331
7.0      251
2.0      210
15.0     187
4.0      144
14.0     126
18.0      53
16.0      40
12.0      22
17.0      10
0.0        1
Name: field_cd, dtype: int64

In [17]:
humanities = [6.0, 7.0]
societies = [1.0, 3.0, 8.0, 11.0, 13.0]
natural = [2.0, 10.0]
engineers = [5.0]
medical = [4.0]
academic = [9.0]
art = [14.0, 15.0]
other = [0.0, 12.0, 16.0, 17.0, 18.0]

before = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0]
after = [8, 2, 3, 2, 5, 4, 1, 1, 2, 6, 3, 2, 8, 2, 7, 7, 8, 8, 8]

data['field_cd'] = data['field_cd'].replace(before, after)

In [18]:
print(test_data.columns)
print(test_data.shape)

Index(['gender', 'wave', 'position', 'order', 'pid', 'age_o', 'pf_o_att',
       'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha', 'dec_o',
       'age', 'field_cd', 'goal', 'date', 'go_out', 'career_c', 'sports',
       'exercise', 'dining', 'museums', 'art', 'hiking', 'reading', 'tv',
       'theater', 'movies', 'concerts', 'music', 'shopping', 'exphappy',
       'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'dec',
       'like', 'match', 'subject_attractiveness_mean',
       'subject_sincerity_mean', 'subject_intelligence_mean',
       'subject_fun_mean', 'subject_ambition_mean',
       'subject_shared_interest_mean'],
      dtype='object')
(551, 48)


# 상대방 학과 추가

In [19]:
data.drop_duplicates(subset='iid').set_index('iid')

Unnamed: 0_level_0,gender,wave,position,order,pid,age_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,...,shar1_1,dec,like,match,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,1.0,7.0,4.0,11.0,27.0,35.0,20.0,20.0,20.0,...,15.0,1.0,7.0,0.0,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
2.0,0.0,1.0,3.0,10.0,11.0,27.0,35.0,20.0,20.0,20.0,...,5.0,0.0,6.0,0.0,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
3.0,0.0,1.0,9.0,6.0,11.0,27.0,35.0,20.0,20.0,20.0,...,0.0,0.0,8.0,0.0,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
4.0,0.0,1.0,6.0,3.0,11.0,27.0,35.0,20.0,20.0,20.0,...,10.0,0.0,6.0,0.0,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
5.0,0.0,1.0,4.0,1.0,11.0,27.0,35.0,20.0,20.0,20.0,...,15.0,0.0,7.0,0.0,44.200000,7.900000,16.500000,16.900000,4.500000,9.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548.0,1.0,21.0,21.0,13.0,509.0,28.0,10.0,20.0,30.0,10.0,...,20.0,1.0,6.0,1.0,21.727273,18.181818,21.272727,16.681818,8.636364,13.363636
549.0,1.0,21.0,21.0,15.0,509.0,28.0,10.0,20.0,30.0,10.0,...,20.0,0.0,3.0,0.0,21.727273,18.181818,21.272727,16.681818,8.636364,13.818182
550.0,1.0,21.0,21.0,2.0,509.0,28.0,10.0,20.0,30.0,10.0,...,4.0,0.0,5.0,0.0,21.727273,18.181818,21.272727,16.681818,8.636364,13.909091
551.0,1.0,21.0,21.0,1.0,509.0,28.0,10.0,20.0,30.0,10.0,...,0.0,1.0,7.0,0.0,21.727273,18.181818,21.272727,16.681818,8.636364,13.500000


In [20]:
data['field_o_cd'] = data.drop_duplicates(subset='iid').set_index('iid').loc[np.array(data['pid'])]['field_cd'].values
data = data.astype({'field_o_cd': 'category'})

In [21]:
data[['field_cd', 'field_o_cd']].tail(5)

Unnamed: 0,field_cd,field_o_cd
8373,8.0,2.0
8374,8.0,2.0
8375,8.0,2.0
8376,8.0,4.0
8377,8.0,4.0


# 상대방 goal 추가

In [22]:
data['goal_o'] = data.drop_duplicates(subset='iid').set_index('iid').loc[np.array(data['pid'])]['goal'].values
data = data.astype({'goal_o': 'category'})

In [23]:
data = data.drop(index=data[data['goal'] == 0.0].index)
data = data.drop(index=data[data['goal_o'] == 0.0].index)

In [24]:
data = data.astype({feature: datatype if all(data[feature].notna().values) else 'float32' if datatype == 'int16' else datatype for (feature, datatype) in relevant_features})

features_nominal = data.dtypes[data.dtypes == 'category'].index.values
data = pd.get_dummies(data, prefix=features_nominal)


In [25]:
data.columns

Index(['iid', 'gender', 'wave', 'position', 'order', 'pid', 'age_o',
       'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha',
       'dec_o', 'age', 'date', 'go_out', 'sports', 'exercise', 'dining',
       'museums', 'art', 'hiking', 'reading', 'tv', 'theater', 'movies',
       'concerts', 'music', 'shopping', 'exphappy', 'attr1_1', 'sinc1_1',
       'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'dec', 'like', 'match',
       'subject_attractiveness_mean', 'subject_sincerity_mean',
       'subject_intelligence_mean', 'subject_fun_mean',
       'subject_ambition_mean', 'subject_shared_interest_mean', 'field_cd_1.0',
       'field_cd_2.0', 'field_cd_3.0', 'field_cd_4.0', 'field_cd_5.0',
       'field_cd_6.0', 'field_cd_7.0', 'field_cd_8.0', 'goal_1.0', 'goal_2.0',
       'goal_3.0', 'goal_4.0', 'goal_5.0', 'goal_6.0', 'career_c_0.0',
       'career_c_1.0', 'career_c_2.0', 'career_c_3.0', 'career_c_4.0',
       'career_c_5.0', 'career_c_6.0', 'career_c_7.0', 'career_c_8

# 나이 차이

In [26]:
data['age_difference'] = abs(data['age'] - data['age_o'])

# 가치관 차이

In [27]:
data['attractiveness_difference'] = abs(data['attr1_1'] - data['pf_o_att'])
data['sincerity_difference'] = abs(data['sinc1_1'] - data['pf_o_sin'])
data['intelligence_difference'] = abs(data['intel1_1'] - data['pf_o_int'])
data['fun_difference'] = abs(data['fun1_1'] - data['pf_o_fun'])
data['ambition_difference'] = abs(data['amb1_1'] - data['pf_o_amb'])
data['shared_interest_difference'] = abs(data['shar1_1'] - data['pf_o_sha'])

# 불필요한 columns 제거

In [28]:
features_no_information = [
    'iid',
    'pid',
    'wave',
    'position',
    'order',
    'age_o'
]

features_future_information = [
    'dec',
    'dec_o',
    'match',
    'like'
]
features_unwanted = [
     'date', 'go_out', 'exphappy'
]
career_cols = ['career_c_0.0', 'career_c_1.0', 'career_c_2.0', 'career_c_3.0', 'career_c_4.0',
              'career_c_5.0',  'career_c_6.0', 'career_c_7.0', 'career_c_8.0',
               'career_c_9.0', 'career_c_10.0', 'career_c_11.0',
              'career_c_12.0', 'career_c_13.0', 'career_c_14.0', 
              'career_c_15.0', 'career_c_16.0', 'career_c_17.0',]


In [29]:
features_remove = features_no_information+features_future_information+features_unwanted+career_cols
data_model = data.drop(columns=features_remove)


In [30]:
print(data_model.columns)
print(data_model.shape)

Index(['gender', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb',
       'pf_o_sha', 'age', 'sports', 'exercise', 'dining', 'museums', 'art',
       'hiking', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music',
       'shopping', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1',
       'shar1_1', 'subject_attractiveness_mean', 'subject_sincerity_mean',
       'subject_intelligence_mean', 'subject_fun_mean',
       'subject_ambition_mean', 'subject_shared_interest_mean', 'field_cd_1.0',
       'field_cd_2.0', 'field_cd_3.0', 'field_cd_4.0', 'field_cd_5.0',
       'field_cd_6.0', 'field_cd_7.0', 'field_cd_8.0', 'goal_1.0', 'goal_2.0',
       'goal_3.0', 'goal_4.0', 'goal_5.0', 'goal_6.0', 'field_o_cd_1.0',
       'field_o_cd_2.0', 'field_o_cd_3.0', 'field_o_cd_4.0', 'field_o_cd_5.0',
       'field_o_cd_6.0', 'field_o_cd_7.0', 'field_o_cd_8.0', 'goal_o_1.0',
       'goal_o_2.0', 'goal_o_3.0', 'goal_o_4.0', 'goal_o_5.0', 'goal_o_6.0',
       'age_difference', 'attract

In [31]:
data_model = data_model.drop(columns=['field_cd_8.0', 'field_o_cd_8.0'])

In [32]:
data_model.columns

Index(['gender', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb',
       'pf_o_sha', 'age', 'sports', 'exercise', 'dining', 'museums', 'art',
       'hiking', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music',
       'shopping', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1',
       'shar1_1', 'subject_attractiveness_mean', 'subject_sincerity_mean',
       'subject_intelligence_mean', 'subject_fun_mean',
       'subject_ambition_mean', 'subject_shared_interest_mean', 'field_cd_1.0',
       'field_cd_2.0', 'field_cd_3.0', 'field_cd_4.0', 'field_cd_5.0',
       'field_cd_6.0', 'field_cd_7.0', 'goal_1.0', 'goal_2.0', 'goal_3.0',
       'goal_4.0', 'goal_5.0', 'goal_6.0', 'field_o_cd_1.0', 'field_o_cd_2.0',
       'field_o_cd_3.0', 'field_o_cd_4.0', 'field_o_cd_5.0', 'field_o_cd_6.0',
       'field_o_cd_7.0', 'goal_o_1.0', 'goal_o_2.0', 'goal_o_3.0',
       'goal_o_4.0', 'goal_o_5.0', 'goal_o_6.0', 'age_difference',
       'attractiveness_difference', 'sincerity_di

# 상대방 관심사 추가하기

In [33]:
# 반복문
iid_list = list(data['iid'].values)
pid_list = list(data['pid'].values)

tuple_list = []
for i in range(len(iid_list)):
    tuple_list.append((iid_list[i], pid_list[i]))

In [34]:
interest_all = ['exercise', 'hiking', 'sports', 'art', 'museums', 'reading', 'shopping', 
           'dining', 'theater', 'concerts', 'movies', 'tv', 'music']


In [35]:
interest_diff = [[0]*8378] * len(interest_all)

for e in range(len(interest_all)):
    for i, (iid, pid) in enumerate(tuple_list):
        interest_diff[e][i] = (data_model.iloc[i])[interest_all[e]] - test_data.loc[pid][interest_all[e]]

interest_series = [0]* len(interest_all)
for i in range(len(interest_series)):
    col_name = interest_all[i] + "_diff"
    temp_series = pd.Series(interest_diff[i])
    data_model[col_name] = temp_series

In [36]:
features_normal = [i for i in list(data_model.columns) if "diff" in i or "mean" in i]
print(features_normal)

data_model[features_normal] = data_model[features_normal].apply(lambda x: preprocessing.scale(x))

['subject_attractiveness_mean', 'subject_sincerity_mean', 'subject_intelligence_mean', 'subject_fun_mean', 'subject_ambition_mean', 'subject_shared_interest_mean', 'age_difference', 'attractiveness_difference', 'sincerity_difference', 'intelligence_difference', 'fun_difference', 'ambition_difference', 'shared_interest_difference', 'exercise_diff', 'hiking_diff', 'sports_diff', 'art_diff', 'museums_diff', 'reading_diff', 'shopping_diff', 'dining_diff', 'theater_diff', 'concerts_diff', 'movies_diff', 'tv_diff', 'music_diff']


In [37]:
col_list = [
    'gender',
    'age',
    'attr1_1',
    'sinc1_1',
    'intel1_1',
    'fun1_1',
    'amb1_1',
    'shar1_1',
    
    'pf_o_att',
    'pf_o_sin',
    'pf_o_int',
    'pf_o_fun',
    'pf_o_amb',
    'pf_o_sha',
    
     'exercise',
     'hiking',
     'sports',
     'art',
     'museums',
    'reading',
    'shopping',
    'dining',
    'theater',
    'concerts',
    'movies',
    'tv',
    'music',
    'age_difference',
    
 'attractiveness_difference',
 'sincerity_difference',
 'intelligence_difference',
 'fun_difference',
 'ambition_difference',
 'shared_interest_difference',
    
     'exercise_diff',
     'hiking_diff',
     'sports_diff',
     'art_diff',
     'museums_diff',
    'reading_diff',
    'shopping_diff',
    'dining_diff',
    'theater_diff',
    'concerts_diff',
    'movies_diff',
    'tv_diff',
    'music_diff',
    
 'field_cd_1.0',
 'field_cd_2.0',
 'field_cd_3.0',
 'field_cd_4.0',
 'field_cd_5.0',
 'field_cd_6.0',
 'field_cd_7.0',
    
 'goal_1.0',
 'goal_2.0',
 'goal_3.0',
 'goal_4.0',
 'goal_5.0',
 'goal_6.0',
 'goal_o_1.0',
 'goal_o_2.0',
 'goal_o_3.0',
 'goal_o_4.0',
 'goal_o_5.0',
 'goal_o_6.0',
    
 'field_o_cd_1.0',
 'field_o_cd_2.0',
 'field_o_cd_3.0',
 'field_o_cd_4.0',
 'field_o_cd_5.0',
 'field_o_cd_6.0',
 'field_o_cd_7.0',
    
 'subject_attractiveness_mean',
 'subject_sincerity_mean',
 'subject_intelligence_mean',
 'subject_fun_mean',
 'subject_ambition_mean',
 'subject_shared_interest_mean',
 ]

In [38]:
data_model = data_model[col_list]

In [40]:
data_model.shape

(8374, 79)

In [71]:
import pickle
# X dumping
with open("./train_x.pickle", "wb") as f:
    pickle.dump(data_model, f)

# y dumping
with open("./train_y.pickle", "wb") as f:
    pickle.dump(data['dec'], f)

# Modeling

In [72]:
data['match'].value_counts() # 상대방이 나를 좋아할 여부

False    6995
True     1379
Name: match, dtype: int64

In [73]:
features = data_model  # X
target = data['match'] # Y   나와 상대방 모두 좋아하는지 여부

In [74]:
parameters = {
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'C': np.logspace(-4, 4, 20),
    'max_iter': [100000]
}
classifier_lr = LogisticRegression(random_state=0)  
classifier_lr = GridSearchCV( 
    estimator=classifier_lr,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_lr.fit(features, target)
classifier_lr.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   41.1s finished


{'C': 0.0001, 'max_iter': 100000, 'penalty': 'l2', 'solver': 'lbfgs'}

In [75]:
classifier_lr = LogisticRegression(
    random_state=0,
    penalty=classifier_lr.best_params_['penalty'],
    solver=classifier_lr.best_params_['solver'],
    C=classifier_lr.best_params_['C'],
    max_iter=classifier_lr.best_params_['max_iter']
)

In [51]:
# parameters = {
#     'kernel': ['rbf'],
#     'gamma': [1e-4, 1e-3, 1e-2],
#     'C': [1, 10, 100]
# }
# classifier_sv = SVC(random_state=0)
# classifier_sv = GridSearchCV(
#     estimator=classifier_sv,
#     param_grid=parameters,
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )
# classifier_sv.fit(features, target)
# classifier_sv.best_params_

In [56]:
parameters = {
    'n_neighbors': [5,11,19,29],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan']
}
classifier_kn = KNeighborsClassifier()
classifier_kn = GridSearchCV(
    estimator=classifier_kn,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_kn.fit(features, target)
classifier_kn.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   33.5s finished


{'metric': 'manhattan', 'n_neighbors': 29, 'weights': 'distance'}

In [58]:
classifier_kn = KNeighborsClassifier(
    n_neighbors=classifier_kn.best_params_['n_neighbors'],
    weights=classifier_kn.best_params_['weights'],
    metric=classifier_kn.best_params_['metric']
)

AttributeError: 'KNeighborsClassifier' object has no attribute 'best_params_'

In [59]:
# parameters = {
#     'loss': ['deviance', 'exponential'],
#     'learning_rate': [0.05],
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 4, 5, 6, 7],
#     'max_features': ['sqrt', 'log2']
# }
# classifier_gb = GradientBoostingClassifier(random_state=0)
# classifier_gb = GridSearchCV(
#     estimator=classifier_gb,
#     param_grid=parameters,
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )
# classifier_gb.fit(features, target)
# classifier_gb.best_params_

In [60]:
# classifier_gb = GradientBoostingClassifier(
#     random_state=0,
#     loss=classifier_gb.best_params_['loss'],
#     learning_rate=classifier_gb.best_params_['learning_rate'],
#     n_estimators=classifier_gb.best_params_['n_estimators'],
#     max_depth=classifier_gb.best_params_['max_depth'],
#     max_features=classifier_gb.best_params_['max_features']
# )

In [61]:
from xgboost import XGBClassifier
def hyper_parameter_tuning(parameters,model,c_v):
    grid_search = GridSearchCV(model,
                               parameters,
                               cv = c_v,
                               n_jobs = 10,
                               verbose = True)
    grid_search.fit(features,target)
    #print("All Scores =",grid_search.cv_results_)
    print("Best Score =",grid_search.best_score_)
    print("Best Params =",grid_search.best_params_)
    return(grid_search.best_score_,grid_search.best_params_)

In [62]:
#Hyper-Parameter-Tuning for XGBClassifier.
hyper_parameter_tuning({'learning_rate':[0.01,0.1], 
                        'n_estimators':[140,200], 
                        'max_depth':[4,5,7],
                        'min_child_weight':[2,3,4], 
                        'gamma':[0.2], 
                        'subsample':[0.6,0.8], 
                        'colsample_bytree':[0.7,1.0],
                        'objective':['binary:logistic'],
                        'seed':[27]}, XGBClassifier(),5)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    9.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  3.0min
[Parallel(n_jobs=10)]: Done 720 out of 720 | elapsed:  5.6min finished


Best Score = 0.5876510636780257
Best Params = {'colsample_bytree': 0.7, 'gamma': 0.2, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 140, 'objective': 'binary:logistic', 'seed': 27, 'subsample': 0.8}


(0.5876510636780257,
 {'colsample_bytree': 0.7,
  'gamma': 0.2,
  'learning_rate': 0.01,
  'max_depth': 4,
  'min_child_weight': 2,
  'n_estimators': 140,
  'objective': 'binary:logistic',
  'seed': 27,
  'subsample': 0.8})

In [56]:
features.shape

(8378, 76)

# 모델 저장 불러오기 테스트

In [55]:
classifier_xgb.save_model('./xgb.model')

In [66]:
import xgboost
from xgboost import XGBClassifier

classifier_xgb = XGBClassifier(learning_rate=0.01, 
                        n_estimators=140, 
                        max_depth=4,
                        min_child_weight=2, 
                        gamma=0.2, 
                        subsample=0.8, 
                        colsample_bytree=0.7,
                        objective='binary:logistic')

In [62]:
model.load_model('./xgb.model')

In [65]:
test_df = features.iloc[:100]
print(test_df.shape)
print(type(test_df))

(100, 76)
<class 'pandas.core.frame.DataFrame'>


In [67]:
import pickle
with open("./test_df.pickle", "wb") as f:
    pickle.dump(test_df, f)

# 앙상블 보델 및 검증

In [67]:
estimators = [
    ('lr', classifier_lr),
    ('kn', classifier_kn),
    ('xgb', classifier_xgb)
]

In [68]:
classifier_ve = VotingClassifier(
    estimators=estimators,
    voting='hard'
)

# evaluation

In [69]:
metrics = ['accuracy', 'precision', 'recall', 'f1_macro']

for classifier, label in zip(
    [classifier_lr, classifier_kn, classifier_xgb, classifier_ve],
    ['Logistic Regression', 'k-Nearest Neighbours', 'XGB', 'Voting Ensemble']
):
    print('{}'.format(label))
    scores = cross_validate(
        estimator=classifier,
        X=features,
        y=target,
        scoring=metrics,
        cv=5,
        n_jobs=-1
    )
    for key, value in scores.items():
        print('{:14} {:.3f} +/- {:.3f}'.format(key, value.mean(), value.std()))
    print('\n')

Logistic Regression
fit_time       0.345 +/- 0.020
score_time     0.009 +/- 0.001
test_accuracy  0.583 +/- 0.028
test_precision 0.520 +/- 0.063
test_recall    0.341 +/- 0.174
test_f1_macro  0.530 +/- 0.040


k-Nearest Neighbours
fit_time       0.075 +/- 0.000
score_time     1.635 +/- 0.010
test_accuracy  0.558 +/- 0.021
test_precision 0.468 +/- 0.036
test_recall    0.309 +/- 0.107
test_f1_macro  0.509 +/- 0.013


XGB
fit_time       1.028 +/- 0.021
score_time     0.016 +/- 0.000
test_accuracy  0.586 +/- 0.014
test_precision 0.553 +/- 0.105
test_recall    0.229 +/- 0.200
test_f1_macro  0.485 +/- 0.063


Voting Ensemble
fit_time       1.462 +/- 0.071
score_time     1.607 +/- 0.014
test_accuracy  0.583 +/- 0.021
test_precision 0.537 +/- 0.098
test_recall    0.244 +/- 0.184
test_f1_macro  0.494 +/- 0.048




In [81]:
classifier_xgb.predict(data_model.iloc[1:3])

array([ True,  True])

In [87]:
classifier_xgb.save_model('./model.json')

In [88]:
classifier_xgb.load_model('./model.json')

In [54]:
import pickle
with open("./speed_date_xgb_model_scikit_learn.pickle", "wb") as f:
    pickle.dump(classifier_xgb, f)

In [66]:
import pickle

with open("./predict_df.pickle", "rb") as f:
    df = pickle.load(f)

In [67]:
with open("./speed_date_xgb_model_scikit_learn.pickle", "rb") as f:
    model = pickle.load(f)

In [68]:
model

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.7, gamma=0.2,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=None, max_depth=5,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              n_estimators=140, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, seed=27, subsample=0.6, tree_method=None,
              validate_parameters=None, verbosity=None)