In [2]:
import numpy as np              # arrays
import pandas as pd             # dataframes
import matplotlib.pyplot as plt # graphs
import seaborn as sns           # visualisations
from scipy import stats         # statistics

from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model
%matplotlib inline

In [3]:
# Load the user survey table; the CSV column that pandas labels
# 'Unnamed: 0' is used as the row index.
data_raw = pd.read_csv('./tmi_temp_user_data.csv', index_col='Unnamed: 0', engine='python')

In [4]:
data_raw.head(5)

Unnamed: 0,iid,gender,age,field,attr,sinc,intel,fun,amb,shar,...,museums,reading,shopping,dining,theater,concerts,movies,tv,music,goal
1,2,Female,24.0,1.0,45.0,5.0,25.0,20.0,0.0,5.0,...,8.0,10.0,3.0,10.0,9.0,7.0,8.0,1.0,8.0,1.0
18,19,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,15.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
19,20,Male,24.0,8.0,100.0,0.0,0.0,0.0,0.0,0.0,...,9.0,8.0,5.0,10.0,8.0,6.0,8.0,3.0,6.0,1.0
30,31,Female,24.0,1.0,35.0,10.0,20.0,20.0,10.0,5.0,...,7.0,6.0,10.0,10.0,8.0,6.0,8.0,8.0,7.0,1.0
34,35,Female,25.0,2.0,20.0,23.0,23.0,22.0,7.0,5.0,...,8.0,7.0,8.0,8.0,8.0,7.0,8.0,9.0,7.0,1.0


In [5]:
# Participant ids split by the 'gender' column.
male_ids = list(data_raw.loc[data_raw['gender'] == 'Male', 'iid'])
female_ids = list(data_raw.loc[data_raw['gender'] == 'Female', 'iid'])

total_ids = male_ids + female_ids

print("male ids: ", len(male_ids))
print("female ids: ", len(female_ids))

# Every ordered (subject, partner) pair across the two groups:
# all (male, female) pairs first, followed by all (female, male) pairs.
match_pair = [(male, female) for male in male_ids for female in female_ids]
match_pair += [(female, male) for female in female_ids for male in male_ids]

print("total match_pair:", len(match_pair))

male ids:  45
female ids:  47
total match_pair: 4230


In [6]:
# One row per ordered (subject, partner) tuple; 'iid' starts empty and is
# filled in from the index in the next cell.
df_pair = pd.DataFrame(columns=['iid'], index=match_pair)

In [7]:
# Turn the tuple index into an (n_pairs, 2) array once; its transpose unpacks
# into the subject ids (first tuple element) and partner ids (second element).
df_pair['iid'], df_pair['pid'] = np.array(tuple(df_pair.index)).T

In [8]:
df_pair

Unnamed: 0,iid,pid
"(19, 2)",19,2
"(19, 31)",19,31
"(19, 35)",19,35
"(19, 60)",19,60
"(19, 84)",19,84
...,...,...
"(526, 533)",526,533
"(526, 537)",526,537
"(526, 539)",526,539
"(526, 541)",526,541


In [9]:
# Index the user table by participant id (the 'iid' column is kept as well)
# so that rows can be looked up with data_raw.loc[<ids>].
data_raw.index = pd.Index(data_raw['iid'], name='iid')

In [10]:
# Copy every user attribute of the subject (the 'iid' side of the pair) onto
# the pair table. 'iid' itself is skipped because df_pair already has it.
select_cols = [col for col in data_raw.columns if col != 'iid']

# Look the subject rows up once instead of once per column; .values drops the
# iid index so the assignment is positional (df_pair is indexed by tuples).
subject_rows = data_raw.loc[df_pair['iid'].values, select_cols]

for col in select_cols:
    df_pair[col] = subject_rows[col].values

In [11]:
df_pair

Unnamed: 0,iid,pid,gender,age,field,attr,sinc,intel,fun,amb,...,museums,reading,shopping,dining,theater,concerts,movies,tv,music,goal
"(19, 2)",19,2,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
"(19, 31)",19,31,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
"(19, 35)",19,35,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
"(19, 60)",19,60,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
"(19, 84)",19,84,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,5.0,8.0,7.0,10.0,1.0,7.0,7.0,1.0,7.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(526, 533)",526,533,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,10.0,8.0,4.0,10.0,10.0,6.0,8.0,4.0,8.0,1.0
"(526, 537)",526,537,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,10.0,8.0,4.0,10.0,10.0,6.0,8.0,4.0,8.0,1.0
"(526, 539)",526,539,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,10.0,8.0,4.0,10.0,10.0,6.0,8.0,4.0,8.0,1.0
"(526, 541)",526,541,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,10.0,8.0,4.0,10.0,10.0,6.0,8.0,4.0,8.0,1.0


In [12]:
# Names of the interest columns of the user table.
interest_all = [
    'exercise',
    'hiking',
    'sports',
    'art',
    'museums',
    'reading',
    'shopping',
    'dining',
    'theater',
    'concerts',
    'movies',
    'tv',
    'music',
]

# 나이 차이 추가

In [13]:
# Signed age gap: subject's age minus partner's age.
df_pair['age_diff'] = (
    data_raw.loc[df_pair['iid'].values, 'age'].values
    - data_raw.loc[df_pair['pid'].values, 'age'].values
)

# 상대방 가치관 추가

In [14]:
# Attach the partner's own preference values to each pair:
# preferences[k] of the partner is stored under pref_opp[k].
preferences = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
pref_opp = ['pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha']

# Fetch the partner rows once; .values makes the assignment positional.
partner_prefs = data_raw.loc[df_pair['pid'].values, preferences]

for pref_col, opp_col in zip(preferences, pref_opp):
    df_pair[opp_col] = partner_prefs[pref_col].values

# 상대방과 나의 가치관 차이 점수 추가

In [15]:
# Signed preference gap per attribute: subject's value minus partner's value,
# stored under '<attribute>_diff'.
preferences = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
pref_diff = [name + '_diff' for name in preferences]

# Fetch both sides once instead of twice per attribute.
subject_prefs = data_raw.loc[df_pair['iid'].values, preferences]
partner_prefs = data_raw.loc[df_pair['pid'].values, preferences]

for pref_col, diff_col in zip(preferences, pref_diff):
    df_pair[diff_col] = subject_prefs[pref_col].values - partner_prefs[pref_col].values

In [16]:
df_pair

Unnamed: 0,iid,pid,gender,age,field,attr,sinc,intel,fun,amb,...,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,shar_diff
"(19, 2)",19,2,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,25.0,20.0,0.0,5.0,5.0,-5.0,0.0,-10.0,0.0,10.0
"(19, 31)",19,31,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,20.0,20.0,10.0,5.0,15.0,-10.0,5.0,-10.0,-10.0,10.0
"(19, 35)",19,35,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,23.0,22.0,7.0,5.0,30.0,-23.0,2.0,-12.0,-7.0,10.0
"(19, 60)",19,60,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,25.0,15.0,15.0,20.0,45.0,-20.0,0.0,-5.0,-15.0,-5.0
"(19, 84)",19,84,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,30.0,5.0,0.0,5.0,20.0,-30.0,-5.0,5.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(526, 533)",526,533,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,25.0,13.0,12.0,0.0,-15.0,-15.0,5.0,7.0,-2.0,15.0
"(526, 537)",526,537,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,25.0,25.0,5.0,20.0,-10.0,5.0,5.0,-5.0,5.0,-5.0
"(526, 539)",526,539,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,20.0,15.0,15.0,10.0,-20.0,0.0,10.0,5.0,-5.0,5.0
"(526, 541)",526,541,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,0.0,20.0,0.0,10.0,-30.0,-20.0,30.0,0.0,10.0,5.0


# 상대방 학과 추가

In [17]:
# Partner's 'field' code, looked up by partner id.
df_pair['field_o'] = data_raw.loc[df_pair['pid'].values, 'field'].values

# 상대방 goal 추가

In [18]:
# Partner's 'goal' code, looked up by partner id.
df_pair['goal_o'] = data_raw.loc[df_pair['pid'].values, 'goal'].values

In [19]:
df_pair

Unnamed: 0,iid,pid,gender,age,field,attr,sinc,intel,fun,amb,...,pf_o_amb,pf_o_sha,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,shar_diff,field_o,goal_o
"(19, 2)",19,2,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,0.0,5.0,5.0,-5.0,0.0,-10.0,0.0,10.0,1.0,1.0
"(19, 31)",19,31,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,10.0,5.0,15.0,-10.0,5.0,-10.0,-10.0,10.0,1.0,1.0
"(19, 35)",19,35,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,7.0,5.0,30.0,-23.0,2.0,-12.0,-7.0,10.0,2.0,1.0
"(19, 60)",19,60,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,15.0,20.0,45.0,-20.0,0.0,-5.0,-15.0,-5.0,3.0,4.0
"(19, 84)",19,84,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,0.0,5.0,20.0,-30.0,-5.0,5.0,0.0,10.0,10.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(526, 533)",526,533,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,12.0,0.0,-15.0,-15.0,5.0,7.0,-2.0,15.0,5.0,2.0
"(526, 537)",526,537,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,5.0,20.0,-10.0,5.0,5.0,-5.0,5.0,-5.0,5.0,1.0
"(526, 539)",526,539,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,15.0,10.0,-20.0,0.0,10.0,5.0,-5.0,5.0,1.0,1.0
"(526, 541)",526,541,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,0.0,10.0,-30.0,-20.0,30.0,0.0,10.0,5.0,5.0,3.0


# 관심사 차이 정도 추가

In [20]:
# One '<interest>_diff' column per entry of interest_all, derived from that
# list so the two cannot get out of step.
interest_diff = [name + '_diff' for name in interest_all]

# Fetch both sides once instead of twice per interest.
subject_interests = data_raw.loc[df_pair['iid'].values, interest_all]
partner_interests = data_raw.loc[df_pair['pid'].values, interest_all]

# NOTE(review): this gap is partner minus subject, whereas age_diff and the
# preference *_diff columns are subject minus partner. The sign is kept as-is
# because downstream consumers may depend on it -- confirm it is intended.
for interest, diff_col in zip(interest_all, interest_diff):
    df_pair[diff_col] = partner_interests[interest].values - subject_interests[interest].values

In [21]:
df_pair

Unnamed: 0,iid,pid,gender,age,field,attr,sinc,intel,fun,amb,...,art_diff,museums_diff,reading_diff,shopping_diff,dining_diff,theater_diff,concerts_diff,movies_diff,tv_diff,music_diff
"(19, 2)",19,2,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,1.0,3.0,2.0,-4.0,0.0,8.0,0.0,1.0,0.0,1.0
"(19, 31)",19,31,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,2.0,2.0,-2.0,3.0,0.0,7.0,-1.0,1.0,7.0,0.0
"(19, 35)",19,35,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,3.0,3.0,-1.0,1.0,-2.0,7.0,0.0,1.0,8.0,0.0
"(19, 60)",19,60,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,0.0,0.0,-3.0,-2.0,-1.0,9.0,-1.0,0.0,1.0,2.0
"(19, 84)",19,84,Male,28.0,8.0,50.0,0.0,25.0,10.0,0.0,...,4.0,4.0,1.0,-2.0,-1.0,8.0,2.0,3.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(526, 533)",526,533,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,-2.0,-5.0,-5.0,-2.0,-3.0,-5.0,0.0,-1.0,-3.0,-1.0
"(526, 537)",526,537,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,1.0,-2.0,2.0,6.0,0.0,0.0,4.0,2.0,4.0,2.0
"(526, 539)",526,539,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,2.0,-1.0,0.0,3.0,-1.0,-3.0,1.0,1.0,2.0,-1.0
"(526, 541)",526,541,Female,26.0,11.0,10.0,10.0,30.0,20.0,10.0,...,-2.0,-7.0,-5.0,5.0,-2.0,-5.0,4.0,2.0,6.0,1.0


In [30]:
# Independent copy of the pair keys. DataFrame.copy() (deep by default) is the
# public way to get one; the __deepcopy__ hook is meant for the copy module
# and should not be called directly.
result_df = df_pair[['iid', 'pid']].copy()

# Aligned on the shared index with df_pair.
result_df['new'] = df_pair['art_diff']

result_df.shape

(4230, 3)

# 학과 클러스터링

In [22]:
df_pair['field'].value_counts()

8.0     1073
10.0     550
5.0      419
13.0     417
9.0      362
3.0      362
1.0      229
2.0      184
6.0      182
4.0      182
11.0      90
15.0      90
14.0      45
7.0       45
Name: field, dtype: int64

In [23]:
# Raw 'field' codes grouped into broader clusters.
humanities = [6.0, 7.0]
societies = [1.0, 3.0, 8.0, 11.0, 13.0]
natural = [2.0, 10.0]
engineers = [5.0]
medical = [4.0]
academic = [9.0]
art = [14.0, 15.0]

# Cluster id -> raw codes. Code 12.0 belongs to no named group and is mapped
# to its own cluster id 8.
field_clusters = {
    1: humanities,
    2: societies,
    3: natural,
    4: engineers,
    5: medical,
    6: academic,
    7: art,
    8: [12.0],
}
field_to_cluster = {
    code: cluster_id
    for cluster_id, codes in field_clusters.items()
    for code in codes
}

# Parallel lists (raw code -> cluster id) derived from the groups above, so
# the mapping has a single source of truth. They are reused for 'field_o'.
before = sorted(field_to_cluster)
after = [field_to_cluster[code] for code in before]

# NOTE: not idempotent -- cluster ids overlap with raw codes, so running this
# cell a second time would remap already-clustered values.
df_pair['field'] = df_pair['field'].replace(before, after)

In [24]:
df_pair['field'].value_counts()

2.0    2171
3.0     734
4.0     419
6.0     362
1.0     227
5.0     182
7.0     135
Name: field, dtype: int64

In [25]:
# Apply the same raw-code -> cluster mapping to the partner's field, then
# show the number of pairs per cluster. Like the 'field' remap, this must be
# run only once per freshly built df_pair.
df_pair['field_o'] = df_pair['field_o'].replace(to_replace=before, value=after)
df_pair['field_o'].value_counts()

2.0    2171
3.0     734
4.0     419
6.0     362
1.0     227
5.0     182
7.0     135
Name: field_o, dtype: int64

# nominal features one-hot encoding

In [26]:
# Encode gender as a flag: 'Male' -> False, 'Female' -> True. Values missing
# from the mapping come back from map() as NaN and are refilled with the
# original value.
gender_flags = {'Male': False, 'Female': True}
df_pair['gender'] = df_pair['gender'].map(gender_flags).fillna(df_pair['gender'])
df_pair['gender'].value_counts(dropna=False)

True     2115
False    2115
Name: gender, dtype: int64

In [27]:
# Mark the nominal code columns of both sides as categoricals ...
df_pair = df_pair.astype(
    dict.fromkeys(['field', 'field_o', 'goal', 'goal_o'], 'category')
)

# ... collect their names in column order, and one-hot encode them using the
# column name as the dummy-column prefix.
is_category = (df_pair.dtypes == 'category').values
features_nominal = df_pair.columns[is_category].values
print(features_nominal)
df_pair = pd.get_dummies(df_pair, prefix=features_nominal)

['field' 'goal' 'field_o' 'goal_o']


In [28]:
df_pair.columns

Index(['iid', 'pid', 'gender', 'age', 'attr', 'sinc', 'intel', 'fun', 'amb',
       'shar', 'exercise', 'hiking', 'sports', 'art', 'museums', 'reading',
       'shopping', 'dining', 'theater', 'concerts', 'movies', 'tv', 'music',
       'age_diff', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb',
       'pf_o_sha', 'attr_diff', 'sinc_diff', 'intel_diff', 'fun_diff',
       'amb_diff', 'shar_diff', 'exercise_diff', 'hiking_diff', 'sports_diff',
       'art_diff', 'museums_diff', 'reading_diff', 'shopping_diff',
       'dining_diff', 'theater_diff', 'concerts_diff', 'movies_diff',
       'tv_diff', 'music_diff', 'field_1.0', 'field_2.0', 'field_3.0',
       'field_4.0', 'field_5.0', 'field_6.0', 'field_7.0', 'goal_1.0',
       'goal_2.0', 'goal_3.0', 'goal_4.0', 'goal_5.0', 'goal_6.0',
       'field_o_1.0', 'field_o_2.0', 'field_o_3.0', 'field_o_4.0',
       'field_o_5.0', 'field_o_6.0', 'field_o_7.0', 'goal_o_1.0', 'goal_o_2.0',
       'goal_o_3.0', 'goal_o_4.0', 'goal_o_5.0'

# 이성의 모든 가치관 평균 점수

In [29]:
# Mean of each partner-preference column over all pairs of a subject,
# computed with a single groupby instead of one groupby per column.
# Each result is a Series indexed by 'iid' and named after its source column.
# NOTE(review): df_pair as built earlier pairs every subject with every
# participant of the other gender, so these means look identical for all
# subjects of the same gender -- confirm the feature is meant to be that coarse.
partner_pref_means = df_pair.groupby('iid')[
    ['pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha']
].mean()

subject_attractiveness_mean = partner_pref_means['pf_o_att']
subject_sincerity_mean = partner_pref_means['pf_o_sin']
subject_intelligence_mean = partner_pref_means['pf_o_int']
subject_fun_mean = partner_pref_means['pf_o_fun']
subject_ambition_mean = partner_pref_means['pf_o_amb']
subject_shared_interest_mean = partner_pref_means['pf_o_sha']

In [30]:
# Attach the per-subject means to every pair row. Each entry is
# (new column name, Series indexed by 'iid').
subject_mean_features = [
    ('subject_attractiveness_mean', subject_attractiveness_mean),
    ('subject_sincerity_mean', subject_sincerity_mean),
    ('subject_intelligence_mean', subject_intelligence_mean),
    ('subject_fun_mean', subject_fun_mean),
    ('subject_ambition_mean', subject_ambition_mean),
    ('subject_shared_interest_mean', subject_shared_interest_mean),
]

for mean_col, mean_series in subject_mean_features:
    # Renaming the Series up front means the merge adds exactly one new,
    # non-clashing column, so no '_x' / '_y' suffix clean-up is needed and
    # the existing pf_o_* columns are left untouched.
    # Dropping a previous copy of the column keeps the cell re-runnable.
    # As before, the merge replaces the tuple index with a fresh RangeIndex.
    df_pair = df_pair.drop(columns=[mean_col], errors='ignore').merge(
        right=mean_series.rename(mean_col),
        how='inner',
        on='iid'
    )

In [1]:
df_pair

NameError: name 'df_pair' is not defined

In [32]:
df_pair.columns

Index(['iid', 'pid', 'gender', 'age', 'attr', 'sinc', 'intel', 'fun', 'amb',
       'shar', 'exercise', 'hiking', 'sports', 'art', 'museums', 'reading',
       'shopping', 'dining', 'theater', 'concerts', 'movies', 'tv', 'music',
       'age_diff', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb',
       'pf_o_sha', 'attr_diff', 'sinc_diff', 'intel_diff', 'fun_diff',
       'amb_diff', 'shar_diff', 'exercise_diff', 'hiking_diff', 'sports_diff',
       'art_diff', 'museums_diff', 'reading_diff', 'shopping_diff',
       'dining_diff', 'theater_diff', 'concerts_diff', 'movies_diff',
       'tv_diff', 'music_diff', 'field_1.0', 'field_2.0', 'field_3.0',
       'field_4.0', 'field_5.0', 'field_6.0', 'field_7.0', 'goal_1.0',
       'goal_2.0', 'goal_3.0', 'goal_4.0', 'goal_5.0', 'goal_6.0',
       'field_o_1.0', 'field_o_2.0', 'field_o_3.0', 'field_o_4.0',
       'field_o_5.0', 'field_o_6.0', 'field_o_7.0', 'goal_o_1.0', 'goal_o_2.0',
       'goal_o_3.0', 'goal_o_4.0', 'goal_o_5.0'

In [33]:
df_pair.columns.shape

(81,)