In [181]:
import numpy as np              # arrays
import pandas as pd             # dataframes
import matplotlib.pyplot as plt # graphs
import seaborn as sns           # visualisations
from scipy import stats         # statistics

from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model
%matplotlib inline

In [182]:
data_raw = pd.read_csv(
    filepath_or_buffer='./input/Speed Dating Data.csv',
    engine='python'
)

In [183]:
cols = list(data_raw.columns)

for ele in cols:
    if 'rel' in ele:
        print(ele)

imprelig


In [184]:
relevant_features = [
    ['iid', 'int16'],
    ['gender', 'string'],
    ['age', 'int16'],
    ['field_cd', 'category'],
    ['attr1_1', 'int16'],
    ['sinc1_1', 'int16'],
    ['intel1_1', 'int16'],
    ['fun1_1', 'int16'],
    ['amb1_1', 'int16'],
    ['shar1_1', 'int16'],
    ['exercise', 'int16'],
    ['hiking', 'int16'],
    ['sports', 'int16'],
    ['art', 'int16'],
    ['museums', 'int16'],
    ['reading', 'int16'],
    ['shopping', 'int16'],
    ['dining', 'int16'],
    ['theater', 'int16'],
    ['concerts', 'int16'],
    ['movies', 'int16'],
    ['tv', 'int16'],
    ['music', 'int16'],
    ['goal', 'category'],
]

In [185]:
# create new dataframe containing relevant features
data = data_raw[[feature[0] for feature in relevant_features]]

In [186]:
data['gender'] = data.gender.map({1 : 'Male', 0 : 'Female'}).fillna(data.gender)

data.gender.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['gender'] = data.gender.map({1 : 'Male', 0 : 'Female'}).fillna(data.gender)


Male      4194
Female    4184
Name: gender, dtype: int64

In [187]:
data = data.astype({feature: datatype if all(data[feature].notna().values) 
                                    else 'float32' 
                                    if datatype == 'int16' 
                                    else datatype for (feature, datatype) in relevant_features})

In [188]:
data

Unnamed: 0,iid,gender,age,field_cd,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,exercise,hiking,sports,art,museums,reading,shopping,dining,theater,concerts,movies,tv,music,goal
0,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
1,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
2,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
3,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
4,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,Male,25.0,18.0,70.0,0.0,15.0,15.0,0.0,0.0,5.0,7.0,8.0,10.0,10.0,8.0,7.0,10.0,7.0,10.0,9.0,3.0,10.0,1.0
8374,552,Male,25.0,18.0,70.0,0.0,15.0,15.0,0.0,0.0,5.0,7.0,8.0,10.0,10.0,8.0,7.0,10.0,7.0,10.0,9.0,3.0,10.0,1.0
8375,552,Male,25.0,18.0,70.0,0.0,15.0,15.0,0.0,0.0,5.0,7.0,8.0,10.0,10.0,8.0,7.0,10.0,7.0,10.0,9.0,3.0,10.0,1.0
8376,552,Male,25.0,18.0,70.0,0.0,15.0,15.0,0.0,0.0,5.0,7.0,8.0,10.0,10.0,8.0,7.0,10.0,7.0,10.0,9.0,3.0,10.0,1.0


In [189]:
list(data.columns)

['iid',
 'gender',
 'age',
 'field_cd',
 'attr1_1',
 'sinc1_1',
 'intel1_1',
 'fun1_1',
 'amb1_1',
 'shar1_1',
 'exercise',
 'hiking',
 'sports',
 'art',
 'museums',
 'reading',
 'shopping',
 'dining',
 'theater',
 'concerts',
 'movies',
 'tv',
 'music',
 'goal']

In [190]:
new_col = [
 'iid',
 'gender',
 'age',
 'field',
 'attr',
 'sinc',
 'intel',
 'fun',
 'amb',
 'shar',
 'exercise',
 'hiking',
 'sports',
 'art',
 'museums',
 'reading',
 'shopping',
 'dining',
 'theater',
 'concerts',
 'movies',
 'tv',
 'music',
 'goal'
]

In [191]:
data.columns = new_col

In [192]:
data_unique = data.drop_duplicates(subset="iid")

In [193]:
len(data_unique)

551

In [194]:
data_unique.index = [i for i in range(len(data_unique))]

In [195]:
data_unique['gender'].value_counts()

Male      277
Female    274
Name: gender, dtype: Int64

In [196]:
data_unique

Unnamed: 0,iid,gender,age,field,attr,sinc,intel,fun,amb,shar,exercise,hiking,sports,art,museums,reading,shopping,dining,theater,concerts,movies,tv,music,goal
0,1,Female,21.0,1.0,15.0,20.0,20.0,15.0,15.0,15.0,8.0,5.0,9.0,1.0,1.0,6.0,8.0,9.0,1.0,10.0,10.0,9.0,9.0,2.0
1,2,Female,24.0,1.0,45.0,5.0,25.0,20.0,0.0,5.0,7.0,3.0,3.0,6.0,8.0,10.0,3.0,10.0,9.0,7.0,8.0,1.0,8.0,1.0
2,3,Female,25.0,2.0,35.0,10.0,35.0,10.0,10.0,0.0,7.0,8.0,3.0,5.0,5.0,7.0,8.0,8.0,7.0,7.0,7.0,8.0,5.0,6.0
3,4,Female,23.0,1.0,20.0,20.0,20.0,20.0,10.0,10.0,6.0,7.0,1.0,7.0,6.0,7.0,1.0,7.0,9.0,8.0,7.0,7.0,7.0,1.0
4,5,Female,21.0,1.0,20.0,5.0,25.0,25.0,10.0,15.0,7.0,6.0,7.0,8.0,6.0,6.0,8.0,7.0,6.0,3.0,6.0,8.0,7.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,548,Male,30.0,8.0,40.0,10.0,20.0,10.0,0.0,20.0,4.0,2.0,10.0,2.0,3.0,9.0,4.0,10.0,4.0,7.0,6.0,6.0,10.0,1.0
547,549,Male,28.0,8.0,20.0,20.0,20.0,20.0,0.0,20.0,6.0,4.0,8.0,7.0,7.0,7.0,8.0,9.0,8.0,10.0,8.0,7.0,10.0,1.0
548,550,Male,30.0,8.0,30.0,3.0,30.0,30.0,3.0,4.0,5.0,3.0,5.0,8.0,7.0,8.0,6.0,9.0,7.0,4.0,7.0,3.0,6.0,2.0
549,551,Male,27.0,8.0,40.0,20.0,20.0,20.0,0.0,0.0,7.0,2.0,6.0,3.0,7.0,10.0,7.0,6.0,5.0,7.0,6.0,2.0,7.0,1.0


In [199]:
data_unique = data_unique.dropna()

In [200]:
data_unique.isnull().sum()

iid         0
gender      0
age         0
field       0
attr        0
sinc        0
intel       0
fun         0
amb         0
shar        0
exercise    0
hiking      0
sports      0
art         0
museums     0
reading     0
shopping    0
dining      0
theater     0
concerts    0
movies      0
tv          0
music       0
goal        0
dtype: int64

In [201]:
female_list = list(data_unique[data_unique['gender'] == "Female"]['iid'])
male_list = list(data_unique[data_unique['gender'] == "Male"]['iid'])

In [202]:
import random
female_list = random.choices(female_list, k=50)
male_list = random.choices(male_list, k = 50)

In [203]:
total_list = female_list + male_list

In [204]:
len(total_list)

100

In [205]:
print(len(female_list))
print(len(male_list))

50
50


In [206]:
data_unique = data_unique.loc[data_unique['iid'].isin(total_list)]

In [208]:
data_unique.to_csv('./tmi_temp_user_data.csv', sep=',', na_rep='NaN')