# SPEED DATING EXPERIMENT

### DS-GA-23 Final Project
#### Miranda Remmer


****

In [59]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, feature_selection, neighbors, metrics, grid_search, cross_validation


%matplotlib inline
plt.style.use('ggplot')

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 20)

In [2]:
df_raw = pd.read_csv(os.path.join('..', 'CODE', 'speed-dating-experiment', 'Speed Dating Data.csv'))

df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [3]:
df_raw.columns

Index([u'iid', u'id', u'gender', u'idg', u'condtn', u'wave', u'round',
       u'position', u'positin1', u'order',
       ...
       u'attr3_3', u'sinc3_3', u'intel3_3', u'fun3_3', u'amb3_3', u'attr5_3',
       u'sinc5_3', u'intel5_3', u'fun5_3', u'amb5_3'],
      dtype='object', length=195)

In [4]:
df_raw

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,10.0,5,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,21,22,13,10.0,4,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,21,22,19,10.0,10,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,21,22,3,10.0,16,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0


*******

## Creating New Dataset with Desired Columns


In [5]:
subset_df = df_raw[['iid', 'gender', 'round', 'wave','pid', 
                    'samerace', 'imprace', 'imprelig', 
                    'age', 'age_o', 
                    'match', 'dec', 'dec_o', 'condtn', 
                    'exphappy', 'expnum', 'goal', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o', 'fun_o','amb_o','shar_o', 
                    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 
                    'attr3_2', 'sinc3_2', 'fun3_2', 'intel3_2', 'amb3_2', 
                    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1', 
                    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2', 
                    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s', 
                    'attr1_1', 'sinc1_1', 'fun1_1', 'intel1_1', 'amb1_1', 'shar1_1', 
                    'attr1_2', 'sinc1_2', 'fun1_2', 'intel1_2', 'amb1_2', 'shar1_2', 
                    'pf_o_att','pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha', 
                    'attr1_s', 'sinc1_s', 'fun1_s', 'intel1_s', 'amb1_s', 'shar1_s', 
                    'attr7_2', 'sinc7_2', 'fun7_2', 'intel7_2', 'amb7_2', 'shar7_2', 
                    'attr4_1', 'sinc4_1', 'fun4_1', 'intel4_1', 'amb4_1', 'shar4_1', 
                    'attr4_2', 'sinc4_2', 'fun4_2', 'intel4_2', 'amb4_2', 'shar4_2', 
                    'attr2_1', 'sinc2_1', 'fun2_1', 'intel2_1', 'amb2_1', 'shar2_1', 
                    'attr2_2', 'sinc2_2', 'fun2_2', 'intel2_2', 'amb2_2', 'shar2_2']]

subset_df

Unnamed: 0,iid,gender,round,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
0,1,0,10,1,11.0,0,2.0,4.0,21.0,27.0,...,20.0,15.0,5.0,5.0,,,,,,
1,1,0,10,1,12.0,0,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
2,1,0,10,1,13.0,1,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
3,1,0,10,1,14.0,0,2.0,4.0,21.0,23.0,...,20.0,15.0,5.0,5.0,,,,,,
4,1,0,10,1,15.0,0,2.0,4.0,21.0,24.0,...,20.0,15.0,5.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,22,21,526.0,0,1.0,1.0,25.0,26.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8374,552,1,22,21,527.0,0,1.0,1.0,25.0,24.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8375,552,1,22,21,528.0,0,1.0,1.0,25.0,29.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8376,552,1,22,21,529.0,0,1.0,1.0,25.0,22.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0


In [6]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 113 entries, iid to shar2_2
dtypes: float64(104), int64(9)
memory usage: 7.2 MB


### Renaming Features:

In [7]:
def renameFeature(feature, new_feature_name, df):
    df.rename(columns={feature: new_feature_name}, inplace = True)

In [8]:
#renaming 'round' to 'met_count'

renameFeature('round', 'met_count', subset_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


### Reassign 


In [9]:
#df = df.set_index('iid')

### Feature Info:

******

### Seperating Dataset into Two datasets: 1 Female; 1 Male

In [10]:
df_female_raw = subset_df[subset_df.gender == 0]
df_female_raw

Unnamed: 0,iid,gender,met_count,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
0,1,0,10,1,11.0,0,2.0,4.0,21.0,27.0,...,20.0,15.0,5.0,5.0,,,,,,
1,1,0,10,1,12.0,0,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
2,1,0,10,1,13.0,1,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
3,1,0,10,1,14.0,0,2.0,4.0,21.0,23.0,...,20.0,15.0,5.0,5.0,,,,,,
4,1,0,10,1,15.0,0,2.0,4.0,21.0,24.0,...,20.0,15.0,5.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7889,530,0,22,21,548.0,0,1.0,1.0,22.0,30.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7890,530,0,22,21,549.0,0,1.0,1.0,22.0,28.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7891,530,0,22,21,550.0,0,1.0,1.0,22.0,30.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7892,530,0,22,21,551.0,0,1.0,1.0,22.0,27.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0


In [11]:
df_male_raw = subset_df[subset_df.gender == 1]
df_male_raw

Unnamed: 0,iid,gender,met_count,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
100,11,1,10,1,1.0,0,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
101,11,1,10,1,2.0,1,7.0,3.0,27.0,24.0,...,20.0,20.0,25.0,5.0,,,,,,
102,11,1,10,1,3.0,1,7.0,3.0,27.0,25.0,...,20.0,20.0,25.0,5.0,,,,,,
103,11,1,10,1,4.0,1,7.0,3.0,27.0,23.0,...,20.0,20.0,25.0,5.0,,,,,,
104,11,1,10,1,5.0,1,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,22,21,526.0,0,1.0,1.0,25.0,26.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8374,552,1,22,21,527.0,0,1.0,1.0,25.0,24.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8375,552,1,22,21,528.0,0,1.0,1.0,25.0,29.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8376,552,1,22,21,529.0,0,1.0,1.0,25.0,22.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0


# Compress Features Within Dataset to Get Averages & Sums

### Function for Sums:

In [60]:
def getSum(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubet = df[df.iid == iid]
        if (iidsubset[feature].isnull().any()) == False: 
            sum = iidsubet[feature].sum()
            list.append(sum)
        else: 
            return np.NaN
    return pd.Series(list)


### Functions to Get Averages of a Feature Set:

Function to return **average rating score**:  
(divis by met_count)

DIFFERENT VALUE IN FEATURE DATASET PER IID #

In [131]:
def getAveScore(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().any()) == False:
            denom = iidsubset['met_count'].unique()[0]
            sum = iidsubset[feature].sum()
            divis = sum/denom
            list.append(divis)
        else: 
            return np.NaN
    return pd.Series(list)

In [14]:
#EXAMPLE

df_female_raw[['iid', 'attr_o', 'met_count']]
#get average scores of 'attr_o' for each iid 

Unnamed: 0,iid,attr_o,met_count
0,1,6.0,10
1,1,7.0,10
2,1,10.0,10
3,1,7.0,10
4,1,8.0,10
...,...,...,...
7889,530,1.0,22
7890,530,4.0,22
7891,530,4.0,22
7892,530,4.0,22


Function to return **general average** 
(divide by met_count)

SAME VALUE IN FEATURE DATASET PER IID #

In [132]:
def getAve(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().any()) == False:
            denom = iidsubset['met_count'].unique()[0]
            sum = iidsubset[feature].unique()[0]
            divis = sum/denom
            list.append(divis)
        else: 
            return np.NaN
    return pd.Series(list)

In [16]:
#EXAMPLE

df_female_raw[['iid', 'match_es', 'met_count']]
#want to get average of match_es (match_es/met_count)

Unnamed: 0,iid,match_es,met_count
0,1,4.0,10
1,1,4.0,10
2,1,4.0,10
3,1,4.0,10
4,1,4.0,10
...,...,...,...
7889,530,6.0,22
7890,530,6.0,22
7891,530,6.0,22
7892,530,6.0,22


### Function to Grab 1 Value of Feature Data for Each iid:

#### FOR AL FUNCTIONS: WRITE WHAT IT TAKES IN AND WHAT IT RETURNS 


EG. take in and return series 

In [126]:
def getValueSet(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().any()) == False:
            denom = len(iidsubset)
            sum = iidsubset[feature].sum()
            divis = sum/denom
            list.append(divis)
        else:
            return np.NaN
    return pd.Series(list)

#test: getValueSet(df_female_raw, 'met_count')

#### Function to return expnum_ave (out of 20 people) and exphappy_ave (out of 10 pts):

In [127]:
def expAve(feature, denom, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().any()) == False:
            f = iidsubset[feature].unique()[0]
            ave = f / denom
            list.append(ave)
        else:
            return np.NaN
    return pd.Series(list)

#EXPNUM_AVE (/20ppl)
#df_female_condensed['expnum_ave'] = expAve('expnum', 20, df_female_condensed)
#EXPHAPPY_AVE
#df['exphappy_ave'] = getAve('exphappy', 10, DF)  #EXPHAPPY_AVE (/10PTS) ##????

## TEST FOR NAN VALUES - NEED TO REMOVE

In [120]:
len(df_female_raw.expnum.dropna())

872

In [121]:
df_female_raw.expnum.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
7889     True
7890     True
7891     True
7892     True
7893     True
Name: expnum, dtype: bool

In [128]:
test = pd.DataFrame()
test['iid'] = df_female_raw.iid.unique()
test

Unnamed: 0,iid
0,1
1,2
2,3
3,4
4,5
...,...
269,526
270,527
271,528
272,529


In [129]:
test['expnum'] = getValueSet('expnum', df_female_raw)
test

Unnamed: 0,iid,expnum
0,1,
1,2,
2,3,
3,4,
4,5,
...,...,...
269,526,
270,527,
271,528,
272,529,


In [124]:
len(test.expnum.dropna())

0

In [130]:
test['gender'] = getValueSet('gender', df_female_raw)
test

Unnamed: 0,iid,expnum,gender
0,1,,0
1,2,,0
2,3,,0
3,4,,0
4,5,,0
...,...,...,...
269,526,,0
270,527,,0
271,528,,0
272,529,,0


### Function to Convert Values from Old DF to New Condensed DF:

In [57]:
def ConvertDF(DF, df):  #DF = old dataframe    #df = new data frame
    df['iid'] = DF.iid.unique()
    df['gender'] = getValueSet('gender', DF)
    df['met_count'] = getValueSet('met_count', DF)
    df['condtn'] = getValueSet('condtn', DF)
    df['exphappy'] = getValueSet('exphappy', DF)
    df['expnum'] = getValueSet('expnum', DF)
    df['match_es'] = getValueSet('match_es', DF)
    df['goal'] = getValueSet('goal', DF)
    
    
    #RENAME
    df['attr3_1'] = getValueSet('attr3_1', DF)
    df['sinc3_1'] = getValueSet('sinc3_1', DF)
    df['fun3_1'] = getValueSet('fun3_1', DF)
    df['intel3_1'] = getValueSet('intel3_1', DF)
    df['amb3_1']= getValueSet('amb3_1', DF)
    
    df['attr3_2'] = getValueSet('attr3_2', DF)
    df['sinc3_2'] = getValueSet('sinc3_2', DF)
    df['fun3_2'] = getValueSet('fun3_2', DF)
    df['intel3_2']= getValueSet('intel3_2', DF)
    df['amb3_2'] = getValueSet('amb3_2', DF)
    
    df['attr5_1'] = getValueSet('attr5_1', DF)
    df['sinc5_1'] = getValueSet('sinc5_1', DF)
    df['fun5_1'] = getValueSet('fun5_1', DF)
    df['intel5_1'] = getValueSet('intel5_1', DF)
    df['amb5_1'] = getValueSet('amb5_1', DF)
    
    df['attr5_2'] = getValueSet('attr5_2', DF)
    df['sinc5_2'] = getValueSet('sinc5_2', DF)
    df['fun5_2'] = getValueSet('fun5_2', DF)
    df['intel5_2'] = getValueSet('intel5_2', DF)
    df['amb5_2'] = getValueSet('amb5_2', DF)
    
    df['attr3_s'] = getValueSet('attr3_s', DF)
    df['sinc3_s'] = getValueSet('sinc3_s', DF)
    df['fun3_s'] = getValueSet('fun3_s', DF)
    df['intel3_s'] = getValueSet('intel3_s', DF)
    df['amb3_s'] = getValueSet('amb3_s', DF)
    
    df['attr1_1'] = getValueSet('attr1_1', DF)
    df['sinc1_1'] = getValueSet('sinc1_1', DF)
    df['fun1_1'] = getValueSet('fun1_1', DF)
    df['intel1_1'] = getValueSet('intel1_1', DF)
    df['amb1_1'] = getValueSet('amb1_1', DF)
    df['shar1_1'] = getValueSet('shar1_1', DF)
    
    df['attr1_s'] = getValueSet('attr1_s', DF)
    df['sinc1_s'] = getValueSet('sinc1_s', DF)
    df['fun1_s'] = getValueSet('fun1_s', DF)
    df['intel1_s'] = getValueSet('intel1_s', DF)
    df['amb1_s'] = getValueSet('amb1_s', DF)
    df['shar1_s'] = getValueSet('shar1_s', DF)
    
    df['attr1_2'] = getValueSet('attr1_2', DF)
    df['sinc1_2'] = getValueSet('sinc1_2', DF)
    df['fun1_2'] = getValueSet('fun1_2', DF)
    df['intel1_2'] = getValueSet('intel1_2', DF)
    df['amb1_2'] = getValueSet('amb1_2', DF)
    df['shar1_2'] = getValueSet('shar1_2', DF)
    
    df['attr7_2'] = getValueSet('attr7_2', DF)
    df['sinc7_2'] = getValueSet('sinc7_2', DF)
    df['fun7_2'] = getValueSet('fun7_2', DF)
    df['intel7_2'] = getValueSet('intel7_2', DF)
    df['amb7_2'] = getValueSet('amb7_2', DF)
    df['shar7_2'] = getValueSet('shar7_2', DF)
    
    df['attr4_1'] = getValueSet('attr4_1', DF)
    df['sinc4_1'] = getValueSet('sinc4_1', DF)
    df['fun4_1'] = getValueSet('fun4_1', DF)
    df['intel4_1'] = getValueSet('intel4_1', DF)
    df['amb4_1'] = getValueSet('amb4_1', DF)
    df['shar4_1'] = getValueSet('shar4_1', DF)
    
    df['attr4_2'] = getValueSet('attr4_2', DF)
    df['sinc4_2'] = getValueSet('sinc4_2', DF)
    df['fun4_2'] = getValueSet('fun4_2', DF)
    df['intel4_2'] = getValueSet('intel4_2', DF)
    df['amb4_2'] = getValueSet('amb4_2', DF)
    df['shar4_2'] = getValueSet('shar4_2', DF)
       
    df['attr2_1'] = getValueSet('attr2_1', DF)
    df['sinc2_1'] = getValueSet('sinc2_1', DF)
    df['fun2_1'] = getValueSet('fun2_1', DF)
    df['intel2_1'] = getValueSet('intel2_1', DF)
    df['amb2_1'] = getValueSet('amb2_1', DF)
    df['shar2_1'] = getValueSet('shar2_1', DF)
       
    df['attr2_2'] = getValueSet('attr2_2', DF)
    df['sinc2_2'] = getValueSet('sinc2_2', DF)
    df['fun2_2'] = getValueSet('fun2_2', DF)
    df['intel2_2'] = getValueSet('intel2_2', DF)
    df['amb2_2'] = getValueSet('amb2_2', DF)
    df['shar2_2'] = getValueSet('shar2_2', DF)
    
    df['match_sum'] = getSum('match', DF)
    df['dec_sum'] = getSum('dec', DF) #sum of subject's decisions (num of 'yes'')
    df['dec_o_sum'] = getSum('dec_o', DF) #sum of parnter's decisions (num of 'yes'')
    
    df['match_es_ave'] = getAve('match_es', DF)  #MATCH_ES_AVE  = % of people they think they'll match with
    df['like_ave'] = getAveScore('like', DF)  #LIKE_AVE ##?
    df['like_o_ave'] = getAveScore('like_o', DF)  #LIKE_O_AVE ##?
    df['prob_ave'] = getAveScore('prob', DF)  #PROB_AVE ##?
    df['prob_o_ave'] = getAveScore('prob_o', DF)  #PROB_O_AVE ##?
       
    df['dec_ave'] = getAveScore('dec', DF)  #DEC_AVE (average decision of subject to want to date)
    df['dec_o_ave'] = getAveScore('dec_o', DF)  #DEC_AVE (average dec of partners to want to date subject)
    df['match_ave'] = getAveScore('match', DF) #MATCH_AVE (average match count/#of people met with)
    
    df['attr_ave'] = getAveScore('attr', DF) 
    df['sinc_ave'] = getAveScore('sinc', DF)  
    df['intel_ave'] = getAveScore('intel', DF) 
    df['fun_ave'] = getAveScore('fun', DF) 
    df['amb_ave'] = getAveScore('amb',  DF) 
    df['shar_ave'] = getAveScore('shar', DF) 
    
    df['attr_o_ave'] = getAveScore('attr_o', DF) 
    df['sinc_o_ave'] = getAveScore('sinc_o', DF)  
    df['intel_o_ave'] = getAveScore('intel_o', DF) 
    df['fun_o_ave'] = getAveScore('fun_o', DF) 
    df['amb_o_ave'] = getAveScore('amb_o', DF) 
    df['shar_o_ave'] = getAveScore('shar_o', DF) 
    
    df['pf_o_att_ave'] = getAveScore('pf_o_att',DF) 
    df['pf_o_sin_ave'] = getAveScore('pf_o_sin', DF)  
    df['pf_o_int_ave'] = getAveScore('pf_o_int', DF) 
    df['pf_o_fun_ave'] = getAveScore('pf_o_fun', DF) 
    df['pf_o_amb_ave'] = getAveScore('pf_o_amb', DF) 
    df['pf_o_sha_ave'] = getAveScore('pf_o_sha', DF) 
    
    
    df['expnum_ave'] = expAve('expnum', 20, df)
    df['exphappy_ave'] = expAve('exphappy', 10, df)
    
    return df

### *Female Dataset*

> #### feature info here


>'Yes' Sum (#of decision = yes):

> dec_o sum = sum of 'yes' per men for women

Creating empty DF:

In [55]:
df_female_condensed = pd.DataFrame()

In [58]:
ConvertDF(df_female_raw,df_female_condensed)

Unnamed: 0,iid,gender,met_count,condtn,exphappy,expnum,match_es,goal,attr3_1,sinc3_1,...,amb_o_ave,shar_o_ave,pf_o_att_ave,pf_o_sin_ave,pf_o_int_ave,pf_o_fun_ave,pf_o_amb_ave,pf_o_sha_ave,expnum_ave,exphappy_ave
0,1,0,10,1,3.0,2.0,4.0,2.0,6.0,8.0,...,8.000000,7.100000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.10,0.3
1,2,0,10,1,4.0,5.0,3.0,1.0,7.0,5.0,...,7.500000,6.500000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.25,0.4
2,3,0,10,1,4.0,2.0,0.0,6.0,8.0,9.0,...,6.400000,5.400000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.10,0.4
3,4,0,10,1,1.0,2.0,2.0,1.0,7.0,8.0,...,7.700000,7.200000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.10,0.1
4,5,0,10,1,7.0,10.0,0.0,2.0,6.0,3.0,...,7.800000,6.200000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.50,0.7
5,6,0,10,1,4.0,3.0,4.0,1.0,5.0,7.0,...,8.000000,6.300000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.15,0.4
6,7,0,10,1,7.0,3.0,2.0,1.0,6.0,6.0,...,7.500000,6.700000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.15,0.7
7,8,0,10,1,6.0,10.0,0.0,1.0,7.0,4.0,...,6.700000,6.800000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.50,0.6
8,9,0,10,1,6.0,15.0,1.0,1.0,7.0,6.0,...,6.900000,6.300000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.75,0.6
9,10,0,10,1,7.0,10.0,0.0,2.0,6.0,8.0,...,6.200000,5.200000,44.233000,7.911000,16.511000,16.911000,4.511000,9.922000,0.50,0.7


### *Male Dataset*

Create empty DF:

In [22]:
df_male_condensed = pd.DataFrame()

In [23]:
ConvertDF(df_male_raw,df_male_condensed)

Unnamed: 0,iid,gender,met_count,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
100,11,1,10,1,1.0,0,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
101,11,1,10,1,2.0,1,7.0,3.0,27.0,24.0,...,20.0,20.0,25.0,5.0,,,,,,
102,11,1,10,1,3.0,1,7.0,3.0,27.0,25.0,...,20.0,20.0,25.0,5.0,,,,,,
103,11,1,10,1,4.0,1,7.0,3.0,27.0,23.0,...,20.0,20.0,25.0,5.0,,,,,,
104,11,1,10,1,5.0,1,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,22,21,526.0,0,1.0,1.0,25.0,26.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8374,552,1,22,21,527.0,0,1.0,1.0,25.0,24.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8375,552,1,22,21,528.0,0,1.0,1.0,25.0,29.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8376,552,1,22,21,529.0,0,1.0,1.0,25.0,22.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0


## Set index to iid

In [24]:
#df = df.set_index('iid')

## drop outliers? drop nan values? (for each compressed feature)?

EG:
    

len(subset_df.match_es)

len(subset_df.match_es.dropna())

subset_df.match_es.dropna(inplace = True) #removing NaN values

len(subset_df.match_es)

EG:

Q1 = df.Newspaper.quantile(0.25)
Q3 = df.Newspaper.quantile(0.75)

IQR = Q3 - Q1

IQR


df.drop(df[df.Newspaper > Q3 + 1.5 * IQR].index, inplace = True)

len(df)