# SPEED DATING EXPERIMENT

### DS-GA-23 Final Project
#### Miranda Remmer


****

In [1]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, feature_selection, neighbors, metrics, grid_search, cross_validation

%run 'functions_datingexp.ipynb'

%matplotlib inline
plt.style.use('ggplot')

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 20)

In [2]:
df_raw = pd.read_csv(os.path.join('..', 'CODE', 'speed-dating-experiment', 'Speed Dating Data.csv'))

df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


Look at the data to check the column names:

In [3]:
df_raw.columns

Index([u'iid', u'id', u'gender', u'idg', u'condtn', u'wave', u'round',
       u'position', u'positin1', u'order',
       ...
       u'attr3_3', u'sinc3_3', u'intel3_3', u'fun3_3', u'amb3_3', u'attr5_3',
       u'sinc5_3', u'intel5_3', u'fun5_3', u'amb5_3'],
      dtype='object', length=195)

In [4]:
df_raw

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,10.0,5,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,21,22,13,10.0,4,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,21,22,19,10.0,10,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,21,22,3,10.0,16,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0


### Creating New Dataset with Desired Columns


In [5]:
subset_df = df_raw[['iid', 'pid', 'gender', 'age', 'round',
                    'match', 'dec', 'dec_o', 
                    'exphappy', 'expnum', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 
                    'attr3_2', 'sinc3_2', 'fun3_2', 'intel3_2', 'amb3_2', 
                    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1', 
                    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2', 
                    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s',
                    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o', 'fun_o','amb_o','shar_o']]
subset_df

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7.0,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10.0,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,526.0,1,25.0,22,0,0,1,10.0,,...,5.0,5.0,,,10.0,5.0,3.0,2.0,6.0,5.0
8374,552,527.0,1,25.0,22,0,0,0,10.0,,...,8.0,4.0,4.0,,6.0,3.0,7.0,3.0,7.0,2.0
8375,552,528.0,1,25.0,22,0,0,0,10.0,,...,8.0,8.0,8.0,,2.0,1.0,2.0,2.0,2.0,1.0
8376,552,529.0,1,25.0,22,0,0,1,10.0,,...,5.0,4.0,,5.0,5.0,7.0,5.0,5.0,3.0,6.0


In [6]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 52 columns):
iid         8378 non-null int64
pid         8368 non-null float64
gender      8378 non-null int64
age         8283 non-null float64
round       8378 non-null int64
match       8378 non-null int64
dec         8378 non-null int64
dec_o       8378 non-null int64
exphappy    8277 non-null float64
expnum      1800 non-null float64
match_es    7205 non-null float64
like        8138 non-null float64
prob        8069 non-null float64
like_o      8128 non-null float64
prob_o      8060 non-null float64
attr3_1     8273 non-null float64
sinc3_1     8273 non-null float64
fun3_1      8273 non-null float64
intel3_1    8273 non-null float64
amb3_1      8273 non-null float64
attr3_2     7463 non-null float64
sinc3_2     7463 non-null float64
fun3_2      7463 non-null float64
intel3_2    7463 non-null float64
amb3_2      7463 non-null float64
attr5_1     4906 non-null float64
sinc5_1     4906 non-

## Clean Up Data

- Drop rows with irrelevant data
- Insert missing data where relevant 


### Cleaning up data: PART 1: Remove rows with NaN values

In [7]:
# Create a new, independent DF for the cleaned data.
# .copy() (rather than the slice subset_df[:]) guarantees that later cleaning
# steps cannot write through to subset_df or trigger SettingWithCopy warnings.
subset_df_clean = subset_df.copy()

#### Remove any data where the subject or their partner didn't participate during the experiment.

I.e. any rows where the subject didn't rate the partner **and** the partner didn't rate the subject.
Where the subject rated the partner but the partner didn't rate the subject (or vice versa), one possible conclusion is that the person in question didn't want to give a low rating. In such cases, fill NaN values with 0.

In [8]:
# Rows where no attribute rating was recorded in either direction:
# every partner-rates-subject (*_o) column and every subject-rates-partner
# column is NaN.
rating_cols = ['attr_o', 'sinc_o', 'fun_o', 'intel_o', 'amb_o', 'shar_o',
               'attr', 'sinc', 'fun', 'intel', 'amb', 'shar']
no_ratings_mask = subset_df_clean[rating_cols].isnull().all(axis=1)
df_nan_exp_ratings_all = subset_df_clean[no_ratings_mask]

len(df_nan_exp_ratings_all)

132

In [9]:
len(subset_df_clean) #test

8378

In [10]:
# Dropping Data
subset_df_clean = dropData(subset_df_clean, df_nan_exp_ratings_all)

In [11]:
len(subset_df_clean) #test

8246

#### Remove any data where participant didn't fill out survey questions pertaining to test question:

In [12]:
# Sub-frame of rows where the sampled self-view survey columns
# (from the 3_1, 3_2, 5_1 and 5_2 question sets) are all NaN.
viewself_cols = ['attr3_1', 'sinc3_1', 'intel3_1', 'attr3_2',
                 'amb5_1', 'attr5_2', 'attr5_1']
viewself_all_nan = subset_df_clean[viewself_cols].isnull().all(axis=1)
df_viewself_nan = subset_df_clean[viewself_all_nan]

len(df_viewself_nan)

44

In [13]:
#view data
df_viewself_nan[['iid', 'attr', 'attr_o', 'attr5_1', 'fun3_2', 'match_es', 'attr3_s']]

Unnamed: 0,iid,attr,attr_o,attr5_1,fun3_2,match_es,attr3_s
312,28,3.0,8.0,,,3.0,
313,28,2.0,8.0,,,3.0,
314,28,3.0,7.0,,,3.0,
315,28,4.0,5.0,,,3.0,
316,28,4.0,5.0,,,3.0,
...,...,...,...,...,...,...,...
6405,414,10.0,6.0,,,2.0,8.0
6406,414,10.0,7.0,,,2.0,8.0
6407,414,7.0,4.0,,,2.0,8.0
6408,414,7.0,7.0,,,2.0,8.0


In [14]:
# Dropping Data
subset_df_clean =dropData(subset_df_clean, df_viewself_nan) 

In [15]:
len(subset_df_clean) #test

8202

In [16]:
#create sub-df to pull any data with missing values for features 3_s
df_viewself_nan2 = subset_df_clean[(subset_df_clean.attr3_s.isnull()) & (subset_df_clean.sinc3_s.isnull()) & 
                                  (subset_df_clean.intel3_s.isnull()) & (subset_df_clean.fun3_s.isnull()) &
                                  (subset_df_clean.amb3_s.isnull())] 
                                  

len(df_viewself_nan2)  #choosing not to drop b/c too many observations

4239

In [17]:
#test for NaN values with 'match'
df_match_nan = subset_df_clean[(subset_df_clean.match.isnull())]
len(df_match_nan)

0

In [18]:
#Grabing subject data with no scores for partner (including  like & prob) but subject chose 'yes' 
#(partner has scored subject)
df_nan_atr_like_prob_dec1 = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) &
                            (subset_df_clean.like.isnull()) & (subset_df_clean.prob.isnull()) 
                            & (subset_df_clean.dec == 1)]
len(df_nan_atr_like_prob_dec1) 

1

In [19]:
len(subset_df_clean) #test

8202

In [20]:
# Dropping data from DF
subset_df_clean = dropData(subset_df_clean, df_nan_atr_like_prob_dec1)

In [21]:
len(subset_df_clean) #test

8201

In [22]:
#Grabing subject data with no scores for partner but has a score for like 
#(determine if data is irrelvant & should be dropped or if attribute scores for partner should be filled in)
#(partner has scored subject)
df_nan_atr_likeV = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) &
                            (subset_df_clean.like.notnull())]
len(df_nan_atr_likeV) 

5

In [23]:
df_nan_atr_likeV[['iid', 'pid', 'match', 'dec', 'dec_o', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                 'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o','attr5_1', 'attr5_2', 'intel5_2', 'attr3_s']]

Unnamed: 0,iid,pid,match,dec,dec_o,match_es,like,prob,like_o,prob_o,...,fun,amb,shar,attr_o,sinc_o,intel_o,attr5_1,attr5_2,intel5_2,attr3_s
705,50,32.0,0,1,0,5.0,7.0,7.0,6.0,3.0,...,,,,7.0,8.0,8.0,,,,
711,50,38.0,1,1,1,5.0,7.0,8.0,7.0,6.0,...,,,,8.0,8.0,8.0,,,,
712,50,39.0,0,0,1,5.0,8.0,,7.0,5.0,...,,,,5.0,6.0,8.0,,,,
2546,187,182.0,0,0,0,,5.0,6.0,9.0,8.0,...,,,,6.0,8.0,9.0,,,,9.0
7639,519,540.0,0,0,0,0.5,6.0,8.0,5.0,3.0,...,,,,6.0,7.0,6.0,8.0,,,


**Observations:** Data in the first 4 rows looks like it should be dropped, as those subjects didn't provide any survey data regarding their own attributes. 

Row 5, however, [index 7639] is interesting b/c person rated themselves higher on attraction with a higher prob partner would select them; while partner rated them lower for attr and a lower like score, thus resulting in no match. 

In [24]:
# Dropping the above data
subset_df_clean =dropData(subset_df_clean, df_nan_atr_likeV)

In [25]:
##Look at same values but for partner (i.e. coded with _o at end)
#Partner didnt rate any attributes of subject but said 'yes'

df_nan_OTHER_atr_like_prob_dec1 = subset_df_clean[(subset_df_clean.attr_o.isnull()) 
                            & (subset_df_clean.sinc_o.isnull()) 
                            & (subset_df_clean.fun_o.isnull()) & (subset_df_clean.intel_o.isnull()) 
                             &  (subset_df_clean.amb_o.isnull()) & (subset_df_clean.shar_o.isnull()) &
                            (subset_df_clean.like_o.isnull()) & (subset_df_clean.prob_o.isnull()) 
                            & (subset_df_clean.dec_o == 1)]
len(df_nan_OTHER_atr_like_prob_dec1) 

1

In [26]:
# View Data
df_nan_OTHER_atr_like_prob_dec1

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
7033,476,488.0,0,25.0,15,0,0,1,5.0,,...,6.0,5.0,4.0,3.0,,,,,,


In [27]:
subset_df_clean = dropData(subset_df_clean, df_nan_OTHER_atr_like_prob_dec1)

In [28]:
#Grabing subject data with no scores for partner but has a score for prob 
#(determine if data is irrelvant & should be dropped or if attribute scores for partner should be filled in)
#(partner has scored subject)
df_nan_atr_probV = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) &
                            (subset_df_clean.prob.notnull())]
len(df_nan_atr_probV) 

0

In [29]:
#Checking data for feature 5_1 & 5_2
df_nan_5 = subset_df_clean[(subset_df_clean.attr5_1.isnull()) & (subset_df_clean.sinc5_1.isnull()) 
                            & (subset_df_clean.fun5_1.isnull()) & (subset_df_clean.intel5_1.isnull()) 
                             &  (subset_df_clean.amb5_1.isnull()) & (subset_df_clean.attr5_2.isnull())
                           & (subset_df_clean.sinc5_2.isnull())  & (subset_df_clean.fun5_2.isnull()) 
                           & (subset_df_clean.intel5_2.isnull()) 
                             &  (subset_df_clean.amb5_2.isnull())]
len(df_nan_5) ##not removing, too many observations

3364

### Cleaning Data: Part 2 - Adding '0' to NaN Values

In [34]:
# Create a separate, independent DF in which NaN values will be filled in.
# .copy() (rather than the slice subset_df_clean[:]) ensures the fills applied
# below never write through to subset_df_clean, which is still used later on.
subset_df_clean_edit = subset_df_clean.copy()
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7.0,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10.0,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,526.0,1,25.0,22,0,0,1,10.0,,...,5.0,5.0,,,10.0,5.0,3.0,2.0,6.0,5.0
8374,552,527.0,1,25.0,22,0,0,0,10.0,,...,8.0,4.0,4.0,,6.0,3.0,7.0,3.0,7.0,2.0
8375,552,528.0,1,25.0,22,0,0,0,10.0,,...,8.0,8.0,8.0,,2.0,1.0,2.0,2.0,2.0,1.0
8376,552,529.0,1,25.0,22,0,0,1,10.0,,...,5.0,4.0,,5.0,5.0,7.0,5.0,5.0,3.0,6.0


### Look at Feature SETS:

#### Look at Feature Set of attribute rating_o (ratings of subject by partner)

In [35]:
#runing above function to see if values don't line up with attr_o nan and since_o nan

checkMissing('attr_o', 'sinc_o',subset_df_clean_edit)

True


In [36]:
#pulling data for NaN values for partner rating of subject (all attribute_o ratings that are blank)
df_atr_o_null = subset_df_clean_edit[(subset_df_clean_edit.attr_o.isnull()) & (subset_df_clean_edit.sinc_o.isnull()) 
                            & (subset_df_clean_edit.fun_o.isnull()) & (subset_df_clean_edit.intel_o.isnull()) 
                             &  (subset_df_clean_edit.amb_o.isnull()) & (subset_df_clean_edit.shar_o.isnull())]

df_atr_o_null

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
386,32,50.0,0,27.0,16,0,0,1,3.0,0.0,...,8.0,9.0,8.0,4.0,,,,,,
482,38,50.0,0,23.0,16,1,1,1,2.0,12.0,...,8.0,9.0,9.0,7.0,,,,,,
498,39,50.0,0,24.0,16,0,1,0,2.0,5.0,...,8.0,10.0,8.0,,,,,,,
739,52,28.0,1,21.0,19,0,0,0,5.0,1.0,...,7.0,5.0,6.0,5.0,,,,,,
753,53,23.0,1,28.0,19,0,1,0,6.0,9.0,...,8.0,7.0,8.0,6.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8102,540,519.0,1,24.0,22,0,0,0,7.0,,...,6.0,6.0,5.0,2.0,,,,,,
8192,544,521.0,1,23.0,22,0,1,0,5.0,,...,8.0,7.0,7.0,6.0,,,,,,
8298,549,517.0,1,28.0,22,0,0,0,5.0,,...,0.0,0.0,0.0,0.0,,,,,,
8302,549,521.0,1,28.0,22,0,1,0,5.0,,...,8.0,7.0,8.0,7.0,,,,,,


In [37]:
#testing cleanDF function on attr_o (uses df_atr_o_nul)
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'attr_o')
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1926,143,172.0,0,33.0,16,0,1,0,6.0,,...,9.0,8.0,8.0,,0,8.0,9.0,2.0,8.0,1.0
2471,180,187.0,0,24.0,10,0,0,1,6.0,,...,5.0,5.0,5.0,4.0,0,,7.0,7.0,,
4773,316,307.0,1,25.0,14,1,1,1,5.0,,...,7.0,6.0,6.0,4.0,0,7.0,7.0,8.0,7.0,7.0
7246,490,476.0,1,29.0,15,0,1,0,3.0,,...,8.0,8.0,7.0,6.0,0,5.0,8.0,6.0,7.0,3.0


In [38]:
#running function cleanDF on sinc_o 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'sinc_o')

In [39]:
#running function cleanDF on intl_o 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'intel_o')

In [40]:
#running function cleanDF on fun_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'fun_o')

In [41]:
#running function cleanDF on amb_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'amb_o')

In [42]:
#running function cleanDF on shar_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'shar_o')

In [43]:
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6,8,8,8,8,6
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7,8,10,7,7,5
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10,10,10,10,10,10
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7,8,9,8,9,8
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8,7,9,6,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,219,201.0,1,26.0,20,0,0,0,7.0,,...,6.0,5.0,5.0,6.0,7,0,0,0,0,0
3604,253,286.0,0,33.0,21,0,0,0,7.0,,...,6.0,6.0,7.0,5.0,6,0,0,0,0,0
5470,365,347.0,1,30.0,20,0,0,1,8.0,,...,8.0,4.0,3.0,3.0,9,0,0,0,0,0
6306,409,383.0,1,23.0,18,0,0,0,7.0,,...,6.0,5.0,6.0,3.0,1,0,0,0,0,0


In [44]:
# recheck after adding 0 values to NaN
checkMissing('attr_o', 'sinc_o',subset_df_clean_edit)

False


In [55]:
checkMissing('amb_o', 'shar_o',subset_df_clean_edit)

False


#### Look at Feature Set of attribute ratings (ratings of partner by subject)

In [45]:
# check len of missing data in feature set of attributes of partner by subject
checkMissing('attr', 'sinc',subset_df_clean_edit)

True


In [46]:
#Grabing subject data with no scores for partner but partner scored subject
df_atr_null = subset_df_clean_edit[(subset_df_clean_edit.attr.isnull()) & (subset_df_clean_edit.sinc.isnull()) 
                            & (subset_df_clean_edit.fun.isnull()) & (subset_df_clean_edit.intel.isnull()) 
                             &  (subset_df_clean_edit.amb.isnull()) & (subset_df_clean_edit.shar.isnull()) ]
len(df_atr_null) 

52

In [47]:
#running function cleanDF on attr
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'attr')

In [48]:
#running function cleanDF on sinc 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'sinc')

In [49]:
#running function cleanDF on intl
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'intel')

In [50]:
#running function cleanDF on fun
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'fun')

In [51]:
#running function cleanDF on amb
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'amb')

In [56]:
#running function cleanDF on shar
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'shar')

In [57]:
# Recheck
checkMissing('attr', 'sinc',subset_df_clean_edit)

False


In [58]:
checkMissing('intel', 'fun',subset_df_clean_edit)

False


#### Look at 5_1 Feature Set

In [None]:
def checkAllMissing (df, feature_set):
    """Return the features whose non-null count differs from the first feature's.

    The first entry of ``feature_set`` is used as the reference column. Every
    feature whose number of non-null values differs from the reference is
    reported, which reveals columns in a survey feature set that are not
    missing on exactly the same number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``feature_set``.
    feature_set : iterable of str
        Column names to compare.

    Returns
    -------
    list of str
        Features (in input order) whose non-null count differs from the
        reference feature; empty if all counts agree or ``feature_set`` is
        empty.
    """
    features = list(feature_set)
    if not features:
        return []

    # Series.count() is the number of non-null entries, i.e. len(col.dropna()).
    reference_count = df[features[0]].count()

    mismatched = []
    for feature in features:
        if df[feature].count() != reference_count:
            mismatched.append(feature)
    return mismatched
    

In [None]:
feature_set = ['attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1']
checkAllMissing(subset_df_clean_edit, feature_set)

In [70]:
# Check  (went through all pairs of 5_1 attributes; all returned 'false')
checkMissing('fun5_1', 'amb5_1',subset_df_clean)

False


#### Look at 5_2 Feature Set

In [71]:
checkMissing('attr5_2', 'sinc5_2',subset_df_clean)

False


In [72]:
checkMissing('attr5_2', 'intel5_2',subset_df_clean)

False


In [73]:
checkMissing('attr5_2', 'fun5_2',subset_df_clean)

False


In [74]:
checkMissing('attr5_2', 'amb5_2',subset_df_clean)

False


In [71]:
checkMissing('attr5_2', 'sinc5_2',subset_df_clean)

False


In [72]:
checkMissing('attr5_2', 'intel5_2',subset_df_clean)

False


In [73]:
checkMissing('attr5_2', 'fun5_2',subset_df_clean)

False


In [74]:
checkMissing('attr5_2', 'amb5_2',subset_df_clean)

False


In [None]:
#Grabing subject data with no scores for all of 5_1
df_5_1_null = subset_df_clean_edit[(subset_df_clean_edit.attr5_1.isnull()) & (subset_df_clean_edit.sinc5_1.isnull()) 
                            & (subset_df_clean_edit.fun5_1.isnull()) & (subset_df_clean_edit.intel5_1.isnull()) 
                             &  (subset_df_clean_edit.amb5_1.isnull())]
len(df_5_1_null) 

In [None]:
# test: number of rows where sinc5_1 is exactly 0.
# (len() of the boolean mask would just return the total row count.)
(subset_df_clean_edit.sinc5_1 == 0).sum()

In [None]:
#running function cleanDF on sinc5_1
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_5_1_null , 'sinc5_1')

In [None]:
len(subset_df_clean_edit.sinc5_1.dropna())  #test

In [None]:
# Number of rows where sinc5_1 is exactly 0 after cleaning.
# (len() of the boolean mask would just return the total row count.)
(subset_df_clean_edit.sinc5_1 == 0).sum()

In [None]:
subset_df_clean_edit.sinc5_1.value_counts() 

In [30]:
checkMissing('attr5_2', 'sinc5_2',subset_df_clean)

False


In [31]:
checkMissing('attr3_s', 'sinc3_s',subset_df_clean)

False


In [32]:
checkMissing('attr3_1', 'sinc3_1',subset_df_clean)

False


In [33]:
checkMissing('attr3_2', 'sinc3_2',subset_df_clean)

False


In [None]:
subset_df_clean_edit

#### Look at individual features:

In [None]:
# Looking at data where 'like' is NaN and dec=0 
#(here like can be changed to 0)
like_nan = subset_df_clean_edit[subset_df_clean_edit.like.isnull()]
like_nan_dec0 = like_nan[like_nan.dec == 0]

# View Data
like_nan_dec0[['iid', 'pid', 'like', 'like_o', 'dec', 'attr', 'attr5_1', 'attr5_2', 'attr3_s', 'attr3_1', 'attr3_2', 'attr_o', 'dec', 'dec_o']]

In [None]:
subset_df_clean_edit = cleanFeatDF('like', subset_df_clean_edit, like_nan_dec0)

In [None]:
##fillNaN('like', like_nan_dec0)
like_nan_dec0[['like']]

In [None]:
# TEST
iid245 = subset_df_clean_edit.loc[245]
iid245.like

In [None]:
# Looking at data where 'like_o' is NaN and 'dec_o' =0 
#(here like_o can be changed to 0)
like_o_nan = subset_df_clean_edit[subset_df_clean_edit.like_o.isnull()]
like_o_nan_deco0 = like_o_nan[like_o_nan.dec_o == 0]

# View data
like_o_nan_deco0[['iid', 'pid', 'like', 'like_o','dec', 'attr', 'attr5_1', 'attr5_2', 'attr3_s', 'attr3_1', 'attr3_2', 'attr_o', 'dec', 'dec_o']]

In [None]:
# Use the frame built in the previous cell (like_o_nan_deco0); the original
# spelling like_o_nan_dec0 was never defined and raised a NameError.
subset_df_clean_edit = cleanFeatDF('like_o', subset_df_clean_edit, like_o_nan_deco0)

#### Looking at null values for match_es; exphappy; expnum

In [None]:
# Look at match_es: rows where the subject's estimated number of matches is NaN.
match_es_nan = subset_df_clean_edit[subset_df_clean_edit.match_es.isnull()]
# NOTE(review): the variable name says "dec1" but the filter keeps dec == 0 --
# confirm which of the two is intended.
match_es_nan_dec1 = match_es_nan[match_es_nan.dec== 0]

# View data
match_es_nan_dec1[['iid', 'pid', 'match_es', 'like', 'like_o','dec', 'attr', 'attr5_1', 'attr5_2', 'attr3_s', 'attr3_1', 'attr3_2', 'attr_o', 'dec', 'dec_o']]

In [None]:
# Look at data where exphappy and expnum are NaN

exphappy_expnum_nan = subset_df_clean_edit[(subset_df_clean_edit.exphappy.isnull()) & (subset_df_clean_edit.expnum.isnull())]
exphappy_expnum_nan[['iid', 'attr5_1', 'intel5_1', 'attr5_2', 'intel5_2', 'attr3_1', 'intel3_1', 'attr3_2', 'intel3_2', 'attr3_s', 'intel3_s']]

### Recounting 'MET' aka 'Round'


In [None]:
recountMET(subset_df_clean_edit)

In [None]:
subset_df_clean_edit

*******

## Q: Does one’s perception of themselves predict their dating outcomes? 

- Does this differ by gender? 
- Does this differ by age?

**Hypothesis**: people who have lower self-esteem (i.e. who evaluate themselves negatively by giving themselves lower scores on the attribute scale) will get fewer dates/matches, while those who give themselves higher ratings will get more. Women are more likely than men to give themselves critical ratings, which negatively affects their outcome. 

Look at how people view/score themselves alongside how others score them. 



##### General Variable KEY:

| Variable | Description |
| ---| ---|
|attr | Attractive|
|sinc |Sincere  |
|intel | Intelligent|
| fun | Fun|
| amb | Ambitious|
| shar |Shared Interests/Hobbies|

***Each feature has a code at the end of the variable which references the survey question and when in the experiment the question was being asked*** (signup, during dating experiment, after dating experiment)

| Feature CODE | Scale | When during Experiment? |Question| 
| :------:| :------:| :------: |:------|
|**oPercveMe_1**| 1-10| Signup|How do you think others perceive you? |
|**oPercveMe_2**| 1-10| After event|How do you think others perceive you? |
|**iRateMe_exp**| 1-10| During event|Rate your opinion of your own attributes  |
|  **iMeasUp_1**| 1-10 | Signup| Based on what you think the opposite sex looks for in a date, how do you think you measure up?
|**iMeasUp_2**| 1-10| After event| Based on what you think the opposite sex looks for in a date, how do you think you measure up?
|  **attr; shar**| 1-10 | During event (after each date)| Subject's rating of partner |
|**attr_o; shar_o**| 1-10 | During event (after each date)| Partner's rating of subject|


| Feature | Scale | When during Experiment? |Question/Description| 
| :------:| :------:| :------: |:------|
|**exphappy** | 1-10 | Signup survey| Overall, on a scale of 1-10, how happy do you expect to be with the people you meet during the speed-dating event? |
|**expnum** | 0-20ppl |Signup survey | Out of the 20 people you will meet, how many do you expect will be interested in dating you?|
|**match_es** | *changes based on met_count* | End of experiment|  How many matches do you estimate you will get (a match occurs when you and your partner both check “Yes” next to decision)?|
| **dec** | 1=yes, 0=no | After each date round | Decision|
|**dec_o**|  1=yes, 0=no| After each date round | Decision of partner| 


**met_count**: number of people that the subject met with during the experiment

**match**:	1=yes | 0=no *determined after dating event if both subject and partner selected 'yes' under 'dec' on their scorecard*



**iid**: unique number for each subject

**gender**: 1=M | 0=F


****

Renaming the following features:

- round | met_count
- *for variables attr, sinc, intel, fun, amb*:
    - 3_1 | iMeasUp_1
    - 3_2 | iMeasUp_2
    - 5_1 | oPercveMe_1
    - 5_2 | oPercveMe_2
    - 3_s | iRateMe_exp


In [None]:
renameFeature('round', 'met_count', subset_df_clean_edit)

renameFeatures("3_1", "_iMeasUp_1", subset_df_clean_edit)
renameFeatures("3_2", "_iMeasUp_2", subset_df_clean_edit)
renameFeatures("5_1", "_oPercveMe_1", subset_df_clean_edit)
renameFeatures("5_2", "_oPercveMe_2", subset_df_clean_edit)
renameFeatures("3_s", "_iRateMe_exp", subset_df_clean_edit)


subset_df_clean_edit.columns  #to validate the output

******

# Compress Features Within Dataset to Get Averages & Sums

### New Feature Info:

> ####  new feature info here


>'Yes' Sum (#of decision = yes):

> dec_o sum = sum of 'yes' per men for women

****

### Separating Dataset into Two Datasets: 1 Female; 1 Male

In [None]:
df_female_raw = subset_df_clean_edit[subset_df_clean_edit.gender == 0]
df_female_raw

In [None]:
df_male_raw = subset_df_clean_edit[subset_df_clean_edit.gender == 1]
df_male_raw

### Drop any of the below NaN values?

In [None]:
len(df_female_raw.expnum.dropna())

In [None]:
df_female_raw.expnum.isnull()

In [None]:
test = pd.DataFrame()
test['iid'] = df_female_raw.iid.unique()
test

In [None]:
test['expnum'] = getValueSet('expnum', df_female_raw)
test

In [None]:
len(test.expnum.dropna())

In [None]:
test['gender'] = getValueSet('gender', df_female_raw)
test

### *Female Dataset*

In [None]:
#create empty DF
df_female_condensed = pd.DataFrame()

In [None]:
#Load condensed data into new df
ConvertDF(df_female_raw,df_female_condensed)

### *Male Dataset*

Create empty DF:

In [None]:
#create empty DF
df_male_condensed = pd.DataFrame()

In [None]:
#Load condensed data into new df
ConvertDF(df_male_raw,df_male_condensed)

## Set index to iid

In [None]:
#df = df.set_index('iid')