# SPEED DATING EXPERIMENT

### DS-GA-23 Final Project
#### Miranda Remmer


****

In [1]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, feature_selection, neighbors, metrics, grid_search, cross_validation


%matplotlib inline
plt.style.use('ggplot')

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 20)

In [2]:
df_raw = pd.read_csv(os.path.join('..', 'CODE', 'speed-dating-experiment', 'Speed Dating Data.csv'))

df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


Look at the data to check the column names:

In [3]:
df_raw.columns

Index([u'iid', u'id', u'gender', u'idg', u'condtn', u'wave', u'round',
       u'position', u'positin1', u'order',
       ...
       u'attr3_3', u'sinc3_3', u'intel3_3', u'fun3_3', u'amb3_3', u'attr5_3',
       u'sinc5_3', u'intel5_3', u'fun5_3', u'amb5_3'],
      dtype='object', length=195)

In [4]:
df_raw

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,10.0,5,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,21,22,13,10.0,4,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,21,22,19,10.0,10,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,21,22,3,10.0,16,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0


### Creating New Dataset with Desired Columns


In [5]:
subset_df = df_raw[['iid', 'pid', 'gender', 'age', 'round',
                    'match', 'dec', 'dec_o', 
                    'exphappy', 'expnum', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 
                    'attr3_2', 'sinc3_2', 'fun3_2', 'intel3_2', 'amb3_2', 
                    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1', 
                    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2', 
                    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s',
                    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o', 'fun_o','amb_o','shar_o']]
subset_df

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7.0,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10.0,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,526.0,1,25.0,22,0,0,1,10.0,,...,5.0,5.0,,,10.0,5.0,3.0,2.0,6.0,5.0
8374,552,527.0,1,25.0,22,0,0,0,10.0,,...,8.0,4.0,4.0,,6.0,3.0,7.0,3.0,7.0,2.0
8375,552,528.0,1,25.0,22,0,0,0,10.0,,...,8.0,8.0,8.0,,2.0,1.0,2.0,2.0,2.0,1.0
8376,552,529.0,1,25.0,22,0,0,1,10.0,,...,5.0,4.0,,5.0,5.0,7.0,5.0,5.0,3.0,6.0


In [6]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 52 columns):
iid         8378 non-null int64
pid         8368 non-null float64
gender      8378 non-null int64
age         8283 non-null float64
round       8378 non-null int64
match       8378 non-null int64
dec         8378 non-null int64
dec_o       8378 non-null int64
exphappy    8277 non-null float64
expnum      1800 non-null float64
match_es    7205 non-null float64
like        8138 non-null float64
prob        8069 non-null float64
like_o      8128 non-null float64
prob_o      8060 non-null float64
attr3_1     8273 non-null float64
sinc3_1     8273 non-null float64
fun3_1      8273 non-null float64
intel3_1    8273 non-null float64
amb3_1      8273 non-null float64
attr3_2     7463 non-null float64
sinc3_2     7463 non-null float64
fun3_2      7463 non-null float64
intel3_2    7463 non-null float64
amb3_2      7463 non-null float64
attr5_1     4906 non-null float64
sinc5_1     4906 non-

## Clean Up Data

- Drop rows with irrelevant data
- Insert missing data where relevant 


#### Function to insert missing data to new df (fill in NaN values with 0)

In [7]:
#function to replace any NaN values with 0 where desired (takes in series; returns series)
def fillNaN(feature, df):
    df[feature] = df[feature].replace([np.nan], '0')

#### Remove any data where subject or their partner didn't particpate during experiment.  

I.e. any values where subject didn't rate partner & partner didn't rate subject. 
In the instances where subject rated partner but partner didn't rate subject, or vice versa, possible conclusion that person in question didn't want to rate on a low scale.  With such cases, fill NaN values with 0.  

In [8]:
df_nan_exp_ratings_all = subset_df[(subset_df.attr_o.isnull()) & (subset_df.sinc_o.isnull()) & (subset_df.fun_o.isnull()) &
                   (subset_df.intel_o.isnull()) &  (subset_df.amb_o.isnull()) & (subset_df.shar_o.isnull()) & (subset_df.attr.isnull()) & (subset_df.sinc.isnull()) & (subset_df.fun.isnull()) &
                   (subset_df.intel.isnull()) &  (subset_df.amb.isnull()) & (subset_df.shar.isnull())]

len(df_nan_exp_ratings_all)

132

In [305]:
df_nan_exp_ratings_all  #USE LATER WHEN CHANGING MATCH_COUNT

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
839,59,67.0,0,,10,0,0,0,,,...,,,,,,,,,,
842,59,70.0,0,,10,0,0,0,,,...,,,,,,,,,,
843,59,71.0,0,,10,0,0,0,,,...,,,,,,,,,,
845,59,73.0,0,,10,0,0,0,,,...,,,,,,,,,,
847,59,75.0,0,,10,0,0,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8220,545,527.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8235,546,520.0,1,30.0,22,0,0,0,8.0,,...,,,,,,,,,,
8271,548,512.0,1,30.0,22,0,0,0,7.0,,...,,,,,,,,,,
8344,551,519.0,1,27.0,22,0,0,0,3.0,,...,,,,,,,,,,


*Drop the above data*

In [9]:
len(subset_df) #test

8378

In [10]:
subset_df_clean = subset_df.drop(df_nan_exp_ratings_all.index)  #create new df: subset_df_clean
len(subset_df_clean) #test

8246

#### Remove any data where participant didn't fill out survey questions pertaining to test question:

In [11]:
#create sub-df to pull any data with missing values for features 3_1; 3_1; 5_1; 5_2
df_viewself_nan = subset_df_clean[(subset_df_clean.attr3_1.isnull()) & (subset_df_clean.sinc3_1.isnull()) & 
                                  (subset_df_clean.intel3_1.isnull()) & (subset_df_clean.attr3_2.isnull()) &
                                  (subset_df_clean.amb5_1.isnull()) & (subset_df_clean.attr5_2.isnull()) & 
                                  (subset_df_clean.attr5_1.isnull())] 
                                  

len(df_viewself_nan)

44

In [12]:
#view data
df_viewself_nan[['iid', 'attr', 'attr_o', 'attr5_1', 'fun3_2', 'match_es', 'attr3_s']]

Unnamed: 0,iid,attr,attr_o,attr5_1,fun3_2,match_es,attr3_s
312,28,3.0,8.0,,,3.0,
313,28,2.0,8.0,,,3.0,
314,28,3.0,7.0,,,3.0,
315,28,4.0,5.0,,,3.0,
316,28,4.0,5.0,,,3.0,
...,...,...,...,...,...,...,...
6405,414,10.0,6.0,,,2.0,8.0
6406,414,10.0,7.0,,,2.0,8.0
6407,414,7.0,4.0,,,2.0,8.0
6408,414,7.0,7.0,,,2.0,8.0


In [13]:
subset_df_clean = subset_df_clean.drop(df_viewself_nan.index)  #create new df: subset_df_clean
len(subset_df_clean) #test

8202

In [14]:
#create sub-df to pull any data with missing values for features 3_s
df_viewself_nan2 = subset_df_clean[(subset_df_clean.attr3_s.isnull()) & (subset_df_clean.sinc3_s.isnull()) & 
                                  (subset_df_clean.intel3_s.isnull()) & (subset_df_clean.fun3_s.isnull()) &
                                  (subset_df_clean.amb3_s.isnull())] 
                                  

len(df_viewself_nan2)  #choosing not to drop b/c too many observations

4239

In [15]:
#test for NaN values with 'match'
df_match_nan = subset_df_clean[(subset_df_clean.match.isnull())]
len(df_match_nan)

0

In [16]:
#Grabing subject data with no scores for partner (including  like & prob) but subject chose 'yes' 
#(partner has scored subject)
df_nan_atr_like_prob_dec1 = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) &
                            (subset_df_clean.like.isnull()) & (subset_df_clean.prob.isnull()) 
                            & (subset_df_clean.dec == 1)]
len(df_nan_atr_like_prob_dec1) 

1

In [17]:
len(subset_df_clean) #test

8202

In [18]:
#dropping rows of data from df
subset_df_clean = subset_df_clean.drop(df_nan_atr_like_prob_dec1.index)  #create new df: subset_df_clean
len(subset_df_clean) #test

8201

In [19]:
#Grabing subject data with no scores for partner but has a score for like 
#(determine if data is irrelvant & should be dropped or if attribute scores for partner should be filled in)
#(partner has scored subject)
df_nan_atr_likeV = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) &
                            (subset_df_clean.like.notnull())]
len(df_nan_atr_likeV) 

5

In [20]:
df_nan_atr_likeV[['iid', 'pid', 'match', 'dec', 'dec_o', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                 'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o','attr5_1', 'attr5_2', 'intel5_2', 'attr3_s']]

Unnamed: 0,iid,pid,match,dec,dec_o,match_es,like,prob,like_o,prob_o,...,fun,amb,shar,attr_o,sinc_o,intel_o,attr5_1,attr5_2,intel5_2,attr3_s
705,50,32.0,0,1,0,5.0,7.0,7.0,6.0,3.0,...,,,,7.0,8.0,8.0,,,,
711,50,38.0,1,1,1,5.0,7.0,8.0,7.0,6.0,...,,,,8.0,8.0,8.0,,,,
712,50,39.0,0,0,1,5.0,8.0,,7.0,5.0,...,,,,5.0,6.0,8.0,,,,
2546,187,182.0,0,0,0,,5.0,6.0,9.0,8.0,...,,,,6.0,8.0,9.0,,,,9.0
7639,519,540.0,0,0,0,0.5,6.0,8.0,5.0,3.0,...,,,,6.0,7.0,6.0,8.0,,,


**Observations:** Data in first 4 rows looks like it should get dropped as those subject's didn't include any data for survey data re. their own attributes. 

Row 5, however, [index 7639] is interesting b/c person rated themselves higher on attraction with a higher prob partner would select them; while partner rated them lower for attr and a lower like score, thus resulting in no match. 

In [21]:
#dropping above 4 rows
subset_df_clean = subset_df_clean.drop([705, 711, 712, 2546])
len(subset_df_clean)

8197

In [22]:
##Look at same values but for partner (i.e. coded with _o at end)
#Partner didnt rate any attributes of subject but said 'yes'

df_nan_OTHER_atr_like_prob_dec1 = subset_df_clean[(subset_df_clean.attr_o.isnull()) 
                            & (subset_df_clean.sinc_o.isnull()) 
                            & (subset_df_clean.fun_o.isnull()) & (subset_df_clean.intel_o.isnull()) 
                             &  (subset_df_clean.amb_o.isnull()) & (subset_df_clean.shar_o.isnull()) &
                            (subset_df_clean.like_o.isnull()) & (subset_df_clean.prob_o.isnull()) 
                            & (subset_df_clean.dec_o == 1)]
len(df_nan_OTHER_atr_like_prob_dec1) 

1

In [23]:
df_nan_OTHER_atr_like_prob_dec1

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
7033,476,488.0,0,25.0,15,0,0,1,5.0,,...,6.0,5.0,4.0,3.0,,,,,,


In [24]:
##drop above data
subset_df_clean = subset_df_clean.drop([7033])
len(subset_df_clean)

8196

### NOT SURE IF SHOULD REMOVE BELOW COLUMNS?

In [25]:
#Grabing subject data with no scores for partner but partner scored subject
df_nan_atr = subset_df_clean[(subset_df_clean.attr.isnull()) & (subset_df_clean.sinc.isnull()) 
                            & (subset_df_clean.fun.isnull()) & (subset_df_clean.intel.isnull()) 
                             &  (subset_df_clean.amb.isnull()) & (subset_df_clean.shar.isnull()) ]
len(df_nan_atr) 

53

In [26]:
df_nan_atr[['iid', 'pid', 'attr', 'sinc', 'intel', 'fun', 'amb', 'shar','fun_o', 'match', 'dec', 'dec_o', 'like', 'like_o', 'attr3_1', 'attr3_2', 'attr3_s', 'match_es']]

Unnamed: 0,iid,pid,attr,sinc,intel,fun,amb,shar,fun_o,match,dec,dec_o,like,like_o,attr3_1,attr3_2,attr3_s,match_es
245,23,53.0,,,,,,,7.0,0,0,1,,7.0,5.0,5.0,,3.0
920,67,58.0,,,,,,,5.0,0,0,1,,6.0,5.0,7.0,,3.0
2346,170,144.0,,,,,,,6.0,0,0,0,,5.0,7.0,6.0,6.0,
2540,187,176.0,,,,,,,5.0,0,0,0,,2.0,8.0,8.0,9.0,
2542,187,178.0,,,,,,,5.0,0,0,0,,4.0,8.0,8.0,9.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8002,535,529.0,,,,,,,5.0,0,0,1,,5.0,6.0,7.0,,8.0
8003,535,530.0,,,,,,,9.0,0,0,1,,8.0,6.0,7.0,,8.0
8045,537,528.0,,,,,,,4.0,0,0,0,,1.0,7.0,8.0,,2.0
8067,538,528.0,,,,,,,6.0,0,0,0,,2.0,7.0,7.0,,4.0


*Holding off droping above data*

In [27]:
#Checking data for feature 5_1 & 5_2
df_nan_5 = subset_df_clean[(subset_df_clean.attr5_1.isnull()) & (subset_df_clean.sinc5_1.isnull()) 
                            & (subset_df_clean.fun5_1.isnull()) & (subset_df_clean.intel5_1.isnull()) 
                             &  (subset_df_clean.amb5_1.isnull()) & (subset_df_clean.attr5_2.isnull())
                           & (subset_df_clean.sinc5_2.isnull())  & (subset_df_clean.fun5_2.isnull()) 
                           & (subset_df_clean.intel5_2.isnull()) 
                             &  (subset_df_clean.amb5_2.isnull())]
len(df_nan_5) 

3364

** OBSERVATIONS: above returns alot of observations; too many to remove?**

**Function to check if lengths of dropna() values are the same**

If return 'FALSE':  all values are present or all are missing (i.e. subject didn't answer that question).
If return 'TRUE':  there are rows of data that are missng a value for the feature; e.g. person didn't want to rate low for other

In [28]:
def checkMissing (feature1, feature2, df):
    if (len(df[feature1].dropna())) != (len(df[feature2].dropna())):
        print True
    else:
        print False

        
        
        #UNCESSISARY LIKE THIS 

### ISSUE W/ FUNCTION - ITERATE THROUGH ?

In [None]:
def checkMissing (feature1, [feature_list], df):
    list = []
    for feature in df.iteritems():
        if (len(df[feature1].dropna())) != (len(df[feature2].dropna())):
        print True
    else:
        print False
        
        
  list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False: 
            sum = iidsubset[feature].sum()
            list.append(sum)
        else: 
            list.append(np.NaN)
    return pd.Series(list)



        def checkMissing (feature1, feature2, df):
    list = []
    for feature1 in df:
        if (len(df[feature1].dropna())) != (len(df[feature2].dropna())):
            i = df.index[feature1]
            list.append(i)
        else:
            print False

In [29]:
checkMissing('attr3_s', 'sinc3_s',subset_df_clean)

False


### Function to clean df: 

***removes old values and replaced with cleaned values***

In [30]:
def cleanDF(main_df, df_all_feature_null, feature):
    atr_nan = main_df[main_df[feature].isnull()] #look where partner did not rate subject on feature
    atr_cleaned = atr_nan.drop(df_all_feature_null.index) #returning just rows with NaN data in feature but have other columns with data
    #amt = len(atr_cleaned) #test
    fillNaN([feature], atr_cleaned)  ##calling function fillNaN to replace NaN values with 0
    main_df = main_df.drop(atr_cleaned.index)  #removing old values
    main_df = pd.concat([main_df, atr_cleaned]) #adding cleaned data back into df
    return main_df

    
#main_df
#df_all_feature_null = all atr_o_NaN or all atr_NaN
#feature = looking for NaN values of this attribute
##atr_nan = df that holds 1 attribute with NaN values
##atr_cleaned = df that holds values where feature in question has NaN value but other attributes have ratings 
    #e.g. looking at attr_o; atr_fillNaN = rows that have NaN values for attr_o but have values for intel_o, since_o...etc



In [31]:
#creating seperate DF where NaN values will be re-added
subset_df_clean_edit = subset_df_clean[:]
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7.0,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10.0,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,526.0,1,25.0,22,0,0,1,10.0,,...,5.0,5.0,,,10.0,5.0,3.0,2.0,6.0,5.0
8374,552,527.0,1,25.0,22,0,0,0,10.0,,...,8.0,4.0,4.0,,6.0,3.0,7.0,3.0,7.0,2.0
8375,552,528.0,1,25.0,22,0,0,0,10.0,,...,8.0,8.0,8.0,,2.0,1.0,2.0,2.0,2.0,1.0
8376,552,529.0,1,25.0,22,0,0,1,10.0,,...,5.0,4.0,,5.0,5.0,7.0,5.0,5.0,3.0,6.0


In [32]:
#runing above function to see if values don't line up with attr_o nan and since_o nan

checkMissing('attr_o', 'sinc_o',subset_df_clean_edit)

True


In [33]:
#pulling data for NaN values for partner rating of subject (all attribute_o ratings that are blank)
df_atr_o_null = subset_df_clean_edit[(subset_df_clean_edit.attr_o.isnull()) & (subset_df_clean_edit.sinc_o.isnull()) 
                            & (subset_df_clean_edit.fun_o.isnull()) & (subset_df_clean_edit.intel_o.isnull()) 
                             &  (subset_df_clean_edit.amb_o.isnull()) & (subset_df_clean_edit.shar_o.isnull())]

df_atr_o_null

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
386,32,50.0,0,27.0,16,0,0,1,3.0,0.0,...,8.0,9.0,8.0,4.0,,,,,,
482,38,50.0,0,23.0,16,1,1,1,2.0,12.0,...,8.0,9.0,9.0,7.0,,,,,,
498,39,50.0,0,24.0,16,0,1,0,2.0,5.0,...,8.0,10.0,8.0,,,,,,,
739,52,28.0,1,21.0,19,0,0,0,5.0,1.0,...,7.0,5.0,6.0,5.0,,,,,,
753,53,23.0,1,28.0,19,0,1,0,6.0,9.0,...,8.0,7.0,8.0,6.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8102,540,519.0,1,24.0,22,0,0,0,7.0,,...,6.0,6.0,5.0,2.0,,,,,,
8192,544,521.0,1,23.0,22,0,1,0,5.0,,...,8.0,7.0,7.0,6.0,,,,,,
8298,549,517.0,1,28.0,22,0,0,0,5.0,,...,0.0,0.0,0.0,0.0,,,,,,
8302,549,521.0,1,28.0,22,0,1,0,5.0,,...,8.0,7.0,8.0,7.0,,,,,,


In [34]:
#testing cleanDF function on attr_o (uses df_atr_o_nul)
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'attr_o')
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6,8.0,8.0,8.0,8.0,6.0
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7,8.0,10.0,7.0,7.0,5.0
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10,10.0,10.0,10.0,10.0,10.0
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7,8.0,9.0,8.0,9.0,8.0
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1926,143,172.0,0,33.0,16,0,1,0,6.0,,...,9.0,8.0,8.0,,0,8.0,9.0,2.0,8.0,1.0
2471,180,187.0,0,24.0,10,0,0,1,6.0,,...,5.0,5.0,5.0,4.0,0,,7.0,7.0,,
4773,316,307.0,1,25.0,14,1,1,1,5.0,,...,7.0,6.0,6.0,4.0,0,7.0,7.0,8.0,7.0,7.0
7246,490,476.0,1,29.0,15,0,1,0,3.0,,...,8.0,8.0,7.0,6.0,0,5.0,8.0,6.0,7.0,3.0


In [35]:
#running function cleanDF on sinc_o 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'sinc_o')

In [36]:
#running function cleanDF on intl_o 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'intel_o')

In [37]:
#running function cleanDF on fun_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'fun_o')

In [38]:
#running function cleanDF on amb_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'amb_o')

In [39]:
#running function cleanDF on shar_o
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_o_null , 'shar_o')

In [40]:
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,7.0,6.0,5.0,6,8,8,8,8,6
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7.0,8.0,5.0,6.0,7,8,10,7,7,5
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9.0,8.0,5.0,7.0,10,10,10,10,10,10
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8.0,7.0,6.0,8.0,7,8,9,8,9,8
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7.0,7.0,6.0,6.0,8,7,9,6,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,219,201.0,1,26.0,20,0,0,0,7.0,,...,6.0,5.0,5.0,6.0,7,0,0,0,0,0
3604,253,286.0,0,33.0,21,0,0,0,7.0,,...,6.0,6.0,7.0,5.0,6,0,0,0,0,0
5470,365,347.0,1,30.0,20,0,0,1,8.0,,...,8.0,4.0,3.0,3.0,9,0,0,0,0,0
6306,409,383.0,1,23.0,18,0,0,0,7.0,,...,6.0,5.0,6.0,3.0,1,0,0,0,0,0


In [41]:
#Grabing subject data with no scores for partner but partner scored subject
df_atr_null = subset_df_clean_edit[(subset_df_clean_edit.attr.isnull()) & (subset_df_clean_edit.sinc.isnull()) 
                            & (subset_df_clean_edit.fun.isnull()) & (subset_df_clean_edit.intel.isnull()) 
                             &  (subset_df_clean_edit.amb.isnull()) & (subset_df_clean_edit.shar.isnull()) ]
len(df_atr_null) 

53

In [42]:
#running function cleanDF on attr
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'attr')

In [43]:
#running function cleanDF on sinc 
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'sinc')

In [44]:
#running function cleanDF on intl
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'intel')

In [45]:
#running function cleanDF on fun
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'fun')

In [46]:
#running function cleanDF on amb
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'amb')

In [47]:
#running function cleanDF on shar
subset_df_clean_edit = cleanDF(subset_df_clean_edit, df_atr_null , 'shar')

In [48]:
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7,7,6,5,6,8,8,8,8,6
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7,8,5,6,7,8,10,7,7,5
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9,8,5,7,10,10,10,10,10,10
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8,7,6,8,7,8,9,8,9,8
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7,7,6,6,8,7,9,6,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5136,347,365.0,0,26.0,18,0,1,0,5.0,,...,0,0,0,0,4,8,8,4,3,3
5832,383,409.0,0,27.0,19,0,0,0,4.0,,...,0,0,0,0,5,6,6,5,6,3
6607,435,453.0,0,22.0,14,0,0,0,5.0,,...,0,0,0,0,4,4,4,4,4,4
2541,187,177.0,1,26.0,10,0,0,0,6.0,,...,0,0,0,0,4,6,7,5,0,4


In [49]:
#Checking data for like
subset_df_clean_edit['like'].unique()

array([  7. ,   6. ,   8. ,   5. ,   9. ,   4. ,  10. ,   2. ,   3. ,
         6.5,   nan,   1. ,   8.5,   9.5,   0. ,   7.5,   5.5,   4.5,   9.7])

In [50]:
like_nan = subset_df_clean_edit[subset_df_clean_edit.like.isnull()]
like_nan[['iid', 'pid', 'like', 'like_o', 'dec', 'attr', 'attr5_1', 'sinc5_1', 'fun5_1', 'intel5_1', 'amb5_1', 'dec', 'dec_o']]

Unnamed: 0,iid,pid,like,like_o,dec,attr,attr5_1,sinc5_1,fun5_1,intel5_1,amb5_1,dec.1,dec_o
245,23,53.0,,7.0,0,,,,,,,0,1
1289,91,105.0,,8.0,1,7,,,,,,1,1
2061,152,163.0,,5.0,0,7,,,,,,0,0
2065,152,167.0,,7.0,1,7,,,,,,1,1
2346,170,144.0,,5.0,0,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2923,209,229.0,,3.0,0,8,,,,,,0,0
2927,209,233.0,,7.0,0,5,,,,,,0,0
2913,209,219.0,,5.0,0,6,,,,,,0,0
703,50,30.0,,8.0,0,6,,,,,,0,1


In [52]:
#Creating new DF to reflect any changes made
df_atr_edit = df_nan_atr.drop(like_nan.index) #Removing the above columns from new edited df
len(df_exp_ratings_edit) #test

ValueError: labels [1289 2061 2065 2908 2919 3864 4800 6453 6888 7006 7063 7087 7088 7090 7097
 7099 7100 7178 7180 7725 3314  361 6827 2920 2921 6870  537  538  539  540
 2015 2915 2917 3412 7096  541 2911 2918 2909 2916 2926 2289 2910 2914 7098
 7505 2925 8037 2912 2923 2927 2913  703  911] not contained in axis

In [None]:
### NOT SURE IF I SHOULD REMOVE ANY OF THE DATA OR ADD 0 TO CERTAIN LIKE NAN VALUES 
#(SIMILAR TO THE ABOVE CLEANDF())

## NEED TO CREATE FUNCTION TO RETURN NEW MET_COUNT SCORES IF ITEM WAS DELETED

In [85]:
def recountMET(df):
    #list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        val = len(iidsubset)
        df.loc[df.iid == iid, 'round'] = [val]*val  #add it to a column in the original dataframe, not the iidsubset one -- for where the iid is equal to the one being calculated in your loop (hence the loc[] business)
        #iidsubset['round'] = pd.Series([val]*val)  #takes a list containing a number like [5] and replicates it 5 times, resulting in [5,5,5,5,5]
            #replicate that value for all rows containing that iid
        #list.append(val)
    #return list
    

In [None]:
def recountMET(df):
    df['round'] = (len(df))
    return df['round']

In [89]:
df_nan_exp_ratings_all[['iid', 'round']]  #Pullin from DF above that rows were dropped from

Unnamed: 0,iid,round
839,59,10
842,59,10
843,59,10
845,59,10
847,59,10
...,...,...
8220,545,22
8235,546,22
8271,548,22
8344,551,22


In [90]:
iid59 = df_nan_exp_ratings_all[(df_nan_exp_ratings_all.iid == 59)]
iid59

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
839,59,67.0,0,,10,0,0,0,,,...,,,,,,,,,,
842,59,70.0,0,,10,0,0,0,,,...,,,,,,,,,,
843,59,71.0,0,,10,0,0,0,,,...,,,,,,,,,,
845,59,73.0,0,,10,0,0,0,,,...,,,,,,,,,,
847,59,75.0,0,,10,0,0,0,,,...,,,,,,,,,,


In [91]:
recountMET(iid59)

In [92]:
iid59['round'] == recountMET(iid59)
iid59

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
839,59,67.0,0,,5,0,0,0,,,...,,,,,,,,,,
842,59,70.0,0,,5,0,0,0,,,...,,,,,,,,,,
843,59,71.0,0,,5,0,0,0,,,...,,,,,,,,,,
845,59,73.0,0,,5,0,0,0,,,...,,,,,,,,,,
847,59,75.0,0,,5,0,0,0,,,...,,,,,,,,,,


In [93]:
test_df = subset_df_clean_edit[subset_df_clean_edit['round'] == 5]
test_df

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
1846,132,137.0,0,27.0,5,0,0,1,5.0,,...,7,9,6,9,8,8,9,8,9,7
1847,132,138.0,0,27.0,5,0,0,1,5.0,,...,8,8,8,8,8,7,4,6,6,4
1849,132,140.0,0,27.0,5,1,1,1,5.0,,...,9,9,9,8,7,7,7,7,7,7
1850,132,141.0,0,27.0,5,0,0,1,5.0,,...,8,8,7,7,8,7,7,7,7,8
1851,133,137.0,0,24.0,5,0,0,0,5.0,,...,7,8,6,8,6,10,10,6,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1881,139,132.0,1,37.0,5,0,1,0,8.0,,...,8,8,7,0,5,7,9,5,9,5
1885,139,136.0,1,37.0,5,0,1,0,8.0,,...,0,8,8,0,3,8,8,5,8,1
1865,135,141.0,0,26.0,5,1,1,1,6.0,,...,0,8,8,0,8,9,8,8,6,5
1884,139,135.0,1,37.0,5,0,0,1,8.0,,...,0,6,5,0,7,8,0,8,4,6


In [94]:
test_df['round'] = recountMET(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [95]:
df_nan_exp_ratings_all  #USE LATER WHEN CHANGING MATCH_COUNT

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
839,59,67.0,0,,10,0,0,0,,,...,,,,,,,,,,
842,59,70.0,0,,10,0,0,0,,,...,,,,,,,,,,
843,59,71.0,0,,10,0,0,0,,,...,,,,,,,,,,
845,59,73.0,0,,10,0,0,0,,,...,,,,,,,,,,
847,59,75.0,0,,10,0,0,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8220,545,527.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8235,546,520.0,1,30.0,22,0,0,0,8.0,,...,,,,,,,,,,
8271,548,512.0,1,30.0,22,0,0,0,7.0,,...,,,,,,,,,,
8344,551,519.0,1,27.0,22,0,0,0,3.0,,...,,,,,,,,,,


In [96]:
df545 = df_nan_exp_ratings_all[df_nan_exp_ratings_all.iid == 545]
df545

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
8203,545,510.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8204,545,511.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8207,545,514.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8208,545,515.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8209,545,516.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8217,545,524.0,1,24.0,22,0,0,1,6.0,,...,,,,,,,,,,
8218,545,525.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8219,545,526.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,
8220,545,527.0,1,24.0,22,0,0,0,6.0,,...,,,,,,,,,,


In [97]:
df545['round'] == recountMET(df545)
df545

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
8203,545,510.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8204,545,511.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8207,545,514.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8208,545,515.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8209,545,516.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8217,545,524.0,1,24.0,9,0,0,1,6.0,,...,,,,,,,,,,
8218,545,525.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8219,545,526.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,
8220,545,527.0,1,24.0,9,0,0,0,6.0,,...,,,,,,,,,,


In [100]:
recountMET(subset_df_clean_edit)

In [101]:
subset_df_clean_edit

Unnamed: 0,iid,pid,gender,age,round,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7,7,6,5,6,8,8,8,8,6
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7,8,5,6,7,8,10,7,7,5
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9,8,5,7,10,10,10,10,10,10
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8,7,6,8,7,8,9,8,9,8
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7,7,6,6,8,7,9,6,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5136,347,365.0,0,26.0,16,0,1,0,5.0,,...,0,0,0,0,4,8,8,4,3,3
5832,383,409.0,0,27.0,18,0,0,0,4.0,,...,0,0,0,0,5,6,6,5,6,3
6607,435,453.0,0,22.0,14,0,0,0,5.0,,...,0,0,0,0,4,4,4,4,4,4
2541,187,177.0,1,26.0,9,0,0,0,6.0,,...,0,0,0,0,4,6,7,5,0,4


*******

## Q: Does one’s perception of themselves predict their dating outcomes? 

- Does this differ by gender? 
- Does this differ by age?

**Hypothesis**: people who have lower self esteem (i.e. negatively evaluate themselves by giving themselves lower scores on the attribute scale) will get less dates/matches; while those who give themselves higher ratings will get more.  Women are more likely to give themselves more critical ratings than men, thus negatively affecting their outcome. 

Look at how people view/score themselves alongside how others score them. 



##### General Variable KEY:

| Variable | Description |
| ---| ---|
|attr | Attractive|
|sinc |Sincere  |
|intel | Intelligent|
| fun | Fun|
| amb | Ambitious|
| shar |Shared Interests/Hobbies

***Each feature has a code at the end of the variable which references the survey question and when in the experiment the question was being asked*** (signup, during dating expirement, after dating experiment)

| Feature CODE | Scale | When during Experiment? |Question| 
| :------:| :------:| :------: |:------|
|**oPercveMe_1**| 1-10| Signup|How do you think others perceive you? |
|**oPercveMe_2**| 1-10| After event|How do you think others perceive you? |
|**iRateMe_exp**| 1-10| During event|Rate your opinion of your own attributes  |
|  **iMeasUp_1**| 1-10 | Signup| Based on what you think the opposite sex looks for in a date, how do you think you measure up?
|**iMeasUp_2**| 1-10| After event| Based on what you think the opposite sex looks for in a date, how do you think you measure up?
|  **attr; shar**| 1-10 | During event (after each date)| Subject's rating of parter |
|**attr_o; shar_o**| 1-10 | During event (after each date)| Partner's rating of subject|


| Feature | Scale | When during Experiment? |Question/Description| 
| :------:| :------:| :------: |:------|
|**exphappy** | 1-10 | Signup survey| Overall, on a scale of 1-10, how happy do you expect to be with the people you meet during the speed-dating event? |
|**expnum** | 0-20ppl |Signup survey | Out of the 20 people you will meet, how many do you expect will be interested in dating you?|
|**match_es** | *changes based on met_count* | End of experiment|  How many matches do you estimate you will get (a match occurs when you and your partner both check “Yes” next to decision)?|
| **dec** | 1=yes, 0=no | After each date round | Decision|
|**dec_o**|  1=yes, 0=no| After each date round | Decision of partner| 


**met_count**: number of people that subject met with during experiement

**match**:	1=yes | 0=no *determined after dating event if both subject and parter selected 'yes' under 'dec' on their scorecard*



**iid**: unique number for each subject

**gender**: 1=M | 0=F


****

### Renaming Features:

In [102]:
def renameFeature(feature, new_feature_name, df):
    df.rename(columns={feature: new_feature_name}, inplace = True)

In [103]:
def renameFeatures(feature_key, new_feature_key, df):
    columns = df.columns
    new_columns = [row.replace(feature_key,new_feature_key) for row in columns]
    df.rename(columns=dict(zip(columns, new_columns)), inplace=True)

Renaming the following features:

- round | met_count
- *for variables attr, sinc, intel, fun, amb*:
    - 3_1 | iMeasUp_1
    - 3_2 | iMeasUp_2
    - 5_1 | oPercveMe_1
    - 5_2 | oPercveMe_2
    - 3_s | iRateMe_exp


In [104]:
renameFeature('round', 'met_count', subset_df_clean_edit)

renameFeatures("3_1", "_iMeasUp_1", subset_df_clean_edit)
renameFeatures("3_2", "_iMeasUp_2", subset_df_clean_edit)
renameFeatures("5_1", "_oPercveMe_1", subset_df_clean_edit)
renameFeatures("5_2", "_oPercveMe_2", subset_df_clean_edit)
renameFeatures("3_s", "_iRateMe_exp", subset_df_clean_edit)


subset_df_clean_edit.columns  #to validate the output

Index([u'iid', u'pid', u'gender', u'age', u'met_count', u'match', u'dec',
       u'dec_o', u'exphappy', u'expnum', u'match_es', u'like', u'prob',
       u'like_o', u'prob_o', u'attr_iMeasUp_1', u'sinc_iMeasUp_1',
       u'fun_iMeasUp_1', u'intel_iMeasUp_1', u'amb_iMeasUp_1',
       u'attr_iMeasUp_2', u'sinc_iMeasUp_2', u'fun_iMeasUp_2',
       u'intel_iMeasUp_2', u'amb_iMeasUp_2', u'attr_oPercveMe_1',
       u'sinc_oPercveMe_1', u'intel_oPercveMe_1', u'fun_oPercveMe_1',
       u'amb_oPercveMe_1', u'attr_oPercveMe_2', u'sinc_oPercveMe_2',
       u'intel_oPercveMe_2', u'fun_oPercveMe_2', u'amb_oPercveMe_2',
       u'attr_iRateMe_exp', u'sinc_iRateMe_exp', u'intel_iRateMe_exp',
       u'fun_iRateMe_exp', u'amb_iRateMe_exp', u'attr', u'sinc', u'intel',
       u'fun', u'amb', u'shar', u'attr_o', u'sinc_o', u'intel_o', u'fun_o',
       u'amb_o', u'shar_o'],
      dtype='object')

******

# Compress Features Within Dataset to Get Averages & Sums

### Function for Sums:

Takes in a series (feature) & dataframe; Returns a series

In [105]:
def getSum(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False: 
            sum = iidsubset[feature].sum()
            list.append(sum)
        else: 
            list.append(np.NaN)
    return pd.Series(list)


### Functions to Get Averages of a Feature Set:

Takes in a series (feature) & dataframe; Returns a series

Function to return **average score**:  
(take sum of values and divide by met_count)

*use when each row of data for the feature is different (within a subject's data)* 

In [106]:
def getAveScore(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False:
            denom = iidsubset['met_count'].unique()[0]
            sum = iidsubset[feature].sum()
            divis = sum/denom
            list.append(divis)
        else: 
            list.append(np.NaN)
    return pd.Series(list)

In [107]:
#EXAMPLE

df_female_raw[['iid', 'attr_o', 'met_count']]
#get average scores of 'attr_o' for each iid 

NameError: name 'df_female_raw' is not defined

Function to return **average of 1 rating / met_count** 

*use when each row of data for the feature is the same (within a subject's data)* 


In [108]:
def getAve(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False:
            denom = iidsubset['met_count'].unique()[0] #returning 1 instance met_count
            nom = iidsubset[feature].unique()[0]  #returning 1 instance of the feature
            divis = nom/denom
            list.append(divis)
        else: 
            list.append(np.NaN)
    return pd.Series(list)

In [None]:
#EXAMPLE

df_female_raw[['iid', 'match_es', 'met_count']]
#want to get average of match_es (match_es/met_count)

### Function to Grab 1 Value of Feature Data for Each iid:
Takes in a series (feature) & dataframe; Returns a series

For features that have one consistent number for each person's iid #  (only want to return one instance of that number per iid #)

In [109]:
def getValueSet(feature, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False:
            denom = len(iidsubset)
            sum = iidsubset[feature].sum()
            divis = sum/denom
            list.append(divis)
        else:
            list.append(np.NaN)
    return pd.Series(list)


In [None]:
#test

getValueSet('met_count', df_female_raw)

#### Function to return expnum_ave (out of 20 people) and exphappy_ave (out of 10 pts):

Function takes in a feature, a value, and a dataframe and returns a series

In [110]:
def expAve(feature, denom, df):
    list = []
    for iid in df.iid.unique():
        iidsubset = df[df.iid == iid]
        if (iidsubset[feature].isnull().all()) == False:
            f = iidsubset[feature].unique()[0]
            ave = f / denom
            list.append(ave)
        else:
            list.append(np.NaN)
    return pd.Series(list)

#EXPNUM_AVE (/20ppl)

#EXPHAPPY_AVE  (out of/10PTS)

### New Feature Info:

> ####  new feature info here


>'Yes' Sum (#of decision = yes):

> dec_o sum = sum of 'yes' per men for women

****

### Seperating Dataset into Two datasets: 1 Female; 1 Male

In [111]:
df_female_raw = subset_df_clean_edit[subset_df_clean_edit.gender == 0]
df_female_raw

Unnamed: 0,iid,pid,gender,age,met_count,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
0,1,11.0,0,21.0,10,0,1,0,3.0,2.0,...,7,7,6,5,6,8,8,8,8,6
1,1,12.0,0,21.0,10,0,1,0,3.0,2.0,...,7,8,5,6,7,8,10,7,7,5
2,1,13.0,0,21.0,10,1,1,1,3.0,2.0,...,9,8,5,7,10,10,10,10,10,10
3,1,14.0,0,21.0,10,1,1,1,3.0,2.0,...,8,7,6,8,7,8,9,8,9,8
4,1,15.0,0,21.0,10,1,1,1,3.0,2.0,...,7,7,6,6,8,7,9,6,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5826,383,403.0,0,27.0,18,0,0,1,4.0,,...,0,0,0,0,5,7,6,6,0,0
2753,201,219.0,0,25.0,20,0,0,0,2.0,,...,0,0,0,0,5,8,6,5,5,6
5136,347,365.0,0,26.0,16,0,1,0,5.0,,...,0,0,0,0,4,8,8,4,3,3
5832,383,409.0,0,27.0,18,0,0,0,4.0,,...,0,0,0,0,5,6,6,5,6,3


In [112]:
df_male_raw = subset_df_clean_edit[subset_df_clean_edit.gender == 1]
df_male_raw

Unnamed: 0,iid,pid,gender,age,met_count,match,dec,dec_o,exphappy,expnum,...,intel,fun,amb,shar,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o
100,11,1.0,1,27.0,10,0,0,1,7.0,3.0,...,8,8,8,6,6,9,7,7,6,5
101,11,2.0,1,27.0,10,0,0,0,7.0,3.0,...,6,9,7,4,5,7,8,4,6,3
102,11,3.0,1,27.0,10,0,0,0,7.0,3.0,...,6,5,8,4,7,9,10,7,8,9
103,11,4.0,1,27.0,10,0,0,0,7.0,3.0,...,8,7,7,5,4,10,8,5,8,7
104,11,5.0,1,27.0,10,0,0,0,7.0,3.0,...,8,8,7,6,5,8,8,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710,50,37.0,1,27.0,16,0,0,1,5.0,7.0,...,0,0,0,0,6,3,6,8,4,4
911,66,59.0,1,29.0,10,0,0,0,5.0,3.0,...,0,0,0,0,,,,,,
4285,286,253.0,1,22.0,21,0,0,0,7.0,,...,0,0,0,0,6,7,6,6,7,5
2541,187,177.0,1,26.0,9,0,0,0,6.0,,...,0,0,0,0,4,6,7,5,0,4


### Drop and of the below NaN Values ??

In [None]:
len(df_female_raw.expnum.dropna())

In [None]:
df_female_raw.expnum.isnull()

In [None]:
test = pd.DataFrame()
test['iid'] = df_female_raw.iid.unique()
test

In [None]:
test['expnum'] = getValueSet('expnum', df_female_raw)
test

In [None]:
len(test.expnum.dropna())

In [None]:
test['gender'] = getValueSet('gender', df_female_raw)
test

### Function to Convert Values from Old DF to New Condensed DF:

In [None]:
def ConvertDF(DF, df):  #DF = old dataframe    #df = new data frame
    df['iid'] = DF.iid.unique()  #returning iid# (1 unique value per person - 1 row per subject)
    df['gender'] = getValueSet('gender', DF) #returning 1 row per iid with subject's gender
    df['age'] = getValueSet('age', DF) #returning 1 row per iid with subject's age
    df['met_count'] = getValueSet('met_count', DF) #returning 1 row per iid with met_count info (how many people each person met with)
    df['exphappy'] = getValueSet('exphappy', DF) #returning rating for exphappy per iid 
    df['expnum'] = getValueSet('expnum', DF) #returning expnum per iid (1 value)
    df['match_es'] = getValueSet('match_es', DF) 
    
    df['attr_iMeasUp_1'] = getValueSet('attr_iMeasUp_1', DF)
    df['sinc_iMeasUp_1'] = getValueSet('sinc_iMeasUp_1', DF)
    df['intel_iMeasUp_1'] = getValueSet('intel_iMeasUp_1', DF)
    df['fun_iMeasUp_1'] = getValueSet('fun_iMeasUp_1', DF)
    df['amb_iMeasUp_1']= getValueSet('amb_iMeasUp_1', DF)
    
    df['attr_iMeasUp_2'] = getValueSet('attr_iMeasUp_2', DF)
    df['sinc_iMeasUp_2'] = getValueSet('sinc_iMeasUp_2', DF)
    df['intel_iMeasUp_2'] = getValueSet('intel_iMeasUp_2', DF)
    df['fun_iMeasUp_2'] = getValueSet('fun_iMeasUp_2', DF)
    df['amb_iMeasUp_2']= getValueSet('amb_iMeasUp_2', DF)
    
    df['attr_oPercveMe_1'] = getValueSet('attr_oPercveMe_1', DF)
    df['sinc_oPercveMe_1'] = getValueSet('sinc_oPercveMe_1', DF)
    df['intel_oPercveMe_1'] = getValueSet('intel_oPercveMe_1', DF)
    df['fun_oPercveMe_1'] = getValueSet('fun_oPercveMe_1', DF)
    df['amb_oPercveMe_1'] = getValueSet('amb_oPercveMe_1', DF)
    
    df['attr_oPercveMe_2'] = getValueSet('attr_oPercveMe_2', DF)
    df['sinc_oPercveMe_2'] = getValueSet('sinc_oPercveMe_2', DF)
    df['intel_oPercveMe_2'] = getValueSet('intel_oPercveMe_2', DF)
    df['fun_oPercveMe_2'] = getValueSet('fun_oPercveMe_2', DF)
    df['amb_oPercveMe_2'] = getValueSet('amb_oPercveMe_2', DF)
    
    df['attr_iRateMe_exp'] = getValueSet('attr_iRateMe_exp', DF)
    df['attr_iRateMe_exp'] = getValueSet('attr_iRateMe_exp', DF)
    df['intel_iRateMe_exp'] = getValueSet('intel_iRateMe_exp', DF)
    df['fun_iRateMe_exp'] = getValueSet('fun_iRateMe_exp', DF)
    df['amb_iRateMe_exp'] = getValueSet('amb_iRateMe_exp', DF)
    
    df['match_sum'] = getSum('match', DF)
    df['dec_sum'] = getSum('dec', DF) #sum of subject's decisions (num of 'yes'')
    df['dec_o_sum'] = getSum('dec_o', DF) #sum of parnter's decisions (num of 'yes'')
    
    df['match_es_ave'] = getAve('match_es', DF)  #MATCH_ES_AVE  = % of people they think they'll match with
    df['like_ave'] = getAveScore('like', DF)  #LIKE_AVE ##?
    df['like_o_ave'] = getAveScore('like_o', DF)  #LIKE_O_AVE ##?
    df['prob_ave'] = getAveScore('prob', DF)  #PROB_AVE ##?
    df['prob_o_ave'] = getAveScore('prob_o', DF)  #PROB_O_AVE ##?
       
    df['dec_ave'] = getAveScore('dec', DF)  #DEC_AVE (average decision of subject to want to date)
    df['dec_o_ave'] = getAveScore('dec_o', DF)  #DEC_AVE (average dec of partners to want to date subject)
    df['match_ave'] = getAveScore('match', DF) #MATCH_AVE (average match count/#of people met with)
    
    df['expnum_ave'] = expAve('expnum', 20, df)
    df['exphappy_ave'] = expAve('exphappy', 10, df)
    
    df['attr_ave'] = getAveScore('attr', DF) 
    df['sinc_ave'] = getAveScore('sinc', DF)  
    df['intel_ave'] = getAveScore('intel', DF) 
    df['fun_ave'] = getAveScore('fun', DF) 
    df['amb_ave'] = getAveScore('amb',  DF) 
    df['shar_ave'] = getAveScore('shar', DF) 
    
    df['attr_o_ave'] = getAveScore('attr_o', DF) 
    df['sinc_o_ave'] = getAveScore('sinc_o', DF)  
    df['intel_o_ave'] = getAveScore('intel_o', DF) 
    df['fun_o_ave'] = getAveScore('fun_o', DF) 
    df['amb_o_ave'] = getAveScore('amb_o', DF) 
    df['shar_o_ave'] = getAveScore('shar_o', DF) 
    
    return df

### *Female Dataset*

In [None]:
#create empty DF
df_female_condensed = pd.DataFrame()

In [None]:
#Load condensed data into new df
ConvertDF(df_female_raw,df_female_condensed)

### *Male Dataset*

Create empty DF:

In [None]:
#create empty DF
df_male_condensed = pd.DataFrame()

In [None]:
#Load condensed data into new df
ConvertDF(df_male_raw,df_male_condensed)

## Set index to iid

In [None]:
#df = df.set_index('iid')

# NOTES ABOUT cleanDF FUNCTION: STEPS LISTED OUT

### dont run here

In [None]:
#looking where partner did not rate subject on attractiveness

attr_o_nan = subset_df_clean[subset_df_clean.attr_o.isnull()]
attr_o_nan

In [None]:
#returning rows that have no values in attr_o but have values in other ratings by partner 
#e.g. snce_o, intel_o...etc. 

df_attr_o_fillNaN = attr_o_nan.drop(df_nan_atr_o.index)
df_attr_o_fillNaN

In [None]:
#replacing all NaN values in atr_o with 0 (data that has info in other feature_o data)
fillNaN('attr_o', df_attr_o_fillNaN) 
df_attr_o_fillNaN 

In [None]:
# Removing old values 
subset_df_clean_edit = subset_df_clean_edit.drop(df_attr_o_fillNaN.index)
len(subset_df_clean_edit)

In [None]:
#Adding new values in 
subset_df_clean_edit = pd.concat([subset_df_clean_edit, df_attr_o_fillNaN])
len(subset_df_clean_edit)