### SPEED DATING EXPERIMENT (regression)

In [6]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, metrics, cross_validation, feature_selection

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')



pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

In [7]:
def linear_modeling_with_sklearn(X, y):
    """Fit a scikit-learn linear regression on (X, y) and print a short report.

    Parameters
    ----------
    X : regressors, passed unchanged to ``LinearRegression.fit`` and
        ``feature_selection.f_regression``.
    y : response values, passed alongside ``X``.

    Prints
    ------
    The per-regressor F-values and p-values, the R^2 of the fitted model on
    the same (X, y) it was trained on, the intercept and the coefficients.
    Nothing is returned.
    """
    model = linear_model.LinearRegression(fit_intercept = True)
    model.fit(X, y)

    # f_regression returns (F-values, p-values); run it once and reuse both.
    f_values, p_values = feature_selection.f_regression(X, y)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('F-statistic (performed for each regressor independently)')
    print('- F-value %s' % (f_values,))
    print('- p-value %s' % (p_values,))
    print('R^2 = %s' % (model.score(X, y),))
    print('Coefficients')
    print('- beta_0 (intercept) = %s' % (model.intercept_,))
    print('- beta_n (n > 0)     = %s' % (model.coef_,))

In [8]:
# Load the speed-dating CSV; the path is relative (one directory up, then
# CODE/speed-dating-experiment/), so it depends on the current working directory.
df = pd.read_csv(os.path.join('..', 'CODE', 'speed-dating-experiment', 'Speed Dating Data.csv'))

In [9]:
df

Unnamed: 0,iid,id,gender,idg,condtn,...,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,...,,,,,
1,1,1.0,0,1,1,...,,,,,
2,1,1.0,0,1,1,...,,,,,
3,1,1.0,0,1,1,...,,,,,
4,1,1.0,0,1,1,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,...,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,...,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,...,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,...,9.0,5.0,9.0,5.0,6.0


In [10]:
df.columns

Index([u'iid', u'id', u'gender', u'idg', u'condtn', u'wave', u'round',
       u'position', u'positin1', u'order',
       ...
       u'attr3_3', u'sinc3_3', u'intel3_3', u'fun3_3', u'amb3_3', u'attr5_3',
       u'sinc5_3', u'intel5_3', u'fun5_3', u'amb5_3'],
      dtype='object', length=195)

In [11]:
sorted(df.wave.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [12]:
df.wave.value_counts()

21    968
11    882
9     800
14    720
15    684
     ... 
10    162
16     96
20     84
18     72
6      50
Name: wave, dtype: int64

In [13]:
df.corr()

Unnamed: 0,iid,id,gender,idg,condtn,...,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
iid,1.000000,0.117109,0.051937,0.120362,0.220413,...,0.092325,-0.053698,0.006949,0.100164,0.055229
id,0.117109,1.000000,-0.011623,0.998029,0.318768,...,-0.018183,-0.007993,-0.088398,-0.046558,-0.119108
gender,0.051937,-0.011623,1.000000,0.033645,-0.002627,...,-0.103692,-0.289694,0.076538,-0.067432,0.055267
idg,0.120362,0.998029,0.033645,1.000000,0.324838,...,-0.020785,-0.022967,-0.086071,-0.050316,-0.118940
condtn,0.220413,0.318768,-0.002627,0.324838,1.000000,...,0.068552,-0.063407,0.068622,0.114254,0.145598
...,...,...,...,...,...,...,...,...,...,...,...
attr5_3,0.092325,-0.018183,-0.103692,-0.020785,0.068552,...,1.000000,0.197948,0.378176,0.406725,0.205368
sinc5_3,-0.053698,-0.007993,-0.289694,-0.022967,-0.063407,...,0.197948,1.000000,0.505440,0.331171,0.268553
intel5_3,0.006949,-0.088398,0.076538,-0.086071,0.068622,...,0.378176,0.505440,1.000000,0.270181,0.424972
fun5_3,0.100164,-0.046558,-0.067432,-0.050316,0.114254,...,0.406725,0.331171,0.270181,1.000000,0.398011


In [14]:
df['round'].value_counts()

18    1350
20    1160
22     968
21     882
10     880
      ... 
6      162
11     140
5       50
8       48
7       42
Name: round, dtype: int64

In [15]:
sorted(df['round'].unique())

[5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 18, 19, 20, 21, 22]

In [16]:
sorted(df.position.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

# Look at Match (1=yes, 0 = no)

In [22]:
df.corr().match

iid        -0.015806
id         -0.000770
gender     -0.000530
idg        -0.000544
condtn     -0.045960
              ...   
attr5_3     0.023098
sinc5_3    -0.031990
intel5_3   -0.013012
fun5_3      0.019793
amb5_3      0.008534
Name: match, dtype: float64

### Look at Match and Age

In [None]:
model = smf.ols(formula = 'match ~ age ', data = df).fit()
model.summary()

### Look at Match and Race

race_o:		race of partner

- Black/African American=1
- European/Caucasian-American=2
- Latino/Hispanic American=3
- Asian/Pacific Islander/Asian-American=4
- Native American=5
- Other=6


__imprace__: How important is it to you (on a scale of 1-10) that a person you date be of the same racial/ethnic background?


In [None]:
# race_o is a nominal category code (1-6), not a quantity, so it is wrapped
# in C() to get one indicator per category instead of a single numeric slope.
# race is treated the same way -- presumably it uses the same coding as
# race_o; verify against the data dictionary.
# imprace is a 1-10 importance rating and stays numeric.
model = smf.ols(formula = 'match ~ C(race_o) + C(race) + imprace', data = df).fit()
model.summary()

samerace:  	participant and the partner were the same race. 1= yes, 0=no

In [None]:
model = smf.ols(formula = 'match ~ samerace ', data = df).fit()
model.summary()

### Look at Match and Field

In [None]:
# field_cd is a nominal code (1-18) for the field of study; C() makes the
# formula expand it into one indicator per field instead of fitting a
# meaningless linear trend across the code numbers.
model = smf.ols(formula = 'match ~ C(field_cd)', data = df).fit()
model.summary()

**Note:** `field_cd` is a categorical code, so it has to be treated as dummy (indicator) variables rather than as a number.

field_cd: 	field coded 

- 1= Law 
- 2= Math
- 3= Social Science, Psychologist 
- 4= Medical Science, Pharmaceuticals, and Bio Tech 
- 5= Engineering  
- 6= English/Creative Writing/ Journalism 
- 7= History/Religion/Philosophy 
- 8= Business/Econ/Finance 
- 9= Education, Academia 
- 10= Biological Sciences/Chemistry/Physics
- 11= Social Work 
- 12= Undergrad/undecided 
- 13=Political Science/International Affairs 
- 14=Film
- 15=Fine Arts/Arts Administration
- 16=Languages
- 17=Architecture
- 18=Other

# Look at Age, Gender, Goal

### Gender:


- Female=0
- Male=1


In [None]:
df.gender.unique()

In [None]:
df.gender.value_counts()

In [None]:
len(df.gender.dropna())

### Age:

In [None]:
df.age.unique()

In [None]:
df.age.value_counts()

### Goal:

goal:
What is your primary goal in participating in this event? 
- Seemed like a fun night out=1
- To meet new people=2
- To get a date=3
- Looking for a serious relationship=4
- To say I did it=5
- Other=6


In [None]:
df.goal.value_counts()

In [None]:
df.goal.unique()

In [None]:
len(df.goal.dropna())

In [None]:
# Look at the goal distribution: number of missing answers first, then the
# row count for each goal code in ascending order.
print('%s %s' % (np.nan, df.goal.isnull().sum()))

# value_counts() skips NaN by default and tallies every code in one pass.
goal_counts = df.goal.value_counts().sort_index()
for goal_code, n_rows in zip(goal_counts.index, goal_counts.values):
    print('%s %s' % (goal_code, n_rows))

In [None]:
# Keep only the rows whose goal is one of the six documented answer codes;
# rows with a missing goal do not match and are dropped.
valid_goal_mask = df.goal.isin(list(range(1, 7)))
df = df[valid_goal_mask]

#counts for goal - NAN values --> only picking 1-6

In [None]:
# Re-count missing goals, then show the per-code counts.
# Single-argument print(...) emits the same text on Python 2 and 3.
print('%s %s' % (np.nan, df.goal.isnull().sum()))
df.goal.value_counts()

#removed all NAN values

In [None]:
df.goal.value_counts()

#checking that NAN values were removed

In [None]:
sorted(df.goal.unique())

#checking again

#### *Create Dummy Variables*

In [None]:
# Build one indicator column per distinct goal value; column names start
# with 'Goal_' followed by the value.
goal_df = pd.get_dummies(df.goal, prefix = 'Goal')
goal_df

In [None]:
# Rename the indicator columns to goal_1 ... goal_6.
#
# get_dummies names each column after the raw value, so the suffix reads
# '1.0' when goal is stored as float and '1' when it is stored as int.
# Parsing the suffix handles both spellings; a fixed 'Goal_1.0' mapping
# would silently rename nothing for integer data. Already-renamed columns
# map to themselves, so re-running this cell is harmless.
goal_df = goal_df.rename(
    columns = lambda name: 'goal_%d' % int(float(name.split('_', 1)[1])))

#### *Join dummy set with dataset and remove 'goal' column:*

In [None]:
# Attach the goal indicator columns to the dataset, then drop the original
# 'goal' column that they replace.
df = df.join([goal_df]).drop('goal', axis = 1)

### Match ~ Goal + Age + Gender * Age + Gender with StatsModels / sklearn  ???? --> DOESNT WORK?

In [None]:
# NOTE(review): both modelling attempts below are disabled. Likely reasons
# they failed (confirm by re-running):
#   - the formula refers to 'goal', but that column is dropped from df right
#     after the goal_1 ... goal_6 indicator columns are joined in;
#   - LinearRegression.fit rejects missing values, so X / y would need their
#     NaN rows removed first -- presumably 'age' has some; verify.

#X = df[ ['age', 'goal_1'] ]
#y = df.match

#linear_modeling_with_sklearn(X, y)

#smf.ols(formula = 'match ~ goal + age + gender + age * gender', data = df).fit().summary()

## Odds / OR of Goal & Gender

female = 0
male = 1


goal:
What is your primary goal in participating in this event? 
- Seemed like a fun night out=1
- To meet new people=2
- To get a date=3
- Looking for a serious relationship=4
- To say I did it=5
- Other=6



#### Goal 1: Seemed like a fun night out

In [None]:
pd.crosstab(df.gender, df.goal_1, margins=True)

Odds of being male & choosing to participate b/c it 'seemed like a fun night out' (which I'd argue indicates a casual, NOT long-term, intention): 1739:2435

Odds of being female & choosing to participate b/c it 'seemed like a fun night out' (casual): 1687:2438

__ODDS RATIO__: (1739/2435) / (1687/2438) = 1.03

*odds of a male looking for a casual night out : odds of being a female and looking for a casual night out*

1.03:1

    Men more likely to be participating for a casual night out but not by much

#### Goal 2: To meet new people

In [None]:
pd.crosstab(df.gender, df.goal_2, margins=True)

Odds of being male & choosing to participate to meet new people:  1365:2809

Odds of being female & choosing to participate to meet new people: 1647:2478

__ODDS RATIO__: (1365/2809) / (1647/2478) = .73

*odds of a male looking to meet new people : odds of being a female and looking to meet new people*

.73:1

    Men's odds of looking to meet new people are 27% lower than women's

#### Goal 3: To get a date

In [None]:
pd.crosstab(df.gender, df.goal_3, margins=True)

Odds of trying to get a date & being male:  434:3740

Odds of trying to get a date & being female:  197:3928

__ODDS RATIO__: (434/3740) / (197/3928) = 2.314

*odds of a male looking for a date : odds of being a female and looking for a date*

2.314:1

    More men looking for a date than women in this study

#### Goal 4: Looking for a serious relationship

In [None]:
pd.crosstab(df.gender, df.goal_4, margins=True)

Odds of looking for a serious relationship and being male:  172:4002

Odds of looking for a serious relationship and being female:  129:3996

ODDS RATIO: (172/4002) / (129/3996) = 1.33 

*odds of a male looking for a serious relationship : odds of being a female and looking for a serious relationship*

1.33:1

    More men looking for serious relationship than women in this study

#### Goal 5: To say I did it

In [None]:
pd.crosstab(df.gender, df.goal_5, margins=True)

Odds of being male & choosing to participate to 'say i did it':  234:3940

Odds of being female & choosing to participate to 'say i did it': 276:3849

__ODDS RATIO__: (234/3940) / (276/3849) = .83

*odds of a male participating to 'say he went on a speed dating experiment' : odds of being a female participating to 'say she went on a speed dating experiment'*

.83:1

    Men's odds of participating in the speed dating experiment just to say they have are 17% lower than women's
    (observation: perhaps speed dating is more geared towards women)

#### Goal 6: Other

In [None]:
pd.crosstab(df.gender, df.goal_6, margins=True)

Odds of choosing 'Other' if male: 230:3944

Odds of choosing 'Other' if female: 189:3936

__ODDS RATIO__: (230/3944) / (189/3936) = 1.21

*odds of a male selecting 'other' as reason for participating: odds of a female selecting 'other' as reason for participating*

1.21:1

    Men's odds of selecting 'other' as their reason for participating in the study are 1.21x those of women