# DS-SF-33 | Unit Project 3: Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [175]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

import statsmodels.formula.api as smf

from sklearn import linear_model

In [176]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'dataset-ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether an applicant was admitted.

In [61]:
# TODO
pd.crosstab(df.prestige,df.admit)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,28,33
2.0,95,53
3.0,93,28
4.0,55,12


In [18]:
pd.crosstab(df.prestige,df.admit,normalize=True)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.070529,0.083123
2.0,0.239295,0.133501
3.0,0.234257,0.070529
4.0,0.138539,0.030227


In [19]:
pd.crosstab(df.prestige,df.admit,normalize=True).sum()

admit
0    0.68262
1    0.31738
dtype: float64

In [20]:
pd.crosstab(df.prestige,df.admit,normalize='index')

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.459016,0.540984
2.0,0.641892,0.358108
3.0,0.768595,0.231405
4.0,0.820896,0.179104


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [177]:
# Let's rescast prestige as int....
df.prestige = df.prestige.astype(int)
df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [178]:
one_hot = pd.get_dummies(df.prestige)
one_hot

Unnamed: 0,1,2,3,4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
395,0,1,0,0
396,0,0,1,0
397,0,1,0,0
398,0,1,0,0


In [179]:
df = df.join(one_hot)
df

Unnamed: 0,admit,gre,gpa,prestige,1,2,3,4
0,0,380.0,3.61,3,0,0,1,0
1,1,660.0,3.67,3,0,0,1,0
2,1,800.0,4.00,1,1,0,0,0
3,1,640.0,3.19,4,0,0,0,1
4,0,520.0,2.93,4,0,0,0,1
...,...,...,...,...,...,...,...,...
395,0,620.0,4.00,2,0,1,0,0
396,0,560.0,3.04,3,0,0,1,0
397,0,460.0,2.63,2,0,1,0,0
398,0,700.0,3.65,2,0,1,0,0


> ### Question 3.  How many of these binary variables do we need for modeling?

Answer:  Binary variables are true/false values, or 0 or 1.  In this case we can take 2, 3, or 4 as our binary variables.

> ### Question 4.  Why are we doing this?

Answer:  1 can be removed (eg, keeping 2, 3, and 4 only) to reduce colinearity.  If 2, 3, and 4 are 0 then that row would indicate that the person went to a prestige school of 1.

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [180]:
# TODO
df.drop('prestige', axis=1, inplace=True)
print(df)


     admit    gre   gpa  1  2  3  4
0        0  380.0  3.61  0  0  1  0
1        1  660.0  3.67  0  0  1  0
2        1  800.0  4.00  1  0  0  0
3        1  640.0  3.19  0  0  0  1
4        0  520.0  2.93  0  0  0  1
..     ...    ...   ... .. .. .. ..
395      0  620.0  4.00  0  1  0  0
396      0  560.0  3.04  0  0  1  0
397      0  460.0  2.63  0  1  0  0
398      0  700.0  3.65  0  1  0  0
399      0  600.0  3.89  0  0  1  0

[397 rows x 7 columns]


## Part C.  Hand calculating odds ratios

Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether an applicant was admitted.

In [181]:
# TODO
Upper_prestige = df[df[1] == 1]
print (Upper_prestige)


     admit    gre   gpa  1  2  3  4
2        1  800.0  4.00  1  0  0  0
6        1  560.0  2.98  1  0  0  0
11       0  440.0  3.22  1  0  0  0
12       1  760.0  4.00  1  0  0  0
14       1  700.0  4.00  1  0  0  0
..     ...    ...   ... .. .. .. ..
368      0  580.0  4.00  1  0  0  0
372      1  680.0  2.42  1  0  0  0
373      1  620.0  3.37  1  0  0  0
383      0  660.0  4.00  1  0  0  0
385      0  420.0  3.02  1  0  0  0

[61 rows x 7 columns]


In [84]:
pd.crosstab(Upper_prestige[1],Upper_prestige.admit)

admit,0,1
1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,28,33


> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [85]:
# TODO
pd.crosstab(Upper_prestige[1],Upper_prestige.admit,normalize=True).sum()

admit
0    0.459016
1    0.540984
dtype: float64

> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [87]:
# TODO
Lower_prestige=df[df[1] == 0]
print (Lower_prestige)

     admit    gre   gpa  1  2  3  4
0        0  380.0  3.61  0  0  1  0
1        1  660.0  3.67  0  0  1  0
3        1  640.0  3.19  0  0  0  1
4        0  520.0  2.93  0  0  0  1
5        1  760.0  3.00  0  1  0  0
..     ...    ...   ... .. .. .. ..
395      0  620.0  4.00  0  1  0  0
396      0  560.0  3.04  0  0  1  0
397      0  460.0  2.63  0  1  0  0
398      0  700.0  3.65  0  1  0  0
399      0  600.0  3.89  0  0  1  0

[336 rows x 7 columns]


In [88]:
pd.crosstab(Lower_prestige[1],Lower_prestige.admit)

admit,0,1
1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,243,93


In [89]:
pd.crosstab(Lower_prestige[1],Lower_prestige.admit,normalize=True).sum()

admit
0    0.723214
1    0.276786
dtype: float64

> ### Question 9.  Finally, what's the odds ratio?

In [92]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [93]:
lm = LogisticRegression()

In [113]:
# TODO
lm.fit(df[[ 1]], df['admit'])
print (lm.coef_)
print (lm.intercept_)
print (df.admit.mean())



[[ 1.02810535]]
[-0.93151263]
0.31738035264483627


In [114]:
#odds ratio
print (np.exp(lm.coef_))

[[ 2.79576381]]


> ### Question 10.  Write this finding in a sentence.

Answer:  The above shows that you have a 179% increase chance of admittance if you come from a prestige school of 1 as opposed to any other prestige score.

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentence.

In [115]:
# TODO
Lowest_prestige=df[df[4] == 1]
print (Lowest_prestige)

     admit    gre   gpa  1  2  3  4
3        1  640.0  3.19  0  0  0  1
4        0  520.0  2.93  0  0  0  1
10       0  800.0  4.00  0  0  0  1
16       0  780.0  3.87  0  0  0  1
22       0  600.0  2.82  0  0  0  1
..     ...    ...   ... .. .. .. ..
329      0  500.0  2.93  0  0  0  1
337      0  620.0  3.09  0  0  0  1
340      0  500.0  3.23  0  0  0  1
342      0  500.0  3.95  0  0  0  1
375      0  560.0  3.49  0  0  0  1

[67 rows x 7 columns]


In [116]:
pd.crosstab(Lowest_prestige[4],Lowest_prestige.admit,normalize=True).sum()

admit
0    0.820896
1    0.179104
dtype: float64

In [117]:
lm.fit(df[[ 4]], df['admit'])
print (lm.coef_)
print (lm.intercept_)
print (df.admit.mean())

[[-0.80182622]]
[-0.64123288]
0.31738035264483627


In [118]:
#odds ratio
print (np.exp(lm.coef_))

[[ 0.44850914]]


Answer:  This shows that there is a 44.85 percent decrease in admittance if the prestige score is 4.

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model predicting admission into UCLA using `gre`, `gpa`, and the `prestige` of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [182]:
# TODO
#dropping prestige = 1
df2=df.drop(1, axis=1, inplace=True)
print(df)



     admit    gre   gpa  2  3  4
0        0  380.0  3.61  0  1  0
1        1  660.0  3.67  0  1  0
2        1  800.0  4.00  0  0  0
3        1  640.0  3.19  0  0  1
4        0  520.0  2.93  0  0  1
..     ...    ...   ... .. .. ..
395      0  620.0  4.00  1  0  0
396      0  560.0  3.04  0  1  0
397      0  460.0  2.63  1  0  0
398      0  700.0  3.65  1  0  0
399      0  600.0  3.89  0  1  0

[397 rows x 6 columns]


In [221]:
df['intercept'] = 1

In [222]:
import statsmodels.api as sm
print(df)

     admit    gre   gpa  2  3  4  intercept
0        0  380.0  3.61  0  1  0          1
1        1  660.0  3.67  0  1  0          1
2        1  800.0  4.00  0  0  0          1
3        1  640.0  3.19  0  0  1          1
4        0  520.0  2.93  0  0  1          1
..     ...    ...   ... .. .. ..        ...
395      0  620.0  4.00  1  0  0          1
396      0  560.0  3.04  0  1  0          1
397      0  460.0  2.63  1  0  0          1
398      0  700.0  3.65  1  0  0          1
399      0  600.0  3.89  0  1  0          1

[397 rows x 7 columns]


In [185]:
#put variables within an index
var_cols = df.columns[1:]
print (var_cols)

Index(['gre', 'gpa', 2, 3, 4, 'intercept'], dtype='object')


In [223]:
logit = sm.Logit(df['admit'], df[var_cols])

In [224]:
results = logit.fit()

Optimization terminated successfully.
         Current function value: 0.573854
         Iterations 6


> ### Question 13.  Print the model's summary results.

In [225]:
# TODO
#Print results

print (results.summary())

                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  397
Model:                          Logit   Df Residuals:                      391
Method:                           MLE   Df Model:                            5
Date:                Wed, 19 Apr 2017   Pseudo R-squ.:                 0.08166
Time:                        16:10:06   Log-Likelihood:                -227.82
converged:                       True   LL-Null:                       -248.08
                                        LLR p-value:                 1.176e-07
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
gre            0.0022      0.001      2.028      0.043      7.44e-05     0.004
gpa            0.7793      0.333      2.344      0.019         0.128     1.431
2             -0.6801      0.317     -2.146      0.0

> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [202]:
# TODO
#odds ratio
import numpy as np
import pandas as pd
print (np.exp(results.params))

gre          1.002221
gpa          2.180027
2            0.506548
3            0.262192
4            0.211525
intercept    0.020716
dtype: float64


In [190]:
params = results.params
conf = results.conf_int()
print (conf)

                  0         1
gre        0.000074  0.004362
gpa        0.127619  1.431056
2         -1.301337 -0.058936
3         -2.014579 -0.662776
4         -2.371624 -0.735197
intercept -6.116077 -1.637631


In [191]:
#Confidence intervals at 2.5% and 95%
print (np.exp(conf))

                  0         1
gre        1.000074  1.004372
gpa        1.136120  4.183113
2          0.272168  0.942767
3          0.133377  0.515419
4          0.093329  0.479411
intercept  0.002207  0.194440


> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

Answer:  We can interpret this by saying that the odds of being admitted to decrease by 27.2% if the prestige school is 2.

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer:  We can interpret this by saying that the odds of being admitted increases by 13.6% with each unit increase in your GPA.

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [227]:
# TODO

#predict Tier 1
predicted1 = -3.8769  + 0.0022*800 + 0.7793*4 
print (predicted1)

#predict Tier 2
predicted2 = -3.8769  + 0.0022*800 + 0.7793*4 + -0.6801
print (predicted2)

#predict Tier 3
predicted3 = -3.8769  + 0.0022*800 + 0.7793*4 + -1.3387
print (predicted3)

#predict Tier 4
predicted4 = -3.8769  + 0.0022*800 + 0.7793*4 + -1.5534
print (predicted4)

1.0002999999999997
0.3201999999999997
-0.33840000000000026
-0.5531000000000001


Answer:

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [228]:
# TODO
logreg = linear_model.LogisticRegression(C=10 ** 2)

In [229]:
#create x and y variables to be put in the logit regression model
y = df['admit']
x = df[var_cols]
print (x)

       gre   gpa  2  3  4  intercept
0    380.0  3.61  0  1  0          1
1    660.0  3.67  0  1  0          1
2    800.0  4.00  0  0  0          1
3    640.0  3.19  0  0  1          1
4    520.0  2.93  0  0  1          1
..     ...   ... .. .. ..        ...
395  620.0  4.00  1  0  0          1
396  560.0  3.04  0  1  0          1
397  460.0  2.63  1  0  0          1
398  700.0  3.65  1  0  0          1
399  600.0  3.89  0  1  0          1

[397 rows x 6 columns]


In [230]:
#fit the data based on the x and y variables
logreg.fit(x, y)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [231]:
# TODO
print (logreg.coef_)
print (logreg.intercept_)
print (np.exp(logreg.coef_))


[[ 0.00209769  0.71809211 -0.72960187 -1.40029508 -1.60285974 -1.77303041]]
[-1.77303041]
[[ 1.0020999   2.05051732  0.48210089  0.24652421  0.20131997  0.16981759]]


Answer:  Numbers appear to be very close for the GRE and prestige scores but very different for GPA.

> ### Question 20.  Again, assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [232]:
# TODO
# TODO

#predict Tier 1
predicted1 = -1.77303041  + 0.00209769*800 + .71809211*4 + -0.72960187
print (predicted1)

#predict Tier 2
predicted2 = -1.77303041  + 0.00209769*800 + 0.71809211*4 + -1.40029508
print (predicted2)

#predict Tier 3
predicted3 = -1.77303041  + 0.00209769*800 + 0.71809211*4 + -1.60285974
print (predicted3)

#predict Tier 4
predicted4 = -1.77303041  + 0.00209769*800 + 0.71809211*4 + -1.77303041
print (predicted4)

2.0478881600000003
1.37719495
1.17463029
1.00445962


Answer: