# Homework w05d02 - Pipelines

#### Load the graduate school admissions dataset. The aim of this exercise is to create a pipeline to chain various data transformations and to use gridsearch for logistic regression. 

In [85]:
import numpy as np
import pandas as pd

from sklearn import model_selection, linear_model, metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

# Import data, adjust your path
df = pd.read_csv("admissions.csv").dropna()
df['prestige'] = df['prestige'].astype(int)
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.0,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4


#### 1. In class you saw the GetDummiesTransformer. Modify this transformer in such a way that in a given dataframe it replaces the categorical variable (prestige) with the corresponding dummy variables dropping the one which is redundant.

In [104]:
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            dummies_df = pd.DataFrame(pd.get_dummies(X[self.columns],columns = self.columns))
            X_df = X.drop(self.columns, axis = 1)
            return pd.concat([X_df,dummies_df],axis=1)
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self


gdt = GetDummiesTransformer(['prestige'])
gdt.fit_transform(df).head()

Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0.0,0.0,1.0,0.0
1,1,660.0,3.67,0.0,0.0,1.0,0.0
2,1,800.0,4.0,1.0,0.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,0.0,1.0
4,0,520.0,2.93,0.0,0.0,0.0,1.0


#### Answer:

In [None]:
# I modified the function above directly

#### 2. Construct a class which takes a dataframe as an input and returns the response and the feature matrix. Do this by modifying the ColumnSelector from the lab.

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(X[self.columns])
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self



#### Answer

In [105]:
class ResponseFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, response):
        self.response = response
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            X_response = pd.DataFrame(X[self.response])
            X_predictors = pd.DataFrame(X.drop(self.response,axis=1))
            return (X_predictors,X_response)
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self

rfs = ResponseFeatureSelector('admit')
rfs.fit_transform(df)

(       gre   gpa  prestige
 0    380.0  3.61         3
 1    660.0  3.67         3
 2    800.0  4.00         1
 3    640.0  3.19         4
 4    520.0  2.93         4
 5    760.0  3.00         2
 6    560.0  2.98         1
 7    400.0  3.08         2
 8    540.0  3.39         3
 9    700.0  3.92         2
 10   800.0  4.00         4
 11   440.0  3.22         1
 12   760.0  4.00         1
 13   700.0  3.08         2
 14   700.0  4.00         1
 15   480.0  3.44         3
 16   780.0  3.87         4
 17   360.0  2.56         3
 18   800.0  3.75         2
 19   540.0  3.81         1
 20   500.0  3.17         3
 21   660.0  3.63         2
 22   600.0  2.82         4
 23   680.0  3.19         4
 24   760.0  3.35         2
 25   800.0  3.66         1
 26   620.0  3.61         1
 27   520.0  3.74         4
 28   780.0  3.22         2
 29   520.0  3.29         1
 ..     ...   ...       ...
 370  540.0  3.77         2
 371  680.0  3.76         3
 372  680.0  2.42         1
 373  620.0  3.37   

#### 3. Construct a pipeline which combines the two classes constructed in 1 and 2 and returns from a given dataframe the response vector and the feature matrix.

In [106]:
#Pipeline(steps=[gdt,rfs])
pipe1 = Pipeline([('gft', gdt), ('rfs', rfs)])
X,y = pipe1.fit_transform(df,['admit'])

In [107]:
X.head()

Unnamed: 0,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,380.0,3.61,0.0,0.0,1.0,0.0
1,660.0,3.67,0.0,0.0,1.0,0.0
2,800.0,4.0,1.0,0.0,0.0,0.0
3,640.0,3.19,0.0,0.0,0.0,1.0
4,520.0,2.93,0.0,0.0,0.0,1.0


#### 4. Feed the result into a logistic regression fit. Use GridSearchCV to find the best model fitting the data. Your gridsearch should use cross validation and a parameter grid for your choice of the input parameters of the logistic regression model (e.g. the penalty, fit with intercept, ...).

In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


logreg_f = LogisticRegression()
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']
gs = GridSearchCV(logreg_f, {'penalty':penalties, 'C':C_vals}, verbose=True, cv=5, scoring='accuracy')
gs.fit(X,np.ravel(y))
gs.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.3s finished


{'C': 100.0, 'penalty': 'l2'}

In [127]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

gs_logreg = LogisticRegression(C=gs.best_params_["C"], penalty=gs.best_params_["penalty"])
gs_logreg.fit(X, y)
Y_ = gs_logreg.predict(X)

print(classification_report(y, Y_))

conmat = confusion_matrix(y, Y_, labels=gs_logreg.classes_)
confusion = pd.DataFrame(conmat, index=['Failed', 'Passed'],columns=['predicted_Failed', 'predicted_Passed'])
confusion

             precision    recall  f1-score   support

          0       0.72      0.93      0.81       271
          1       0.60      0.22      0.32       126

avg / total       0.68      0.71      0.66       397



Unnamed: 0,predicted_Failed,predicted_Passed
Failed,252,19
Passed,98,28
