# Homework w05d02 - Pipelines

#### Load the graduate school admissions dataset. The aim of this exercise is to create a pipeline to chain various data transformations and to use gridsearch for logistic regression. 

In [1]:
import numpy as np
import pandas as pd

from sklearn import model_selection, linear_model, metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

df = pd.read_csv("../week_4/w04_d04/admissions.csv").dropna()
df['prestige'] = df['prestige'].astype(int)
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.0,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4


#### 1. In class you saw the GetDummiesTransformer. Modify this transformer in such a way that in a given dataframe it replaces the categorical variable (prestige) with the corresponding dummy variables dropping the one which is redundant.

In [2]:
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            return pd.get_dummies(X[self.columns], columns = self.columns)
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self


gdt = GetDummiesTransformer(['prestige'])
gdt.fit_transform(df).head()

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0


#### Answer:

In [3]:
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            Y = pd.get_dummies(X[self.columns], columns = self.columns, drop_first=True)
            X = X.join(Y)
            X = X.drop(self.columns, axis=1)
            return X
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self


gdt = GetDummiesTransformer(['prestige'])
gdt.fit_transform(df).head()

Unnamed: 0,admit,gre,gpa,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0.0,1.0,0.0
1,1,660.0,3.67,0.0,1.0,0.0
2,1,800.0,4.0,0.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,1.0
4,0,520.0,2.93,0.0,0.0,1.0


#### 2. Construct a class which takes a dataframe as an input and returns the response and the feature matrix. Do this by modifying the ColumnSelector from the lab.

In [4]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(X[self.columns])
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self, X, *_):
        return self



#### Answer

In [5]:
class Get_ResponseandFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            y = X[self.columns]
            X = X.drop(self.columns, axis=1)
            return X, y
        else:
            raise TypeError("This transformer only works with Pandas Dataframes")
    
    def fit(self,X, y, *_):
        return self
    
    
grf = Get_ResponseandFeatures(['admit'])
X1, y1 = grf.transform(gdt.fit_transform(df))
print y1.head()
X1.head()

   admit
0      0
1      1
2      1
3      1
4      0


Unnamed: 0,gre,gpa,prestige_2,prestige_3,prestige_4
0,380.0,3.61,0.0,1.0,0.0
1,660.0,3.67,0.0,1.0,0.0
2,800.0,4.0,0.0,0.0,0.0
3,640.0,3.19,0.0,0.0,1.0
4,520.0,2.93,0.0,0.0,1.0


#### 3. Construct a pipeline which combines the two classes constructed in 1 and 2 and returns from a given dataframe the response vector and the feature matrix.

In [6]:
# columns or resp can be both, single strings or lists of strings, 
# but if you choose resp to be a list, you will have to transform y to a pandas
# series with y = pd.Series(y['admit']) before using logistic regression.
columns = ['prestige']
resp = 'admit'

pipe1 = Pipeline(steps=[('dummies', GetDummiesTransformer(columns)), 
                ('respfeat',Get_ResponseandFeatures(resp))])

X,y = pipe1.fit_transform(df,resp)


model = linear_model.LogisticRegression()
model.fit(X,y)
print model.score(X,y)
# do the following if resp is a list like resp = ['admit]
#y = pd.Series(y['admit'])

0.705289672544


#### 4. Feed the result into a logistic regression fit. Use GridSearchCV to find the best model fitting the data. Your gridsearch should use cross validation and a parameter grid for your choice of the input parameters of the logistic regression model (e.g. the penalty, fit with intercept, ...).

In [7]:
gs = model_selection.GridSearchCV(
    estimator = model,
    param_grid = {'C': [10**-i for i in range(-2, 4)],
                 'class_weight': [None, 'balanced'],
                 'penalty':['l1','l2'],
                 'fit_intercept':[True,False]},
    cv = 10,
    scoring = 'roc_auc'
    )

gs.fit(X, y)
print gs.best_params_, gs.best_score_
gs.grid_scores_

{'penalty': 'l2', 'C': 1, 'fit_intercept': True, 'class_weight': 'balanced'} 0.680733810457




[mean: 0.67654, std: 0.10244, params: {'penalty': 'l1', 'C': 100, 'fit_intercept': True, 'class_weight': None},
 mean: 0.67622, std: 0.10112, params: {'penalty': 'l2', 'C': 100, 'fit_intercept': True, 'class_weight': None},
 mean: 0.65264, std: 0.10844, params: {'penalty': 'l1', 'C': 100, 'fit_intercept': False, 'class_weight': None},
 mean: 0.65386, std: 0.10693, params: {'penalty': 'l2', 'C': 100, 'fit_intercept': False, 'class_weight': None},
 mean: 0.67731, std: 0.10399, params: {'penalty': 'l1', 'C': 100, 'fit_intercept': True, 'class_weight': 'balanced'},
 mean: 0.67674, std: 0.10435, params: {'penalty': 'l2', 'C': 100, 'fit_intercept': True, 'class_weight': 'balanced'},
 mean: 0.66706, std: 0.10688, params: {'penalty': 'l1', 'C': 100, 'fit_intercept': False, 'class_weight': 'balanced'},
 mean: 0.66676, std: 0.10696, params: {'penalty': 'l2', 'C': 100, 'fit_intercept': False, 'class_weight': 'balanced'},
 mean: 0.67712, std: 0.10234, params: {'penalty': 'l1', 'C': 10, 'fit_interc