### In this Notebook I focus on pipline design, the first step in making deployment of the ML more robust 

Following up on what I did in base model 0, The workflow is as follows

### 1. using piplines

1. Scaling Data 
2. PCA and k best
3. RandomForestClassifier(), LogisticRegression(), SVC(), (which ever has better results)
4. Hyperparamters tunning using Gridsearch 

### 2. API Creation (Seperate .py file)



In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns  
from sklearn.metrics import f1_score
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest

In [25]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [26]:
df = df.drop(columns = ['Loan_ID', 'Dependents'])
df.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [27]:
# replace nulls with mean
df = df.fillna(df.mean().iloc[0])

In [28]:
# Change categorical into dummy variables
df['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)
df['Married'].replace({'Yes':1,'No':0},inplace=True)
df['Gender'].replace({'Male':1,'Female':0},inplace=True)
df['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)
df['Property_Area'].replace({'Urban':2,'Semiurban':1,'Rural':0},inplace=True)
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

In [29]:
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1.0,0.0,1,0.0,5849,0.0,5403.459283,360.0,1.0,2,1
1,1.0,1.0,1,0.0,4583,1508.0,128.000000,360.0,1.0,0,0
2,1.0,1.0,1,1.0,3000,0.0,66.000000,360.0,1.0,2,1
3,1.0,1.0,0,0.0,2583,2358.0,120.000000,360.0,1.0,2,1
4,1.0,0.0,1,0.0,6000,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,1,0.0,2900,0.0,71.000000,360.0,1.0,0,1
610,1.0,1.0,1,0.0,4106,0.0,40.000000,180.0,1.0,0,1
611,1.0,1.0,1,0.0,8072,240.0,253.000000,360.0,1.0,2,1
612,1.0,1.0,1,0.0,7583,0.0,187.000000,360.0,1.0,2,1


In [9]:
# Split the data set into training and test data 
X = df.drop(columns='Loan_Status')
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, stratify=y)

In [19]:
# Pipline design

feature_union = FeatureUnion([('pca', PCA()), 
                              ('select_best', SelectKBest())])


pipe = Pipeline(steps=[('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', 'passthrough')])

# Find the best hyperparameters and model using GridSearchCV on the train set
param_grid = {'classifier': [RandomForestClassifier(), LogisticRegression(), SVC()],    # Which is better, Logistic Regression or an SVM Classifier?
              'features__pca__n_components': [3, 5],
              'features__select_best__k': [1, 3, 6]}

# create a Grid Search object
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)


In [18]:
best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy:\n\t {best_acc}\nAchieved with hyperparameters:\n\t {best_hyperparams}')

Best test set accuracy:
	 0.6747967479674797
Achieved with hyperparameters:
	 {'classifier': LogisticRegression(), 'features__pca__n_components': 5, 'features__select_best__k': 6}


In [22]:
import pickle
pickle.dump( grid, open( "model.pkl", "wb" ) )

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest

# numeric columns = ['sex', 'smoker', 'day', 'time']
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean'))])

# catergorical_columns = ['sex', 'smoker', 'day', 'time']

categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                  ('one-hot-encode', OneHotEncoder(sparse=False))])

Feature_Transformation = ColumnTransformer([('numeric', numeric_transform, [Numeric_Columns]), 
                                        ('categorical', categorical_transform, catergorical_columns)])


# pipeline_tips = Pipeline([('preprocessing', Feature_Transformation), 
#                           ('model', LinearRegression())])

# pipeline_tips.fit(X_train_tips, y_train_tips)

# r2 = pipeline_tips.score(X_test_tips, y_test_tips)
# print(f'Test set r^2: {r2}')

---
## Hyperparameter tuning with pipelines
Normally, if we want to tune hyperparameters using something like `GridSearchCV`, we need to pass it:
1. A model object.
2. A dictionary of (parameter name, list of values to try) pairs.

When not using pipelines, we can only tune hyperparameters for a single model (the one we specify as the
model in `GridSearchCV`. As we've seen, however, we can create composite models using `Pipeline`. We can
then pass this composite model to `GridSearchCV` and tune hyperparameters for multiple components at once.

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

feature_union = FeatureUnion([('pca', PCA()), 
                              ('select_best', SelectKBest())])

pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', RidgeClassifier())])

# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'classifier__alpha': [0.001, 0.01, 0.1], 
              'features__pca__n_components': [3, 5],
              'features__select_best__k': [1, 3, 6]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy:\n\t {best_acc}\nAchieved with hyperparameters:\n\t {best_hyperparams}')

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[('preprocessing', Feature_Transformation),
                           ('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', 'passthrough')]) #LogisticRegression())])


# Find the best hyperparameters and model using GridSearchCV on the train set
param_grid = {'classifier': [RandomForestClassifier(), LogisticRegression(), SVC()],    # Which is better, Logistic Regression or an SVM Classifier?
              'features__pca__n_components': [3, 5],
              'features__select_best__k': [1, 3, 6]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy:\n\t {best_acc}\nAchieved with hyperparameters:\n\t {best_hyperparams}')