# Code 6
- Preprocessing Improvement
- Decision Tree

## 1/ Import Libraries

In [3]:
#CodeSection1
from google.colab import drive
drive.mount('/mntDrive') 

Drive already mounted at /mntDrive; to attempt to forcibly remount, call drive.mount("/mntDrive", force_remount=True).


In [4]:
#CodeSection2
import pandas as pd
import numpy as np

## 2/ Import Data

In [43]:
#CodeSection3
train = pd.read_csv('/mntDrive/MyDrive/_Data Science/2 Cross Sell Prediction (Cohort)/input/train.csv')
test = pd.read_csv('/mntDrive/MyDrive/_Data Science/2 Cross Sell Prediction (Cohort)/input/test.csv')
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


## 3/ Combine both Datasets (train, test)

In [44]:
#CodeSection4
all_data = [train, test]

### Check Combined Data 
- Missing Value
- Data Types

In [45]:
#CodeSection5
# Combine both to check Missing Value
pd.concat([train, test], axis = 0).isnull().sum()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


## 4/ Preprocessing Improvement

### 4.1/ Identify Numerical and Categorical Features

In [46]:
#CodeSection6
# Identify all Numerical and Categorical features
numerical_features = ['Age', 'Annual_Premium', 'Vintage']
categorical_features = ['Gender','Driving_License', 'Region_Code', 'Previously_Insured','Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']

### 4.2/ Outlier Strategy

In [47]:
#CodeSection7

# Write a loop to do the same
for num_var in numerical_features:
  Q1 = train[num_var].quantile(0.25)
  Q3 = train[num_var].quantile(0.75)

  IQR = Q3-Q1

  Lower_Whisker = Q1 - 1.5*IQR
  Upper_Whisker = Q3 + 1.5*IQR

  train[num_var] = train[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)
  test[num_var] = test[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)

### 4.3/ Import Libraries
- Missing Value Imputation - SimpleImputer
- Preprocessing - StandardScaler, OrdinalEncoder
- Pipeline - make_pipeline, make_column_transformer
- Model - Decision Tree

In [48]:
#CodeSection8

# Import SimpleImputer
from sklearn.impute import SimpleImputer

# Import StandardScaler
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Import Decision tree
from sklearn.tree import DecisionTreeClassifier

### 4.4/ Build Pipeline

In [49]:
#CodeSection9
# Create Preprocessor Pipeline
preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy = 'median'),
                   StandardScaler()),numerical_features),
     (make_pipeline(SimpleImputer(strategy = 'most_frequent'),
                    OrdinalEncoder()),categorical_features))

### 4.5/ Divide Data into X and y

In [61]:
#CodeSection10
X = train.drop(['Response','id'], axis =  1)
y = train['Response']

## 5/ Build Model and Fit

In [62]:
#CodeSection11
# Create Model Pipeline and Initiate Model
model = make_pipeline(preprocessor,DecisionTreeClassifier(max_depth=5))

In [63]:
#CodeSection12 
# Fit Model
model.fit(X,y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

## 6/ Check Accuracy of Model on Train Data

In [53]:
#CodeSection13
# Predict on Train Data
y_pred = model.predict(X)

### We can use a Accuracy Function from Metrics
- Check Train Accuracy

In [54]:
#CodeSection14
# Import metrics library
from sklearn.metrics import accuracy_score

In [55]:
#CodeSection15
# get Actual "y" variables (use "y_true" as variable)
y_true = train['Response']

In [56]:
#CodeSection16
# Print Train Accuracy
print(f" Train Accuracy : {accuracy_score(y_true, y_pred):0.1%}")

 Train Accuracy : 87.7%


## 7/ Predict and Submission

### Predict on "Test Data"

In [57]:
#CodeSection17
# Get all the X Variables from the Test Dataset
X_test = test.drop(['id'], axis =  1)

In [58]:
#CodeSection18
# Predict on X_test Data ("X_test_prep")
X_test_prep = model.predict(X_test)

ValueError: ignored

### Create Submission File

In [None]:
#CodeSection19
submission = pd.DataFrame({
    'Loan_ID' : test['Loan_ID'],
    'Loan_Status' : X_test_prep
})

### Export Submission File

In [None]:
#CodeSection20
submission.to_csv('/mntDrive/MyDrive/Project MasterMind/1 - Planning Stage/2- Research/6- New Teaching Method/Loan Prediction (Webinar Flow)/Solution Codes/output/O6_DT_Pipeline.csv', index = False)

In [None]:
# LB Accuracy : 0.6805