<a href="https://colab.research.google.com/github/Mraghuvaran/Regreession-models/blob/master/Pipelining_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-Processing

In [0]:
from google.colab import drive

# Attach Google Drive to the Colab runtime so files stored there can be read
# through the '/content/drive' mount point.
drive.mount(mountpoint='/content/drive')

### Converting objects to categories 

In [0]:
# Columns of `train` that should be stored with the pandas "category" dtype.
# TODO: fill in the column names; with an empty list this cell is a no-op.
# (The original placeholder comment sat inside the brackets and swallowed the
# closing `]:`, which made the cell a SyntaxError.)
cols_to_category = []

for col in cols_to_category:
    train[col] = train[col].astype('category')


In [0]:
# Name of the label column in `train`.
# TODO(review): 'target' is a guess based on the submission column written at the
# end of the notebook -- confirm it matches the training data.
target_col = 'target'

# Feature columns only: the label must not be handed to the preprocessor.
# errors='ignore' keeps this a no-op when `train` has no such column.
feature_cols = train.columns.drop(target_col, errors='ignore')

# Categorical features: feature columns that carry the pandas "category" dtype.
cat_attr = list(train[feature_cols].select_dtypes("category").columns)
# Every remaining feature column is treated as numeric.
num_attr = list(feature_cols.difference(cat_attr))

### Columns with missing values

In [0]:
# Names of the columns in `train` that contain at least one null value.
missing_cols = train.columns[train.isna().any(axis=0)]
print(missing_cols)

### Importing the Required packages

In [0]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

### Initiating the pipeline

In [0]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # switch to 'mean' if there are no outliers
    ('scaler', StandardScaler())])

# Categorical features: fill gaps with the most frequent category, then one-hot encode.
# `fill_value` only takes effect with strategy='constant', so it is not passed here.
# For an explicit "missing" level use SimpleImputer(strategy='constant', fill_value='missing').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column list to its transformer; reused as the first step of every model pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr)])

In [0]:
# Baseline model: the shared preprocessing step followed by a default LogisticRegression.
clf_logreg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

### Splitting the data - Dependent & Independent Variables


In [0]:
# Separate the features (X) from the label (y).
# TODO(review): 'target' is assumed from the submission column written at the end
# of the notebook -- confirm it is the label column of `train`.
target_col = 'target'
X = train.drop(columns=[target_col])
y = train[target_col]

# Unlabelled data that is scored for the submission file.
# TODO: `test_data` is not loaded anywhere in this notebook -- read it before this cell.
X_test = test_data

In [0]:
# Hold out a validation set from the labelled data. Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=12,
)

# Model Building

## Model-1 -- Logistic Regression Model 




In [0]:
# Fit preprocessing + logistic regression on the training split only.
clf_logreg.fit(x_train, y_train)

train_pred_LR = clf_logreg.predict(x_train)
val_pred_LR = clf_logreg.predict(x_val)

# Validation predictions are scored against y_val, the labels produced by the
# train/validation split (the previous `y_test` is not defined in this notebook).
print("Train accuarcy: {} ." .format(accuracy_score(y_true=y_train, y_pred=train_pred_LR)))
print("Validation Accuracy: {}. " .format(accuracy_score(y_true=y_val, y_pred=val_pred_LR)))

## Model-2 --Decision Tree Model


In [0]:

%%time
clf_dt = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

dt_param_grid = {'classifier__criterion': ['entropy', 'gini'],
                 'classifier__max_depth': [5,6,7,8,9,10],
                 "classifier__min_samples_split": [2,5],
                 "classifier__min_samples_leaf": [1,3,5]}

dt_grid = GridSearchCV(clf_dt, param_grid=dt_param_grid, n_jobs=-1, cv=5)

dt_grid.fit(x_train,y_train)

print(dt_grid.best_params_)

train_pred = dt_grid.predict(x_train)
test_pred = dt_grid.predict(x_test)

print("Train Accuracy: {} ." .format(dt_grid.accuracy_score(x_train, y_train)))
print("Validation Accuracy: {} ." .format(dt_grid.accuracy_score(x_test, y_test)))


## Model-3 --Build Random Forest Model (Using Stratified KFold)

### __Stratified K-Folds cross-validator__

#### This cross-validation object is a **variation** of KFold that returns stratified folds. The folds are made by **preserving the percentage of samples for each class**.


In [0]:
%%time
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"classifier__n_estimators" : [30,40,50],
              "classifier__max_depth" : [2,3,5,6],
              "classifier__max_features" : [3, 5, 7],
              "classifier__min_samples_leaf" : [4, 6, 8, 10]}

rf_grid = GridSearchCV(clf, param_grid=dt_param_grid, cv=kfold)


rf_grid.fit(x_train,y_train)

print(rf_grid.best_params_)

train_pred = rf_grid.predict(x_train)
test_pred = rf_grid.predict(x_test)

print("Train Accuarcy: {} ." .format(rf_grid.accuracy_score(x_train, y_train)))
print("Validation accuracy: {} ." .format(rf_grid.accuracy_score(x_test, y_test)))



## Model-4 -- XGBoost

In [0]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [0]:

%%time
clf = Pipeline(steps=[('preprocessor', preprocessor),
               ('classifier', xgb.XGBClassifier())])


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"classifier__n_estimators" : [70,100],
              "classifier__max_depth" : [3,5,6,7],
              "classifier__colsample_bytree":[0.7,.8],
              "classifier__learning_rate": [0.001,0.01,0.1],
              "classifier__subsample":[0.8,0.6]}

xg_grid = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, cv=kfold)


xg_grid.fit(x_train,y_train)

print(xg_grid.best_params_)

train_pred = xg_grid.predict(x_train)
test_pred = xg_grid.predict(x_val)

print("Train Accuracy: {}." .format(xg_grid.accuracy_score(x_train, y_train)))
print("Validation Accuracy: {}." .format(xg_grid.accuracy_score(x_val, y_val)))


## Model-5 --Build Gradient Boosting 

In [0]:

%%time
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('GBM',GradientBoostingClassifier())])

gbm_param_grid = {'GBM__max_depth': [2,3,4],
                  'GBM__subsample': [0.8, 0.6,],
                  'GBM__max_features':[0.2, 0.3], 
                  'GBM__n_estimators': [10, 20, 30]}

gbm_grid = GridSearchCV(clf, param_grid=gbm_param_grid, n_jobs =-1, cv=5)

gbm_grid.fit(x_train,y_train)

print("The Best parameters are: {} .".format(gbm_grid.best_params_))

train_pred = gbm_grid.predict(x_train)
test_pred = gbm_grid.predict(x_val)

print(gbm_grid.score(x_train, y_train))
print(gbm_grid.score(x_val, y_val))


{'GBM__max_depth': 4, 'GBM__max_features': 0.2, 'GBM__n_estimators': 30, 'GBM__subsample': 0.8}
0.8593077247783875
0.8555147524376345
CPU times: user 1min 52s, sys: 459 ms, total: 1min 53s
Wall time: 1min 53s


## Predicting on test Data & storing values.

In [0]:
# `best_model` was never assigned in this notebook, so this cell failed on a fresh run.
# TODO(review): the gradient-boosting search is used as a placeholder -- replace it
# with whichever fitted search/pipeline scored best on the validation split.
best_model = gbm_grid

# Predict labels for the unlabelled test data.
best_model_pred = best_model.predict(X_test)

print(best_model_pred.shape)

# TODO: replace the placeholder below with the real path of the sample submission file.
sample_sub_file = pd.read_csv("#paste the path of the submission file")

# Overwrite the submission's 'target' column with the predictions and save it.
sample_sub_file['target'] = best_model_pred

sample_sub_file.to_csv("final_pred1.csv", index=False)