<a href="https://colab.research.google.com/github/Mraghuvaran/Regreession-models/blob/master/income_classification_pipelining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive into the Colab runtime at /content/drive; the CSV files
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np 
import pandas as pd 

In [0]:
# Folder on the mounted Drive that holds the data files. Kept in a single
# constant so the location only has to be changed in one place.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/Income Dataset"

# Training data and the test data that is scored at the end of the notebook.
train = pd.read_csv(DATA_DIR + "/train_data.csv")
test = pd.read_csv(DATA_DIR + "/test_data.csv")

In [0]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [0]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [0]:
# How often each distinct value of the `gain` column occurs in the training frame.
train["gain"].value_counts()

In [0]:
# `index` and `tax_paid` are not useful as model inputs, so they are removed
# from both the training and the test frame.
#
# The frames are re-assigned (no inplace=True) and errors='ignore' is used so
# that re-running this cell is a no-op instead of a KeyError on the columns
# that are already gone.
unused_columns = ['index', 'tax_paid']

train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')



In [0]:
# (rows, columns) of the training frame.
train.shape

In [0]:
# (rows, columns) of the test frame.
test.shape

In [0]:
# Dtype of every column in the training frame.
train.dtypes

In [0]:
# Cast the categorical features, and the label, to pandas' `category` dtype.
categorical_columns = [
    'working_sector', 'qualification', 'years_of_education', 'loan_taken',
    'marital_status', 'occupation', 'relationship', 'ethnicity', 'gender',
    'country', 'target',
]

for column_name in categorical_columns:
    train[column_name] = train[column_name].astype('category')


In [0]:
# Split the feature names by dtype.
#
# The label (`target`) is stored as a category too, so it is excluded by name
# rather than with a positional pop(), which only removes the label when it
# happens to be the last categorical column.
cat_attr = [name for name in train.select_dtypes("category").columns
            if name != 'target']

# Everything that is neither a categorical feature nor the label is numeric.
num_attr = list(train.columns.difference(cat_attr + ['target']))

In [0]:
# Names of the categorical feature columns.
cat_attr

In [0]:
# Names of the remaining (non-categorical) feature columns.
num_attr

In [0]:
# List the training columns that contain at least one null value.
has_null = train.isnull().any()
missing_cols = train.columns[has_null]
print(missing_cols)

In [0]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score, precision_score

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by scikit-learn (for example
# about a solver not converging) -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [0]:
# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent level,
# then one-hot encode. Levels not seen during fit are ignored at transform
# time (handle_unknown='ignore').
#
# `fill_value` is only consulted when strategy='constant', so the previous
# fill_value='missing' had no effect and has been removed; the fitted
# behaviour is unchanged.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr),
    ])

In [0]:
# Pipeline for model 1: the shared preprocessing followed by a logistic
# regression with default settings.
logreg_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
clf_logreg = Pipeline(steps=logreg_steps)

In [0]:
# Separate the predictor columns from the `target` label of the training frame.
is_feature = train.columns != 'target'
X_train = train.loc[:, is_feature]
Y_train = train.loc[:, 'target']

# The test frame is used unchanged as the input for the final predictions.
X_test = test

In [0]:
# Hold out part of the labelled data for validation (default split size),
# stratified on the label so both parts keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    stratify=Y_train,
    random_state=12,
)

In [0]:
# Model 1: logistic regression, fitted on the training part of the split.
clf_logreg.fit(X=x_train, y=y_train)

In [0]:
# Accuracy of the logistic regression on the training part, then on the
# held-out part.
train_pred = clf_logreg.predict(x_train)
test_pred = clf_logreg.predict(x_test)

for actual, predicted in ((y_train, train_pred), (y_test, test_pred)):
    print(accuracy_score(y_true=actual, y_pred=predicted))

In [0]:
### _Build Decision Tree Model - 2_

%%time
clf_dt = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

dt_param_grid = {'classifier__criterion': ['entropy', 'gini'], 'classifier__max_depth': [6,8,10,12], 
                 "classifier__min_samples_split": [2, 10, 20],"classifier__min_samples_leaf": [1, 5, 10]}

dt_grid = GridSearchCV(clf_dt, param_grid=dt_param_grid, cv=5)

dt_grid.fit(x_train,y_train)

print(dt_grid.best_params_)

train_pred = dt_grid.predict(x_train)
test_pred = dt_grid.predict(x_test)

print(dt_grid.score(x_train, y_train))
print(dt_grid.score(x_test, y_test))


In [0]:
### _Build Random Forest Model - 3_ (Using Stratified KFold)

###__Stratified K-Folds cross-validator__

## This cross-validation object is a **variation** of KFold that returns stratified folds. The folds are made by **preserving the percentage of samples for each class**.

%%time
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"classifier__n_estimators" : [150, 250, 300],
              "classifier__max_depth" : [5,8,10],
              "classifier__max_features" : [3, 5, 7],
              "classifier__min_samples_leaf" : [4, 6, 8, 10]}

rf_grid = GridSearchCV(clf, param_grid=dt_param_grid, cv=kfold)


rf_grid.fit(x_train,y_train)

rf_grid.best_params_

train_pred = rf_grid.predict(x_train)
test_pred = rf_grid.predict(x_test)

print(rf_grid.score(x_train, y_train))
print(rf_grid.score(x_test, y_test))



In [0]:
### _Build Gradient Boosting - 4_
%%time
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('GBM',GradientBoostingClassifier())])

gbm_param_grid = {'GBM__max_depth': [8,10,12,14], 'GBM__subsample': [0.8, 0.6,], 'GBM__max_features':[0.2, 0.3], 
              'GBM__n_estimators': [10, 20, 30]}

gbm_grid = GridSearchCV(clf, param_grid=gbm_param_grid, cv=3)

gbm_grid.fit(x_train,y_train)

print(gbm_grid.best_params_)

train_pred = gbm_grid.predict(x_train)
test_pred = gbm_grid.predict(x_test)

print(gbm_grid.score(x_train, y_train))
print(gbm_grid.score(x_test, y_test))


In [0]:
# Predict labels for the test frame with the tuned gradient boosting model.
final_pred = gbm_grid.predict(X=X_test)

In [0]:
# Inspect the predictions produced for the test frame.
final_pred

In [0]:

# Wrap the predictions in a pandas Series (default integer index).
final_pred = pd.Series(data=final_pred)

In [0]:
# Name the prediction Series `target`.
# A Series has no columns, so Series.rename() does not accept `columns=`
# (that call raises TypeError); passing a scalar sets the Series name instead.
# rename() returns a new Series, so the result has to be assigned back.
final_pred = final_pred.rename("target")
final_pred.head()

In [0]:
# Load SampleSubmission.csv; its `target` column is overwritten with the
# model's predictions further down.
sample_submission_path = "/content/drive/My Drive/Colab Notebooks/Income Dataset/SampleSubmission.csv"
final_results = pd.read_csv(sample_submission_path)

In [0]:
# Preview the first five rows of the loaded sample submission.
final_results.head(n=5)

In [0]:
# Store the predictions in the `target` column; the Series is aligned to the
# frame on the row index.
final_results = final_results.assign(target=final_pred)

In [0]:
# Check the first five rows now that `target` holds the predictions.
final_results.head(n=5)

In [0]:
# Write the submission file to the working directory, without the row index.
final_results.to_csv(path_or_buf="FinalSubmission.csv", index=False)