# SkLearn Pipelines with Cross Validation Techniques.

# import classes and libraries

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

# Loading Data and initial checks

In [49]:
df = pd.read_csv('datasets/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [50]:
df.info

<bound method DataFrame.info of        age workclass  fnlwgt  ... hours.per.week  native.country income
0       90         ?   77053  ...             40   United-States  <=50K
1       82   Private  132870  ...             18   United-States  <=50K
2       66         ?  186061  ...             40   United-States  <=50K
3       54   Private  140359  ...             40   United-States  <=50K
4       41   Private  264663  ...             40   United-States  <=50K
...    ...       ...     ...  ...            ...             ...    ...
32556   22   Private  310152  ...             40   United-States  <=50K
32557   27   Private  257302  ...             38   United-States  <=50K
32558   40   Private  154374  ...             40   United-States   >50K
32559   58   Private  151910  ...             40   United-States  <=50K
32560   22   Private  201490  ...             20   United-States  <=50K

[32561 rows x 15 columns]>

# Split into categorical and numerical

In [51]:
num_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
cat_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Categorical Pipeline

In [52]:
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

# Numerical Pipeline

In [53]:
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Combined Preprocessing pipeline

In [62]:
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipeline, num_features),
                  ("cat", cat_pipeline, cat_features)
                  ]
)

# Split the dataset

In [55]:
x = df.drop(['income'], axis=1)
y = df.income

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# Get the models trained

In [56]:
log_res = LogisticRegression(class_weight="balanced", max_iter=10000)
rf = RandomForestClassifier(class_weight="balanced", n_estimators=100, random_state=42)

# Final pipelines

In [57]:
log_res_pipeline = make_pipeline(
    preprocessor,
    log_res
)

rf_pipeline = make_pipeline(
    preprocessor,
    rf
)

log_res_pipeline.fit(x_train, y_train)
y_log_res_pred = log_res_pipeline.predict(x_val)

rf_pipeline.fit(x_train, y_train)
y_rf_pred = rf_pipeline.predict(x_val)

# Evaluate

In [58]:
print("Logistic Regression: ")
print(classification_report(y_val, y_log_res_pred))

print("Random Forest: ")
print(classification_report(y_val, y_rf_pred))

Logistic Regression: 
              precision    recall  f1-score   support

       <=50K       0.94      0.80      0.86      3708
        >50K       0.57      0.84      0.68      1176

    accuracy                           0.81      4884
   macro avg       0.75      0.82      0.77      4884
weighted avg       0.85      0.81      0.82      4884

Random Forest: 
              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      3708
        >50K       0.72      0.60      0.65      1176

    accuracy                           0.85      4884
   macro avg       0.80      0.76      0.78      4884
weighted avg       0.84      0.85      0.84      4884



# Cross Validation

In [59]:
from sklearn.model_selection import StratifiedKFold, cross_validate
import numpy as np

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "logistic regression": log_res_pipeline,
    "random forest": rf_pipeline
}

for name, model in models.items():
    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=cv,
        scoring=['accuracy', 'f1_macro'],
        return_train_score=True
    )

    print(f"{name} mean training accuracy: ", np.mean(scores['train_accuracy']))
    print(f"{name} mean validation accuracy: ", np.mean(scores['test_accuracy']))
    print(f"{name} mean training F1: ", np.mean(scores['train_f1_macro']))
    print(f"{name} mean training F1: ", np.mean(scores['test_f1_macro']))

logistic regression mean training accuracy:  0.8151653987958305
logistic regression mean validation accuracy:  0.8139699080679899
logistic regression mean training F1:  0.7791624108132615
logistic regression mean training F1:  0.7776315643555515
random forest mean training accuracy:  0.9999780629593069
random forest mean validation accuracy:  0.8544221773305983
random forest mean training F1:  0.9999700022764229
random forest mean training F1:  0.7885223434310572
