In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as st
import warnings 
warnings.filterwarnings("ignore")
sns.set(rc={"figure.figsize":(15,6)})
pd.pandas.set_option("display.max_columns",None)

In [3]:
data=pd.read_csv("C:\\Users\\Vozon Comsof Pvt Ltd\\ML-E2E-PipelineProject\\ML_Pipeline_Project\\ML_Pipeline_Project_E-2-E\\Machine_Learning_project.egg-info\\Data\\cleandata.csv")

In [4]:
data

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
0,90,3,9,6,9,1,4,0,0,4356,40,0
1,82,3,9,6,3,1,4,0,0,4356,18,0
2,66,3,10,6,9,4,2,0,0,4356,40,0
3,54,3,4,0,6,4,4,0,0,3900,40,0
4,41,3,10,5,9,3,4,0,0,3900,40,0
...,...,...,...,...,...,...,...,...,...,...,...,...
32532,22,3,10,4,10,1,4,1,0,0,40,0
32533,27,3,12,2,12,5,4,0,0,0,38,0
32534,40,3,9,2,6,0,4,1,0,0,40,1
32535,58,3,9,6,0,4,4,0,0,0,40,0


In [5]:
data.shape

(32537, 12)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
#pipline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [7]:
# Saprate dependent and indipendent features
x = data.drop("income",axis=1)
y = data["income"]

In [9]:
categorical_features = x.select_dtypes(include="object").columns
numerical_features = x.select_dtypes(exclude="object").columns
print(categorical_features)
print(numerical_features)

Index([], dtype='object')
Index(['age', 'workclass', 'education_num', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


In [10]:
num_pipline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]

) # to standardize the data values into a standard format.

cato_pipline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("scaler",StandardScaler())
    ]

)

# apply transform to particular columns. 
preprocessor = ColumnTransformer([
    ("num_pipline",num_pipline,numerical_features)
])

In [11]:
# Train test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26029, 11)
(6508, 11)
(26029,)
(6508,)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,classification_report,ConfusionMatrixDisplay

In [16]:
def performance_metrices(test,predict):
    confusionmetrix = print(f"CONFUSION_METRIX: {confusion_matrix(test,predict)}")
    
    precision = print(f"PRECISION_SCORE: {precision_score(test,predict)}")
    
    recall = print(f"RECALL_SCORE: {recall_score(test,predict)}")
    
    f1score = print(f"F1_SCORE: {f1_score(test,predict)}")
    
    accuracyscore = print(f"ACCURACY_SCORE: {accuracy_score(test,predict)}")
    
    return confusionmetrix,precision,recall,f1score,accuracyscore
    

# USING HYPERPARAMETER TUNING ON ALGORITHMS

In [17]:
lr = LogisticRegression()

In [18]:
param = {
    "class_weight":["balanced"],
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
    
}

In [20]:
grid = GridSearchCV(estimator=lr,param_grid=param,cv=5,n_jobs=-1, verbose=3)

In [21]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [22]:
grid.best_params_

{'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}

In [23]:
grid.score(X_train,y_train)

0.7794383187982635

In [24]:
y_prad = grid.predict(X_test)

In [25]:
accuracy_score(y_test,y_prad)

0.7811923786109404

In [26]:
dr = DecisionTreeClassifier()

In [27]:
param_grid = {
    "class_weight":["balanced"],
    "criterion":['gini',"entropy","log_loss"],
    "splitter":['best','random'],
    "max_depth":[3,4,5,6],
    "min_samples_split":[2,3,4,5],
    "min_samples_leaf":[1,2,3],
    "max_features":["auto","sqrt","log2"]
}

In [28]:
grid_search = GridSearchCV(estimator=dr,param_grid=param_grid,cv=5,scoring="accuracy")

In [29]:
grid_search.fit(X_train,y_train)

In [30]:
grid_search.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'splitter': 'best'}

In [31]:
grid_search.score(X_train,y_train)

0.7360252026585731

In [32]:
y_prad = grid_search.predict(X_test)

In [33]:
accuracy_score(y_test,y_prad)

0.7241856177012908

In [34]:
print(classification_report(y_test,y_prad))

              precision    recall  f1-score   support

           0       0.96      0.67      0.79      4988
           1       0.45      0.91      0.61      1520

    accuracy                           0.72      6508
   macro avg       0.71      0.79      0.70      6508
weighted avg       0.84      0.72      0.75      6508



In [35]:
rf = RandomForestClassifier()

In [36]:
param_grid = {
    "class_weight":["balanced"],
    'n_estimators': [20, 50, 30],
    'max_depth': [10, 8, 5],
    'min_samples_split': [2, 5, 10],
}

In [37]:
grid_search = GridSearchCV(estimator=rf,param_grid=param_grid,cv=5,scoring="accuracy")

In [38]:
grid_search.fit(X_train,y_train)

In [39]:
grid_search.best_params_

{'class_weight': 'balanced',
 'max_depth': 10,
 'min_samples_split': 5,
 'n_estimators': 30}

In [40]:
grid_search.score(X_train,y_train)

0.8212378500902839

In [41]:
y_pred = grid_search.predict(X_test)

In [42]:
accuracy_score(y_test,y_pred)

0.8048555623847572

In [43]:
print(classification_report(y_test,y_prad))

              precision    recall  f1-score   support

           0       0.96      0.67      0.79      4988
           1       0.45      0.91      0.61      1520

    accuracy                           0.72      6508
   macro avg       0.71      0.79      0.70      6508
weighted avg       0.84      0.72      0.75      6508

