In [341]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
df=pd.read_csv('data/adult.csv')
(df.head(2))

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K


In [342]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', nan, ' Without-pay',
       ' Never-worked'], dtype=object)

In [335]:
#seperate independent and dependent feature
X = df.drop(labels='salary',axis=1)
y = df['salary']


In [336]:
categorical_features = X.select_dtypes(include='object').columns
numerical_features = X.select_dtypes(exclude='object').columns
print(categorical_features)
print(numerical_features)

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [331]:
from sklearn.impute import SimpleImputer #for all the missing values
from sklearn.preprocessing import StandardScaler #Feature scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder #Oridnal Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [337]:
numerical_pipeline = Pipeline([
    ('imputer',SimpleImputer(missing_values='Nan',strategy='median')),
    ('scaler',StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer',SimpleImputer(missing_values=np.nan,strategy='most_frequent')),
    ('ordinal', OrdinalEncoder()),
    #('onehotencode', OneHotEncoder(sparse=False,handle_unknown="ignore")),
    ('scaler',StandardScaler())
])

preprocessor = ColumnTransformer([  
    ('numerical_pipeline',numerical_pipeline,numerical_features),
    ('categorical_pipeline',categorical_pipeline,categorical_features)
])

In [338]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=42)

In [339]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26048, 14)
(6513, 14)
(26048,)
(6513,)


In [340]:
X_train.fillna(X_train.mean(), inplace=True)

TypeError: can only concatenate str (not "int") to str

In [324]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

ValueError: Input X contains NaN.
SimpleImputer does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [325]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
5514,33.0,Local-gov,198183.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,Female,0.0,0.0,50.0,United-States
19777,36.0,Private,86459.0,Assoc-voc,11.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,1887.0,50.0,United-States
10781,58.0,Self-emp-not-inc,203039.0,9th,5.0,Separated,Craft-repair,Not-in-family,White,Male,0.0,0.0,40.0,United-States
32240,21.0,Private,180190.0,Assoc-voc,11.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,46.0,United-States
9876,27.0,Private,279872.0,Some-college,10.0,Divorced,Other-service,Not-in-family,White,Male,0.0,0.0,40.0,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,47.0,Private,359461.0,Bachelors,13.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States
5390,31.0,Private,147215.0,12th,8.0,Divorced,Other-service,Unmarried,White,Female,0.0,0.0,21.0,United-States
860,18.0,Private,216284.0,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,20.0,United-States
15795,50.0,Self-emp-not-inc,54261.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,84.0,United-States


In [311]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sourcee.logger import logging
from sourcee.exception import CustomException
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [312]:
def evaluate_models(X_train, y_train,X_test,y_test,models,param):
        report = {}
        for i in range(len(list(models))):
            model = list(models.values())[i]
            para = param[list(models.keys())[i]]

            
            #Model Traning
            gs = GridSearchCV(estimator=model,param_grid=para,cv=5)
            gs.fit(X_train,y_train)
            model(**gs.best_params_).fit(X_train,y_train)
            model.fit(X_train,y_train)

            #make Prediction
            y_test_pred = model.predict(X_test)

            test_model_score = accuracy_score(y_test,y_test_pred)
            
            report[list(models.keys())[i]] = test_model_score

        return report

In [313]:
models = {
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Logistic":LogisticRegression()
            }

In [314]:
params = {
                "Random Forest":{
                    "class_weight":["balanced"],
                    'n_estimators': [20, 50, 30],
                    'max_depth': [10, 8, 5],
                    'min_samples_split': [2, 5, 10],
                },
                "Decision Tree":{
                    "class_weight":["balanced"],
                    "criterion":['gini',"entropy","log_loss"],
                    "splitter":['best','random'],
                    "max_depth":[3,4,5,6],
                    "min_samples_split":[2,3,4,5],
                    "min_samples_leaf":[1,2,3],
                    "max_features":["auto","sqrt","log2"]
                },
                "Logistic":{
                    "class_weight":["balanced"],
                    'penalty': ['l1', 'l2'],
                    'C': [0.001, 0.01, 0.1, 1, 10, 100],
                    'solver': ['liblinear', 'saga']
                }
            }

In [315]:
evaluate_models(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,models=models,param=params)

TypeError: '<' not supported between instances of 'str' and 'float'