In [1]:
import os

import numpy as np
import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [3]:
# Location of the training CSV. Set the TRAIN_CSV_PATH environment variable to
# point at the file on another machine; when it is unset, the original local
# path is used, so existing behaviour is unchanged.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CSV_PATH",
    r"C:\Users\khushi pawar\Downloads\train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Remove columns that the later preprocessing cells never select.
df = df.drop(
    columns=[
        'PassengerId',
        'SibSp',
        'Parch',
        'Ticket',
        'Cabin',
    ]
)

In [6]:
# Count missing values in each remaining column.
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
Fare          0
Embarked      2
dtype: int64

In [7]:
# Target vector: the 'Survived' column.
Y = df['Survived']
# Feature frame: every column except the target.
X = df.drop(labels='Survived', axis='columns')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Numeric features: fill gaps with the column median, then standardise.
# Step names 'trf1' / 'trf2' are referenced by the grid-search parameter keys.
num_col = ['Age', 'Fare']
num_trf = Pipeline([
    ('trf1', SimpleImputer(strategy='median')),
    ('trf2', StandardScaler()),
])

In [10]:
# Categorical features: fill gaps with the most frequent category, then one-hot encode.
# Step names 'trf3' / 'trf4' are referenced by the grid-search parameter keys.
cat_col = ['Embarked', 'Sex']
cat_trf = Pipeline(steps=[
    # The strategy must be spelled 'most_frequent' (single underscore);
    # SimpleImputer rejects any other spelling when the pipeline is fitted.
    ('trf3', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories not seen during fit do not raise at transform time.
    ('trf4', OneHotEncoder(handle_unknown='ignore')),
])

In [27]:
# Route each column group through its own preprocessing pipeline.
# remainder='drop' (the default) discards every column not listed in num_col or cat_col.
col_trf = ColumnTransformer(
    transformers=[
        ('num_trf', num_trf, num_col),
        ('cat_trf', cat_trf, cat_col),
    ],
    remainder='drop',
)

In [28]:
# End-to-end model: column preprocessing followed by a logistic-regression classifier.
clf = Pipeline([
    ('col_trf', col_trf),
    ('classifier', LogisticRegression()),
])

In [29]:
# Hyper-parameter grid; each key is the <step>__...__<param> path inside `clf`.
param_grid = {
    # numeric imputer
    'col_trf__num_trf__trf1__strategy': ['mean', 'median'],
    # categorical imputer
    'col_trf__cat_trf__trf3__strategy': ['most_frequent', 'constant'],
    # candidate values for the classifier's C parameter
    'classifier__C': [0.1, 1.0, 10, 100],
}

# Search over every grid combination using 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [30]:
# Run the grid search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'classifier__C': [0.1, 1.0, ...], 'col_trf__cat_trf__trf3__strategy': ['most_frequent', 'constant'], 'col_trf__num_trf__trf1__strategy': ['mean', 'median']}"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num_trf', ...), ('cat_trf', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [31]:
# Report the parameter combination selected by the grid search.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
print("Best params:")
print(grid_search.best_params_)

Best params:
{'classifier__C': 0.1, 'col_trf__cat_trf__trf3__strategy': 'most_frequent', 'col_trf__num_trf__trf1__strategy': 'mean'}


In [32]:
# Cross-validated score of the best parameter combination, to three decimals.
cv_score_msg = f"Internal CV score: {grid_search.best_score_:.3f}"
print(cv_score_msg)

Internal CV score: 0.784


In [34]:
# Tabulate every grid-search candidate, highest mean test score first,
# and show only the searched parameters alongside that score.
summary_cols = [
    'param_classifier__C',
    'param_col_trf__cat_trf__trf3__strategy',
    'param_col_trf__num_trf__trf1__strategy',
    'mean_test_score',
]
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
cv_results[summary_cols]


Unnamed: 0,param_classifier__C,param_col_trf__cat_trf__trf3__strategy,param_col_trf__num_trf__trf1__strategy,mean_test_score
0,0.1,most_frequent,mean,0.783725
1,0.1,most_frequent,median,0.783725
2,0.1,constant,mean,0.783725
3,0.1,constant,median,0.783725
4,1.0,most_frequent,mean,0.782316
5,1.0,most_frequent,median,0.782316
6,1.0,constant,mean,0.782316
7,1.0,constant,median,0.782316
8,10.0,most_frequent,mean,0.782316
9,10.0,most_frequent,median,0.782316
