In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform,loguniform,randint


from sklearn.metrics import accuracy_score,confusion_matrix 
from sklearn.feature_extraction import grid_to_graph
from sklearn.model_selection import train_test_split

# Data Exploration

In [161]:
df=pd.read_csv('Insurance_Train.csv')
df_test=pd.read_csv('Insurance_Test.csv')
df_train=df # training Dataset
df_train.head()

Unnamed: 0,id,age,job,marital,education,balance,housing,loan,contact,month,duration,campaign,pdays,previous,poutcome,y
0,98749,32,admin.,single,secondary,64,yes,no,unknown,may,202,2,-1,0,unknown,0
1,19550,45,blue-collar,married,secondary,534,no,no,cellular,aug,104,6,-1,0,unknown,0
2,75084,45,technician,married,secondary,1477,yes,no,cellular,nov,75,1,132,1,failure,0
3,65715,39,technician,married,tertiary,14,no,no,cellular,jan,114,2,-1,0,unknown,0
4,41412,49,blue-collar,single,unknown,2222,no,no,unknown,jun,114,2,-1,0,unknown,0


In [162]:
df_train.isna().sum() #training dataset

id           0
age          0
job          0
marital      0
education    0
balance      0
housing      0
loan         0
contact      0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# summary of the numerical features

In [163]:
df_train.describe() #training Dataset

Unnamed: 0,id,age,balance,duration,campaign,pdays,previous,y
count,40689.0,40689.0,40689.0,40689.0,40689.0,40689.0,40689.0,40689.0
mean,54899.530438,40.944555,1362.242621,257.943744,2.764457,40.340141,0.582344,0.116985
std,25930.988287,10.625554,3044.786399,257.888397,3.100915,100.247601,2.340269,0.321406
min,10000.0,18.0,-8019.0,0.0,1.0,-1.0,0.0,0.0
25%,32429.0,33.0,73.0,103.0,1.0,-1.0,0.0,0.0
50%,54845.0,39.0,450.0,180.0,2.0,-1.0,0.0,0.0
75%,77361.0,48.0,1430.0,319.0,3.0,-1.0,0.0,0.0
max,99998.0,95.0,102127.0,4918.0,63.0,871.0,275.0,1.0


 # checking the info of datasets

In [164]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40689 entries, 0 to 40688
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         40689 non-null  int64 
 1   age        40689 non-null  int64 
 2   job        40689 non-null  object
 3   marital    40689 non-null  object
 4   education  40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   month      40689 non-null  object
 10  duration   40689 non-null  int64 
 11  campaign   40689 non-null  int64 
 12  pdays      40689 non-null  int64 
 13  previous   40689 non-null  int64 
 14  poutcome   40689 non-null  object
 15  y          40689 non-null  int64 
dtypes: int64(8), object(8)
memory usage: 5.0+ MB


In [165]:
df_train_upd=df_train
df_test_upd=df_test

In [166]:
# Train Data Encoding
#1
education_map = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df_train_upd['education'] = df_train['education'].map(education_map)
#2
housing_map = {'no': 0, 'yes': 1}
df_train_upd['housing'] = df_train['housing'].map(housing_map)
#3
job_map = {'admin.':8 ,'blue-collar':4, 'entrepreneur':9, 'housemaid':0, 'management':10, 'retired':3,
 'self-employed':7, 'services':6, 'student':1, 'technician':5, 'unemployed':2, 'unknown':4}
df_train_upd['job'] = df_train['job'].map(job_map)

# Test Data encoding
#1
education_map = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df_test_upd['education'] = df_test['education'].map(education_map)
#2
housing_map = {'no': 0, 'yes': 1}
df_test_upd['housing'] = df_test['housing'].map(housing_map)
#3

job_map = {'admin.':8 ,'blue-collar':4, 'entrepreneur':9, 'housemaid':0, 'management':10, 'retired':3,
 'self-employed':7, 'services':6, 'student':1, 'technician':5, 'unemployed':2, 'unknown':4}
df_test_upd['job'] = df_test['job'].map(job_map)

# convert categorical to dummies variable

In [167]:
# train dataset preprocessing

#df_train_upd=df_train_upd.drop('month',axis=1)
df_train_upd=df_train_upd.drop('id',axis=1) 
df_train_upd=df_train_upd.drop('contact',axis=1)

# test dataset preprocessing

#df_test_upd=df_test_upd.drop('month',axis=1)
df_test_upd=df_test_upd.drop('id',axis=1)
df_test_upd=df_test_upd.drop('contact',axis=1)

In [168]:
# train dataset preprocessing

df_train_upd=df_train_upd.drop('pdays',axis=1)
df_train_upd=df_train_upd.drop('previous',axis=1)
df_train_upd=df_train_upd.drop('age',axis=1)
df_train_dummies=pd.get_dummies(df_train_upd)

# test dataset preprocessing

df_test_upd=df_test_upd.drop('pdays',axis=1)
df_test_upd=df_test_upd.drop('previous',axis=1)
df_test_upd=df_test_upd.drop('age',axis=1)
df_test_dummies=pd.get_dummies(df_test_upd)

#saving converted dummies in csv format
df_train_dummies.to_csv('dummies_insurance_train.csv')
df_test_dummies.to_csv('dummies_insurance_test.csv')

In [169]:
df_train_dummies.corr()

Unnamed: 0,job,education,balance,housing,duration,campaign,y,marital_divorced,marital_married,marital_single,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
job,1.0,0.454578,0.035405,0.02208,-0.014566,0.013941,0.00082,0.020972,-0.041826,0.030588,...,-0.039327,0.002781,-0.035402,0.06916,-0.010538,-0.001967,0.015392,0.004124,0.000343,-0.01467
education,0.454578,1.0,0.049862,-0.039294,0.003655,-0.001541,0.053136,0.006134,-0.126864,0.133552,...,-0.065282,0.023366,-0.082046,0.057322,0.013492,0.010294,0.018615,0.010134,0.037691,-0.03772
balance,0.035405,0.049862,1.0,-0.069109,0.024491,-0.014454,0.056482,-0.023348,0.02643,-0.012166,...,0.029875,0.022864,-0.070361,0.116226,0.044607,0.024244,0.009764,0.010368,0.036505,-0.030152
housing,0.02208,-0.039294,-0.069109,1.0,0.002099,-0.024963,-0.141179,0.001243,0.0178,-0.020231,...,-0.103807,-0.066422,0.428748,-0.00104,-0.085738,-0.074987,0.110992,0.039428,-0.091481,-0.066969
duration,-0.014566,0.003655,0.024491,0.002099,1.0,-0.085787,0.394121,0.005378,-0.019908,0.017825,...,-0.020223,-0.003561,0.005846,-0.006279,0.014723,0.016101,-0.017753,-0.002439,0.043938,-0.004906
campaign,0.013941,-0.001541,-0.014454,-0.024963,-0.085787,1.0,-0.074147,-0.019212,0.031216,-0.020303,...,0.044402,-0.017377,-0.067862,-0.085269,-0.052,-0.035936,-0.088622,-0.02213,-0.057886,0.109671
y,0.00082,0.053136,0.056482,-0.141179,0.394121,-0.074147,1.0,0.000927,-0.057049,0.061355,...,-0.015139,0.128353,-0.103527,-0.015597,0.132329,0.120122,0.008335,0.026792,0.309403,-0.164443
marital_divorced,0.020972,0.006134,-0.023348,0.001243,0.005378,-0.019212,0.000927,1.0,-0.444064,-0.226709,...,0.012577,-0.003971,0.01045,0.010547,-0.002933,-0.009018,-0.000934,0.00264,-0.004996,0.001734
marital_married,-0.041826,-0.126864,0.02643,0.0178,-0.019908,0.031216,-0.057049,-0.444064,1.0,-0.771993,...,0.014617,-0.015779,-0.039641,0.021796,-0.010733,-0.009094,-0.003152,-0.027902,-0.017789,0.025043
marital_single,0.030588,0.133552,-0.012166,-0.020231,0.017825,-0.020303,0.061355,-0.226709,-0.771993,1.0,...,-0.024811,0.019969,0.035677,-0.031175,0.013748,0.016282,0.00409,0.028457,0.022881,-0.028452


In [170]:
df_train_dummies.columns

Index(['job', 'education', 'balance', 'housing', 'duration', 'campaign', 'y',
       'marital_divorced', 'marital_married', 'marital_single', 'loan_no',
       'loan_yes', 'month_apr', 'month_aug', 'month_dec', 'month_feb',
       'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown'],
      dtype='object')

# Removing Outliers

In [171]:
from sklearn.ensemble import IsolationForest

# Identify and remove outliers using Isolation Forest on train data
clf = IsolationForest(max_samples='auto', contamination=0.05, random_state=42)
clf.fit(df_train_dummies)
y_pred_train = clf.predict(df_train_dummies)
df_train_clean = df_train_dummies[y_pred_train == 1]



In [172]:
#pip install imblearn

In [173]:
df_train_dummies.head()

Unnamed: 0,job,education,balance,housing,duration,campaign,y,marital_divorced,marital_married,marital_single,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,8,2,64,1,202,2,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
1,4,2,534,0,104,6,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,5,2,1477,1,75,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
3,5,3,14,0,114,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,4,0,2222,0,114,2,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


# Undersampling the Majority Class

In [174]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class using RandomUnderSampler on train data
X_train = df_train_dummies.drop('y', axis=1)
y_train = df_train_dummies['y']
rus = RandomUnderSampler(random_state=42)
X_resampled_train, y_resampled_train = rus.fit_resample(X_train, y_train)

df_train_resampled = pd.concat([X_resampled_train, y_resampled_train], axis=1) #Create new DataFrame with resampled data

# Note: undersampling of test data will lead us as to introduces bias into evaluation, so no needed to undersample test data

In [175]:
train_data=df_train_resampled

# Data Preprocessing 

In [176]:
train_data, val_data = train_test_split(df_train_dummies,train_size=0.8)
y_train=train_data['y']
y_val=val_data['y']

X_train=train_data.drop('y',axis=1)
X_val=val_data.drop('y',axis=1)

In [177]:
from sklearn.preprocessing import StandardScaler    
sc= StandardScaler()    
X_train_sc= sc.fit_transform(X_train)    
X_val_sc= sc.transform(X_val) 
X_test_sc=sc.transform(df_test_dummies)

# Logistic Regression

In [178]:
#lr=LogisticRegression(random_state=42)
lr=LogisticRegression(random_state=17, C=0.0008, max_iter=5000, penalty='l2', solver='liblinear')
#lr=LogisticRegression(random_state=10,C= 0.008353664363501644, max_iter=100, penalty='l2', solver= 'liblinear')
#lr=LogisticRegression( C = 0.01, solver = 'sag', max_iter=5000,random_state=10) 
lr_model=lr.fit(X_train_sc,y_train)

val_pred=lr_model.predict(X_val_sc)

print('Logistic Regr accuracy: ',accuracy_score(y_val,val_pred),'\n')
print('confusion matrix \n',confusion_matrix(y_val,val_pred))

Logistic Regr accuracy:  0.9005898255099533 

confusion matrix 
 [[7016  161]
 [ 648  313]]


# Testing our model

In [179]:
log_test_pred=lr_model.predict(X_test_sc)

# Support Vector Machine Model

In [180]:
svm_model=SVC(random_state=10,kernel='poly',shrinking=True)
svm_model.fit(X_train_sc,y_train)
svm_val_pred=svm_model.predict(X_val_sc)
print('SVM accuracy: ',accuracy_score(y_val,svm_val_pred),'\n')
print('confusionmatrix: \n',confusion_matrix(y_val,svm_val_pred))

SVM accuracy:  0.8983779798476285 

confusionmatrix: 
 [[7032  145]
 [ 682  279]]


In [181]:
svm_test_pred=svm_model.predict(X_test_sc)

# Decision Tree Model

In [182]:
#DT_model=DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=42)
DT_model=DecisionTreeClassifier(max_depth=4, min_samples_leaf=8, min_samples_split=4,random_state=10)
DT_model.fit(X_train,y_train)
DT_model.fit(X_train_sc,y_train)
dt_val_pred=DT_model.predict(X_val_sc)
print('decision tree accuracy: ',accuracy_score(y_val,dt_val_pred),'\n')
print('confusion matrix: \n',confusion_matrix(y_val,dt_val_pred))

decision tree accuracy:  0.9004669451953797 

confusion matrix: 
 [[7032  145]
 [ 665  296]]


In [183]:
dt_test_pred=DT_model.predict(X_test_sc)

# Random Forest Model

In [184]:
rf=RandomForestClassifier(random_state=10,criterion='entropy',max_depth=17)
#DecisionTreeClassifier(max_depth=4, min_samples_leaf=8, min_samples_split=4, random_state=42)
rf_model=rf.fit(X_train_sc,y_train)
rf_model
rf_val_pred=rf_model.predict(X_val_sc)
print('random forest accuracy: ',accuracy_score(y_val,rf_val_pred),'\n')

random forest accuracy:  0.901327107397395 



In [185]:
rf_test_pred=rf_model.predict(X_test_sc)

In [186]:
df_subm = pd.DataFrame({
    'id': df_test['id'],
    'y': log_test_pred
})
df_subm.to_csv('lst_lg_submission.csv', index=False) # Save the DataFrame to a CSV file

# Logistic Regression with Adaboost and Random Search

In [187]:
param_distributions = {'penalty': ['l2', 'none'],
                       'C': loguniform(1e-5, 100),
                       'max_iter': [100, 1000, 10000],
                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

#lr = LogisticRegression(random_state=42)
rs_cv = RandomizedSearchCV(lr, param_distributions=param_distributions, cv=5, n_iter=1000, n_jobs=-1, random_state=42, verbose=2)
rs_cv.fit(X_train_sc, y_train)

print('Best parameters:', rs_cv.best_params_)
print('Best score:', rs_cv.best_score_)

# Create a LogisticRegression model with the best hyperparameters
lr_best = LogisticRegression(random_state=42, C=0.014249805847974575, max_iter=1000, penalty='l2', solver='liblinear')

# Create an AdaBoost classifier object with the best LogisticRegression model as the base estimator
ada_lr = AdaBoostClassifier(base_estimator=lr_best, random_state=42, n_estimators=100)
ada_lr_model = ada_lr.fit(X_train_sc, y_train)

# Print the trained model
print(ada_lr_model)

# Make predictions on the validation set
y_pred_lr = ada_lr_model.predict(X_val_sc)
# Calculate the accuracy of the predictions
lr_adb_accuracy = accuracy_score(y_val, y_pred_lr)

# Print the accuracy
print("Logistic Adaboost Accuracy:", lr_adb_accuracy)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


460 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
460 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\KIIT\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\KIIT\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\KIIT\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 78, in _check_solver
    raise ValueError("penalty='none' is no

Best parameters: {'C': 0.004576374558527471, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.9018156582939753
AdaBoostClassifier(base_estimator=LogisticRegression(C=0.014249805847974575,
                                                     max_iter=1000,
                                                     random_state=42,
                                                     solver='liblinear'),
                   n_estimators=100, random_state=42)
Logistic Adaboost Accuracy: 0.8983779798476285


In [188]:
# SVM with Adaboost and Random Search
param_distributions = {'C': loguniform(1e-5, 100),
                       'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                       'degree': np.arange(1, 6),
                       'gamma': ['scale', 'auto', 'log-uniform'],
                       'max_iter': [-1],
                       'probability': [True]}
svm = SVC(random_state=42)
rs_cv = RandomizedSearchCV(svm, param_distributions=param_distributions, cv=5, n_iter=1000, n_jobs=-1, random_state=42, verbose=2)

# SVM with Adaboost and Random Search

In [None]:
# Define the parameter space for random search
param_dist = {'C': randint(1, 100),
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'] + list(randint(1, 10).rvs(8)),
              'degree': [2, 3, 4]}

# Create an SVM model
#svm = SVC()

# Create a RandomizedSearchCV object and fit it to the data
rs = RandomizedSearchCV(svm, param_distributions=param_dist, cv=5, n_iter=20, n_jobs=-1)
rs.fit(X_train_sc, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", rs.best_params_)
print("Best score: ", rs.best_score_)

In [None]:
# Create an SVC model with the best hyperparameters
svm_best = SVC(random_state=42, C=2.362656815911559, degree=5, gamma=0.01603579156728081, kernel='rbf', max_iter=-1, probability=True)

# Create an AdaBoost classifier object with the best SVM model as the base estimator
ada_svm = AdaBoostClassifier(base_estimator=svm_best, random_state=42, n_estimators=100)
ada_svm_model = ada_svm.fit(X_train_sc, y_train)

# Print the trained model
print(ada_svm_model)

# Make predictions on the validation set
y_pred_svm = ada_svm_model.predict(X_val_sc)
# Calculate the accuracy of the predictions
svm_adb_accuracy = accuracy_score(y_val, y_pred_svm)

# Print the accuracy
print("SVM Adaboost Accuracy:", svm_adb_accuracy)

# Decision Tree with Adaboost and Random search

In [2]:
# Define the hyperparameter distribution for the Decision Tree classifier
param_distributions = {'max_depth': randint(1, 50),
                       'min_samples_split': randint(2, 10),
                       'min_samples_leaf': randint(1, 10),
                       'criterion': ['gini', 'entropy']}

# Create a Decision Tree classifier object
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8, min_samples_split=4,random_state=10)

# Use RandomizedSearchCV to find the best hyperparameters
rs_cv = RandomizedSearchCV(DT_model, param_distributions=param_distributions, cv=5, n_iter=100, n_jobs=-1, random_state=42, verbose=2)
rs_cv.fit(X_train_sc, y_train)

# Print the best hyperparameters and the best score
print('Best parameters:', rs_cv.best_params_)
print('Best score:', rs_cv.best_score_)

# Create a Decision Tree model with the best hyperparameters
dt_best = DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=10, min_samples_leaf=4, min_samples_split=8)

# Create an AdaBoost classifier object with the best Decision Tree model as the base estimator
ada = AdaBoostClassifier(base_estimator=dt_best, random_state=42, n_estimators=100)
adb_model = ada.fit(X_train_sc, y_train)

# Print the trained model
print(adb_model)

# Make predictions on the validation set
y_pred = adb_model.predict(X_val_sc)

# Calculate the accuracy of the predictions
dt_adb_accuracy = accuracy_score(y_val, y_pred)

# Print the accuracy
print("Decision Tree Adaboost Accuracy:", dt_adb_accuracy)

NameError: name 'randint' is not defined

# Random Forest with Adaboost and Random search

In [3]:
# Define the hyperparameter distribution for the Random Forest classifier
param_distributions = {'n_estimators': [100, 200, 500, 1000],
                       'max_depth': randint(1, 50),
                       'min_samples_split': randint(2, 10),
                       'min_samples_leaf': randint(1, 10),
                       'criterion': ['gini', 'entropy']}

# Create a Random Forest classifier object

rf = RandomForestClassifier(random_state=10,criterion='entropy',max_depth=17)

# Use RandomizedSearchCV to find the best hyperparameters
rs_cv = RandomizedSearchCV(rf, param_distributions=param_distributions, cv=5, n_iter=100, n_jobs=-1, random_state=42, verbose=2)
rs_cv.fit(X_train_sc, y_train)

# Print the best hyperparameters and the best score
print('Best parameters:', rs_cv.best_params_)
print('Best score:', rs_cv.best_score_)

# Create a Random Forest model with the best hyperparameters
rf_best = RandomForestClassifier(random_state=42, criterion='entropy', max_depth=25, min_samples_leaf=1, min_samples_split=7, n_estimators=500)

# Create an AdaBoost classifier object with the best Random Forest model as the base estimator
ada = AdaBoostClassifier(base_estimator=rf_best, random_state=42, n_estimators=100)
adb_model = ada.fit(X_train_sc, y_train)

# Print the trained model
print(adb_model)

# Make predictions on the validation set
y_pred = adb_model.predict(X_val_sc)

# Calculate the accuracy of the predictions
rf_adb_accuracy=accuracy_score(y_val,y_pred)


NameError: name 'randint' is not defined

# Cross Validation

In [4]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
# Use cross-validation to estimate the performance of the models
cv_results_lr = cross_validate(lr_model, X_train_sc, y_train, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
cv_results_svm = cross_validate(svm_model, X_train_sc, y_train, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])

# Print the average scores of each model
print('Logistic Regression: Accuracy=%.3f, Precision=%.3f, Recall=%.3f, F1=%.3f' % 
      (cv_results_lr['test_accuracy'].mean(), cv_results_lr['test_precision'].mean(),
       cv_results_lr['test_recall'].mean(), cv_results_lr['test_f1'].mean()))
print('SVM: Accuracy=%.3f, Precision=%.3f, Recall=%.3f, F1=%.3f' % 
      (cv_results_svm['test_accuracy'].mean(), cv_results_svm['test_precision'].mean(),
       cv_results_svm['test_recall'].mean(), cv_results_svm['test_f1'].mean()))

NameError: name 'lr_model' is not defined

# NN Models

In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

# Build the neural network model
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_sc, y_train, epochs=50, batch_size=32, validation_split=0.2)


In [8]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Define a function to create the Keras model
def create_model(hidden_layers=2, units=32, activation='relu'):
    model = Sequential()
    model.add(Dense(units, input_dim=X_train.shape[1], activation=activation))
    
    for _ in range(hidden_layers-1):
        model.add(Dense(units, activation=activation))
    
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create a KerasClassifier wrapper for use with scikit-learn
model = KerasClassifier(build_fn=create_model, verbose=0)

# Define the hyperparameter grid to search over
param_grid = {
    'hidden_layers': [1, 2, 3],
    'units': [16, 32, 64],
    'activation': ['relu', 'sigmoid']
}

# Perform a grid search over the hyperparameter grid
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_result = grid.fit(X_train_sc, y_train)

# Print the best hyperparameters and corresponding accuracy
print(f'Best score: {grid_result.best_score_} using {grid_result.best_params_}')

  model = KerasClassifier(build_fn=create_model, verbose=0)


NameError: name 'X_train_sc' is not defined

In [None]:
# Make predictions on new data
predictions = model.predict(X_val_sc)# Evaluate the model
loss, accuracy = model.evaluate(X_val_sc, y_val)
print(f'Validation accuracy: {accuracy}')

In [None]:
import numpy as np

# Convert probabilities to binary labels using a threshold of 0.3
predictions_binary = np.where(predictions >= 0.3, 1, 0)
np.unique(predictions_binary)

In [None]:
cnn_pred_test=model.predict(X_test_sc)

In [None]:
cnn_pred_test=model.predict(X_test_sc)
cnn_pred_test=np.where(cnn_pred_test >= 0.3, 1, 0)

In [None]:
''''# prediction on test data
cnn_pred_test=model.predict(X_test_sc)
cnn_pred_test=np.where(cnn_pred_test >= 0.3, 1, 0)
cnn_pred_labels = np.argmax(cnn_pred_test, axis=1)
# save result
df_subm = pd.DataFrame({
    'id': df_test['id'],
    'y': cnn_pred_labels
})
df_subm.to_csv('cnn_submission.csv', index=False) # Save the DataFrame to a CSV file'''