#### To create a machine learning model for predicting whether an individual is likely to have or use a bank account based on the given features, we can follow the following steps:

* 1.Load the necessary libraries and the dataset.
* 2.Preprocess the data by encoding categorical variables, scaling numerical variables, and handling missing values.
* 3.Split the dataset into training and testing sets.
* 4.Train the model using the training set.
* 5.Evaluate the model using the testing set.
* 6.Tune the hyperparameters of the model to improve its performance.
* 7.Save the model for future use.

In [33]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')
print('Load Libraries-Done')

Load Libraries-Done


In [34]:
# Load the dataset
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [35]:
# Preprocess the data
# Encode categorical variables
cat_cols = ['country', 'bank_account', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head',
            'marital_status', 'education_level', 'job_type']
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])


In [36]:
# Scale numerical variables
num_cols = ['year', 'household_size', 'age_of_respondent']
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

In [37]:
# Handle missing values
data = data.dropna()
data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,1.208541,uniqueid_1,1,0,1,-0.358007,-0.896188,0,5,2,3,9
1,0,1.208541,uniqueid_2,0,0,0,0.539834,1.888279,0,1,4,0,4
2,0,1.208541,uniqueid_3,1,1,1,0.539834,-0.775124,1,3,3,5,9
3,0,1.208541,uniqueid_4,0,0,1,0.539834,-0.290869,0,1,2,2,3
4,0,1.208541,uniqueid_5,0,1,0,1.886596,-0.775124,1,0,3,2,5


In [38]:
# Split the dataset into training and testing sets
X = data.drop(['uniqueid', 'bank_account'], axis=1)
y = data['bank_account']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [39]:
#6. Apply algorithm 
# Models to be tested: L ogistic Regression, KNearest, SVM, Decision Tree, Random Forest, XGBoost, LightGBM

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [40]:
classifiers = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighbors" : KNeighborsClassifier(),
    "SVC" : SVC(),
    "DecisionTree" : DecisionTreeClassifier(),
    "RandomForest" : RandomForestClassifier()
}

In [41]:
#Compute the training score of each models

train_scores = []
test_scores = []

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    train_score = round(classifier.score(X_train, y_train),2)
    train_scores.append(train_score)
    test_score = round(classifier.score(X_test, y_test),2)
    test_scores.append(test_score)

print(train_scores)
print(test_scores)

[0.87, 0.9, 0.88, 0.98, 0.98]
[0.88, 0.88, 0.88, 0.83, 0.87]


In [42]:
from sklearn.model_selection import cross_val_score

train_cross_scores = []
test_cross_scores = []

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    train_score = cross_val_score(classifier, X_train, y_train, cv=5)
    train_cross_scores.append(round(train_score.mean(),2))
    test_score = cross_val_score(classifier, X_test, y_test, cv=5)
    test_cross_scores.append(round(test_score.mean(),2))
    
print(train_cross_scores)
print(test_cross_scores)

[0.87, 0.87, 0.87, 0.83, 0.87]
[0.88, 0.87, 0.88, 0.84, 0.87]


In [43]:
# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [44]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8649759138566167


In [45]:
# Train the model
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)

KNeighborsClassifier()

In [46]:
# Evaluate the model
y_pred = kn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8763105695664494


In [None]:
# hyperparameter tuning of the best model
log_reg = LogisticRegression() 
params = {"penalty": ['l2'], "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                  "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']} 
grid_search = GridSearchCV(log_reg, params)
grid_search.fit(x_train, y_train)

grid_search.best_estimator_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", "auto"]}

# Instantiate RandomizedSearchCV model
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                    random_state=42),
                              param_distributions=rf_grid,
                              n_iter=2,
                              cv=2,
                              verbose=True)

# Fit the RandomizedSearchCV model
rs_model.fit(X_train, y_train)

In [47]:
# hyperparameter tuning
# Define the hyperparameters to tune using GridSearch
param_grid = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']}

In [48]:
# Define the GridSearch object with 5-fold cross-validation
grid_search = GridSearchCV(kn, param_grid, cv=5)

In [50]:
# Fit the GridSearch object to the data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']})

In [51]:
# Print the best hyperparameters and the corresponding accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

Best hyperparameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best accuracy: 0.8802379649865374


In [56]:
knn_tuned = KNeighborsClassifier({'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'})

In [57]:
knn_tuned.fit(X_train, y_train)

TypeError: '>=' not supported between instances of 'dict' and 'int'

In [22]:
# Save the model
joblib.dump(kn, 'bank_account_model.pkl')


['bank_account_model.pkl']

In [27]:
# Load the test set
test_set = pd.read_csv('test.csv')

In [28]:
# Preprocess the test set
# Encode categorical variables
cat_cols = ['country', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head',
            'marital_status', 'education_level', 'job_type']
for col in cat_cols:
    le = LabelEncoder()
    test_set[col] = le.fit_transform(test_set[col])

In [29]:
# Scale numerical variables
num_cols = ['year', 'household_size', 'age_of_respondent']
test_set[num_cols] = scaler.transform(test_set[num_cols])


In [30]:
# Make predictions
X_test = test_set.drop(['uniqueid'], axis=1)
y_pred = kn.predict(X_test)

In [31]:
# Generate the submission file
submission = pd.DataFrame({'uniqueid': test_set['uniqueid'], 'bank_account': y_pred})
submission.to_csv('submission.csv', index=False)

In [32]:
submission.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056,1
1,uniqueid_6060,1
2,uniqueid_6065,0
3,uniqueid_6072,0
4,uniqueid_6073,0


In [58]:
# Make predictions
X_test = test_set.drop(['uniqueid'], axis=1)
y_pred = rf.predict(X_test)

In [59]:
# Generate the submission file
submission2 = pd.DataFrame({'uniqueid': test_set['uniqueid'], 'bank_account': y_pred})
submission2.to_csv('submission_2.csv', index=False)

In [60]:
submission2.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056,1
1,uniqueid_6060,1
2,uniqueid_6065,0
3,uniqueid_6072,0
4,uniqueid_6073,0
