In [108]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read Excel file
data = pd.read_excel('bike_buyers.xlsx')

# Drop the ID column
data = data.drop('ID', axis=1)
data

Unnamed: 0,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,M,F,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,No
1,M,M,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,No
2,M,M,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,No
3,S,M,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Yes
4,S,M,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
1021,S,F,20000,0,Partial High School,Manual,No,2,0-1 Miles,Europe,32,Yes
1022,M,F,20000,2,Partial College,Manual,Yes,0,0-1 Miles,Europe,63,No
1023,M,M,10000,0,Partial College,Manual,No,1,0-1 Miles,Pacific,26,Yes
1024,S,F,20000,0,High School,Manual,No,1,5-10 Miles,Europe,31,No


In [109]:
# Seperate target variable from the features 
X = data.drop("Purchased Bike", axis = 1)
y = data["Purchased Bike"]

# Perform one hot encoding on the features
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 2)
# Split the training set further into training and validation sets
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Standardize features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


# Train on Logistic Regression model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make preditions
y_pred = model.predict(X_test)

# Evualuate the model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

display("Accuracy:", accuracy)
display(report)



'Accuracy:'

0.6699029126213593

'              precision    recall  f1-score   support\n\n           0       0.65      0.76      0.70       104\n           1       0.70      0.58      0.63       102\n\n    accuracy                           0.67       206\n   macro avg       0.67      0.67      0.67       206\nweighted avg       0.67      0.67      0.67       206\n'

In [110]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

#create pipline with model selection
pipeline = Pipeline([('model', None)])

# Define hyperparameter grid for a model (e.g., Logistic Regression)
param_grid =[ 
    {
        'model' : [LogisticRegression()],
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Adjust the regularization parameter
        'model__solver': ['liblinear', 'lbfgs', 'newton-cg']  # Different solvers
},
{
        'model': [RandomForestClassifier()],
        'model__n_estimators': [10, 50, 100, 200],
        'model__max_depth': [None, 10, 20, 30]
    },
    {
        'model' : [XGBClassifier()],
        'model__learning_rate' : [0.01, 0.1, 0.2],
        'model__max_depth' : [ 3, 4 , 5], 
        'model__n_estimators' : [50, 100, 200]
    },

    {
        'model': [DecisionTreeClassifier()],  
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }
]



# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to your training data
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
validation_accuracy = best_model.score(X_validation, y_validation)

# Evaluate the best model on the test set
test_accuracy = best_model.score(X_test, y_test)

print("Best Model Validation Accuracy:", validation_accuracy)
print("Best Model Test Accuracy:", test_accuracy)


Best Model Validation Accuracy: 0.524390243902439
Best Model Test Accuracy: 0.6747572815533981


