<a href="https://colab.research.google.com/github/Joyjit-22/agent/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
file_path = "bike_buyers_cleaned.csv"
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000.0,1.0,Bachelors,Skilled Manual,Yes,0.0,0-1 Miles,Europe,42.0,No
1,24107,Married,Male,30000.0,3.0,Partial College,Clerical,Yes,1.0,0-1 Miles,Europe,43.0,No
2,14177,Married,Male,80000.0,5.0,Partial College,Professional,No,2.0,2-5 Miles,Europe,60.0,No
3,25597,Single,Male,30000.0,0.0,Bachelors,Clerical,No,0.0,0-1 Miles,Europe,36.0,Yes
4,13507,Married,Female,10000.0,2.0,Partial College,Manual,Yes,0.0,1-2 Miles,Europe,50.0,No


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                952 non-null    int64  
 1   Marital Status    952 non-null    object 
 2   Gender            952 non-null    object 
 3   Income            952 non-null    float64
 4   Children          952 non-null    float64
 5   Education         952 non-null    object 
 6   Occupation        952 non-null    object 
 7   Home Owner        952 non-null    object 
 8   Cars              952 non-null    float64
 9   Commute Distance  952 non-null    object 
 10  Region            952 non-null    object 
 11  Age               952 non-null    float64
 12  Purchased Bike    952 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 96.8+ KB


In [6]:
df.drop(columns=['ID'], inplace=True)

In [8]:
for i in df.columns:
  print(i)
  print(df[i].unique())

Marital Status
['Married' 'Single']
Gender
['Female' 'Male']
Income
[ 40000.  30000.  80000.  10000.  90000.  60000.  20000.  70000. 170000.
 130000. 120000. 150000. 100000.  50000. 110000. 160000.]
Children
[1. 3. 5. 0. 2. 4.]
Education
['Bachelors' 'Partial College' 'High School' 'Partial High School'
 'Graduate Degree']
Occupation
['Skilled Manual' 'Clerical' 'Professional' 'Manual' 'Management']
Home Owner
['Yes' 'No']
Cars
[0. 1. 2. 4. 3.]
Commute Distance
['0-1 Miles' '2-5 Miles' '1-2 Miles' '10+ Miles' '5-10 Miles']
Region
['Europe' 'Pacific' 'North America']
Age
[42. 43. 60. 36. 50. 54. 35. 45. 38. 59. 47. 55. 56. 34. 63. 29. 44. 32.
 26. 31. 62. 41. 30. 28. 40. 65. 48. 66. 46. 52. 61. 37. 68. 33. 51. 49.
 53. 39. 27. 25. 58. 67. 57. 70. 78. 69. 64. 89. 80. 73. 71. 72.]
Purchased Bike
['No' 'Yes']


In [9]:
num_cols = ['Income', 'Children', 'Cars', 'Age']
cat_cols = ['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner', 'Commute Distance', 'Region']

In [10]:
df['Purchased Bike'] = df['Purchased Bike'].map({'Yes': 1, 'No': 0})

In [12]:
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(cat_cols))

scaler = StandardScaler()
scaled_nums = scaler.fit_transform(df[num_cols])
scaled_df = pd.DataFrame(scaled_nums, columns=num_cols)

In [13]:
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df['Purchased Bike']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# logistic regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
log_acc = accuracy_score(y_test, y_pred_log)

In [16]:
# random forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rf)

In [17]:
# svm
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, y_pred_svm)

In [26]:
# xgboost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
gxgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_xgb = grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}


Parameters: { "use_label_encoder" } are not used.



In [28]:
accuracy_results = {
    "Logistic Regression": log_acc,
    "Random Forest": rf_acc,
    "SVM": svm_acc,
    "XGBoost": xgb_acc
}
print("\nModel Accuracies:")
for model, acc in accuracy_results.items():
    print(f"{model}: {acc:.4f}")


Model Accuracies:
Logistic Regression: 0.6440
Random Forest: 0.7173
SVM: 0.7120
XGBoost: 0.7277


In [33]:
feature_names = X.columns
xgb_feature_importances = xgb_model.feature_importances_
xgb_weights_df = pd.DataFrame({"Feature": feature_names, "Importance": xgb_feature_importances})
xgb_weights_df.sort_values(by="Importance", ascending=False, inplace=True)
print("\nFeature Importances (Descending Order) - XGBoost:")
print(xgb_weights_df.to_string(index=False))


Feature Importances (Descending Order) - XGBoost:
                      Feature  Importance
               Region_Pacific    0.098464
                         Cars    0.079164
    Education_Graduate Degree    0.058457
   Commute Distance_10+ Miles    0.058417
   Commute Distance_2-5 Miles    0.055989
        Education_High School    0.051167
    Occupation_Skilled Manual    0.050783
   Commute Distance_1-2 Miles    0.049049
                     Children    0.046498
        Marital Status_Single    0.045794
                          Age    0.045745
  Commute Distance_5-10 Miles    0.044385
                       Income    0.042093
Education_Partial High School    0.039389
      Occupation_Professional    0.039297
         Region_North America    0.038941
            Occupation_Manual    0.038546
               Home Owner_Yes    0.036691
    Education_Partial College    0.033972
                  Gender_Male    0.029211
        Occupation_Management    0.017948


In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [32]:
# deep learning model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test), verbose=1)

y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
nn_acc = accuracy_score(y_test, y_pred_nn)

print(f"\nDeep Learning Model Accuracy: {nn_acc:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.4936 - loss: 0.7077 - val_accuracy: 0.6021 - val_loss: 0.6801
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5703 - loss: 0.6772 - val_accuracy: 0.6440 - val_loss: 0.6725
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6400 - loss: 0.6606 - val_accuracy: 0.6073 - val_loss: 0.6592
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6666 - loss: 0.6332 - val_accuracy: 0.6440 - val_loss: 0.6475
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6720 - loss: 0.6344 - val_accuracy: 0.6702 - val_loss: 0.6386
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6673 - loss: 0.6338 - val_accuracy: 0.6754 - val_loss: 0.6370
Epoch 7/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━

In [35]:
model.summary()