In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import shap

In [2]:
def one_hot_fun(pandas_series):
    """Return the one-hot (indicator) encoding of a categorical Series.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column of category labels.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies``: one indicator column per distinct value.
    """
    return pd.get_dummies(pandas_series)

In [3]:
from sklearn.preprocessing import LabelEncoder
# Label encoding: In this method, each category is assigned a unique numerical value. 
def Label_Encoder(pandas_series):
    """Replace each distinct category in a Series with an integer code.

    A fresh ``LabelEncoder`` is fitted on every call, so the codes are only
    consistent within a single call.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column of category labels.

    Returns
    -------
    The encoded values as returned by ``LabelEncoder.fit_transform``.
    """
    return LabelEncoder().fit_transform(pandas_series)

In [4]:
# Target encoding: In this method, each category is replaced with the average target value 
def Target_Encoder(pandas_series, Target_series):
    """Target-encode a categorical Series against a target Series.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column to encode; its ``name`` is passed to the encoder as ``cols``.
    Target_series : pandas.Series
        Target values the encoder is fitted against.

    Returns
    -------
    The output of ``ce.TargetEncoder.fit_transform``.

    Notes
    -----
    Requires ``category_encoders`` to be imported as ``ce`` before the call.
    """
    target_encoder = ce.TargetEncoder(cols=pandas_series.name)
    return target_encoder.fit_transform(pandas_series, Target_series)

In [5]:
# Frequency encoding: In this method, each category is replaced by the frequency of that category in the dataset
def Count_Encoder(pandas_series):
    """Frequency-encode a categorical Series.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column to encode; its ``name`` is passed to the encoder as ``cols``.

    Returns
    -------
    The output of ``ce.CountEncoder.fit_transform``.

    Notes
    -----
    Requires ``category_encoders`` to be imported as ``ce`` before the call.
    """
    count_encoder = ce.CountEncoder(cols=pandas_series.name)
    return count_encoder.fit_transform(pandas_series)

In [6]:
import category_encoders as ce
# Binary encoding: In this method, each category is represented by a binary code.
# For example, "red" could be represented as 00, "green" as 01, "blue" as 10. 
def Binary_Encoded(pandas_series):
    """Binary-encode a categorical Series.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column to encode; its ``name`` is passed to the encoder as ``cols``.

    Returns
    -------
    The output of ``ce.BinaryEncoder.fit_transform``.
    """
    binary_encoder = ce.BinaryEncoder(cols=pandas_series.name)
    return binary_encoder.fit_transform(pandas_series)

In [7]:
# Load the bike-buyers dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('bike_buyers.csv')
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000.0,1.0,Bachelors,Skilled Manual,Yes,0.0,0-1 Miles,Europe,42.0,No
1,24107,Married,Male,30000.0,3.0,Partial College,Clerical,Yes,1.0,0-1 Miles,Europe,43.0,No
2,14177,Married,Male,80000.0,5.0,Partial College,Professional,No,2.0,2-5 Miles,Europe,60.0,No
3,24381,Single,,70000.0,0.0,Bachelors,Professional,Yes,1.0,5-10 Miles,Pacific,41.0,Yes
4,25597,Single,Male,30000.0,0.0,Bachelors,Clerical,No,0.0,0-1 Miles,Europe,36.0,Yes


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1000 non-null   int64  
 1   Marital Status    993 non-null    object 
 2   Gender            989 non-null    object 
 3   Income            994 non-null    float64
 4   Children          992 non-null    float64
 5   Education         1000 non-null   object 
 6   Occupation        1000 non-null   object 
 7   Home Owner        996 non-null    object 
 8   Cars              991 non-null    float64
 9   Commute Distance  1000 non-null   object 
 10  Region            1000 non-null   object 
 11  Age               992 non-null    float64
 12  Purchased Bike    1000 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 101.7+ KB


Check the class balance of the target. The counts below show the dataset is close to balanced (519 "No" vs. 481 "Yes").

In [9]:
df['Purchased Bike'].value_counts()

No     519
Yes    481
Name: Purchased Bike, dtype: int64

## Fill NaN 

Analyzing Numerical Variables

In [10]:
# Collect every column whose dtype is not object; these are treated as numerical.
numerical = []
for col in df.columns:
    if df[col].dtype != 'O':
        numerical.append(col)
print('There are {} numerical variables'.format(len(numerical)))
print('The numerical variables are :', numerical)

There are 5 numerical variables
The numerical variables are : ['ID', 'Income', 'Children', 'Cars', 'Age']


In [11]:
# check missing values in numerical variables
df[numerical].isnull().sum()

ID          0
Income      6
Children    8
Cars        9
Age         8
dtype: int64

In [12]:
# Impute missing numerical values with a per-column statistic.
# The original filled 'Income' and 'Cars' with the column itself, which is a
# no-op (NaN is replaced by the same NaN). Median is used for them here, as
# for 'Children'; 'Age' keeps the mean.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# a column selection, which is not guaranteed to modify df.
df['Income'] = df['Income'].fillna(df['Income'].median())
df['Children'] = df['Children'].fillna(df['Children'].median())
df['Cars'] = df['Cars'].fillna(df['Cars'].median())
df['Age'] = df['Age'].fillna(df['Age'].mean())

Analyzing Categorical Variables

In [13]:
# Collect every column whose dtype is object; these are treated as categorical.
categorical = []
for col in df.columns:
    if df[col].dtype == 'O':
        categorical.append(col)
print('There are {} categorical variables'.format(len(categorical)))
print('The categorical variables are :', categorical)

There are 8 categorical variables
The categorical variables are : ['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner', 'Commute Distance', 'Region', 'Purchased Bike']


In [14]:
df[categorical].isnull().sum()

Marital Status       7
Gender              11
Education            0
Occupation           0
Home Owner           4
Commute Distance     0
Region               0
Purchased Bike       0
dtype: int64

In [15]:
# Fill any remaining missing values by sampling from each column's observed
# value distribution (values are drawn with probability equal to their
# relative frequency among the non-missing rows).
# A seeded generator makes the imputation reproducible under Restart & Run All.
rng = np.random.default_rng(42)
for col in df.columns:
    missing = df[col].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute for this column.
        continue
    s = df[col].value_counts(normalize=True)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())

In [16]:
df[categorical].isnull().sum()

Marital Status      0
Gender              0
Education           0
Occupation          0
Home Owner          0
Commute Distance    0
Region              0
Purchased Bike      0
dtype: int64

In [17]:
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000.0,1.0,Bachelors,Skilled Manual,Yes,0.0,0-1 Miles,Europe,42.0,No
1,24107,Married,Male,30000.0,3.0,Partial College,Clerical,Yes,1.0,0-1 Miles,Europe,43.0,No
2,14177,Married,Male,80000.0,5.0,Partial College,Professional,No,2.0,2-5 Miles,Europe,60.0,No
3,24381,Single,Female,70000.0,0.0,Bachelors,Professional,Yes,1.0,5-10 Miles,Pacific,41.0,Yes
4,25597,Single,Male,30000.0,0.0,Bachelors,Clerical,No,0.0,0-1 Miles,Europe,36.0,Yes


## one-hot encoding

In [18]:
df['Education'].value_counts()

Bachelors              306
Partial College        265
High School            179
Graduate Degree        174
Partial High School     76
Name: Education, dtype: int64

In [19]:
df['Education'].head()

0          Bachelors
1    Partial College
2    Partial College
3          Bachelors
4          Bachelors
Name: Education, dtype: object

In [20]:
# Append the one-hot columns for 'Education' to df (the original column is kept).
# NOTE: this cell is not idempotent - re-running it appends the same indicator
# columns again, producing duplicate column names.
df = pd.concat([df, one_hot_fun(df['Education'])], axis=1)

In [21]:
df[['Bachelors', 'Graduate Degree', 'High School', 'Partial College', 'Partial High School']].head()

Unnamed: 0,Bachelors,Graduate Degree,High School,Partial College,Partial High School
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,0,1,0
3,1,0,0,0,0
4,1,0,0,0,0


## Label Encoder

In [None]:
df['Education'].value_counts()

In [None]:
df['Education'] = Label_Encoder(df['Education'])

In [None]:
# Distribution of 'Education' after label encoding.
# (Removed stray trailing characters that made this cell a SyntaxError.)
df['Education'].value_counts()

## Target Encoder

In [None]:
df['Education'].value_counts()

In [None]:
# Overwrite 'Education' with its target encoding.
# NOTE(review): the encoding target here is 'Income', not the model target
# 'Purchased Bike' - confirm this is intended for the demonstration.
df['Education'] = Target_Encoder(df['Education'], df['Income'])

In [None]:
df['Education'].value_counts()

## Count Encoder

In [None]:
df['Education'] = Count_Encoder(df['Education'])

In [None]:
df['Education'].value_counts()

## Binary Encoded

In [None]:
df = pd.concat([df, Binary_Encoded(df['Education'])], axis=1)

In [None]:
df[['Education_0','Education_1', 'Education_2']].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encoding: In this method, each category is assigned a unique numerical value. 
def Label_Encoder(pandas_series):
    """Replace each distinct category in a Series with an integer code.

    NOTE(review): this re-defines the identically named helper from an earlier
    cell of this notebook; one of the two definitions is redundant.

    Parameters
    ----------
    pandas_series : pandas.Series
        Column of category labels.

    Returns
    -------
    The encoded values as returned by ``LabelEncoder.fit_transform``.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(pandas_series)

In [None]:
# Label-encode every column listed in `categorical`, overwriting it in df.
# As printed above, that list includes the target 'Purchased Bike', so the
# target is turned into integer codes here as well.
for var in categorical:
    df[var] = Label_Encoder(df[var])

In [None]:
df.describe()

In [None]:
plt.hist(df['Marital Status'], bins = 12);

Building the model

In [None]:
# Target vector and feature matrix; the 'ID' column is excluded from the features.
y = df['Purchased Bike'].copy()
X = df.drop(['Purchased Bike', 'ID'], axis=1)
# 'Purchased Bike' was label-encoded above; LabelEncoder assigns codes in
# sorted label order, so 'No' -> 0 and 'Yes' -> 1. class_names must follow
# that order (index 0 = non-buyer, index 1 = buyer); the original list was reversed.
class_names = ['No_Biker', 'Biker']

Train/test split, stratified on `y` so that the training and test sets keep the same class proportions.

In [None]:
# Hold out 20% of the rows for evaluation; stratify=y keeps the class ratio
# the same in both splits and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
# Fit a histogram-based gradient boosting classifier with default hyperparameters.
clf = HistGradientBoostingClassifier()
clf.fit(X_train, y_train)

In [None]:
y_predict = clf.predict(X_test)

metric scoring

In [None]:
def metrics_scoring(y_test, y_pred):
    """Return weighted precision and accuracy for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(precision, accuracy)`` where precision uses ``average='weighted'``.
    """
    # The original also computed a weighted recall that was never returned or
    # used; it is dropped here. The return shape is unchanged for callers.
    precision = precision_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    return precision, accuracy

In [None]:
precision, accuracy = metrics_scoring(y_test, y_predict)
precision, accuracy

In [None]:
f1 = f1_score(y_test, y_predict, average='weighted')
f1

In [None]:
def cm_display(y_test, y_pred):
    """Plot the confusion matrix of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    matrix = confusion_matrix(y_test, y_pred)
    display = ConfusionMatrixDisplay(matrix)
    display.plot()

In [None]:
cm_display(y_test, y_predict)

finding best threshold

In [None]:
def roc_auc(X_test, y_test, clf, threshold):
    """Plot the ROC curve of ``clf`` and mark the point reached at ``threshold``.

    Parameters
    ----------
    X_test : array-like
        Feature matrix passed to ``clf.predict_proba``.
    y_test : array-like
        True binary labels; the positive class is assumed to be encoded as 1.
    clf : estimator
        Fitted classifier exposing ``predict_proba``.
    threshold : float
        Score cut-off; rows with positive-class probability >= threshold are
        counted as predicted positive.
    """
    # Probability of the positive class (second column of predict_proba).
    y_pred_proba = np.asarray(clf.predict_proba(X_test))[:, 1]

    # The curve must be built from the continuous scores. The original passed
    # the thresholded 0/1 predictions to roc_curve, which collapses the curve
    # to a single operating point while the label showed the score-based AUC.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)

    # Operating point obtained by cutting the scores at `threshold`.
    y_true = np.asarray(y_test)
    y_pred = y_pred_proba >= threshold
    is_positive = y_true == 1
    tpr_at = y_pred[is_positive].mean() if is_positive.any() else 0.0
    fpr_at = y_pred[~is_positive].mean() if (~is_positive).any() else 0.0

    # create ROC curve (drawn once; the original plotted the same line twice)
    plt.plot(fpr, tpr, label="AUC=" + str(auc))
    plt.scatter(fpr_at, tpr_at, color='black', label="threshold=" + str(threshold))
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc=4)
    plt.show()

In [None]:
def best_threshold(X_test, y_test, model=None):
    """Find the decision threshold that maximises the G-mean on the given data.

    The G-mean is ``sqrt(TPR * (1 - FPR))`` evaluated at every threshold
    returned by ``roc_curve``. The AUC and the selected threshold are printed,
    and the ROC curve is plotted with the selected point highlighted.

    Parameters
    ----------
    X_test : array-like
        Feature matrix passed to ``predict_proba``.
    y_test : array-like
        True binary labels.
    model : estimator, optional
        Fitted classifier exposing ``predict_proba``. When omitted, the
        notebook-level ``clf`` is used, as the original implementation did.

    Returns
    -------
    The threshold with the largest G-mean.

    Notes
    -----
    NOTE(review): when called with the held-out test set, the threshold is
    tuned on the same data later used for evaluation - consider a separate
    validation split.
    """
    if model is None:
        # Backward-compatible fallback to the global classifier.
        model = clf
    yhat = model.predict_proba(X_test)[:, 1]  # positive-class probabilities only
    fpr, tpr, thresholds = roc_curve(y_test, yhat)
    auc = roc_auc_score(y_test, yhat)
    print('Auc=' + str(auc))
    gmeans = np.sqrt(tpr * (1 - fpr))  # G-mean for each candidate threshold
    ix = np.argmax(gmeans)  # index of the largest G-mean
    print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
    plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')  # chance diagonal
    # Legend label fixed: the original read "Aus" instead of "AUC".
    plt.plot(fpr, tpr, marker='.', label='Gradient Boosting AUC=' + str(auc))
    plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    return thresholds[ix]

In [None]:
def change_threshold(X_test, y_test):
    """Return the threshold selected by ``best_threshold`` for the given data.

    Thin wrapper: forwards both arguments unchanged and returns the result.
    """
    return best_threshold(X_test, y_test)

In [None]:
def predict_with_new_threshold(threshold, X, clf):
    """Predict 0/1 labels by cutting positive-class probabilities at a threshold.

    Parameters
    ----------
    threshold : float
        Rows whose positive-class probability is >= threshold get label 1.
    X : array-like
        Feature matrix passed to ``clf.predict_proba``.
    clf : estimator
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the positive-class probability.

    Returns
    -------
    list of int
        One label (0 or 1) per row of ``X``.
    """
    positive_scores = clf.predict_proba(X)[:, 1]
    labels = []
    for score in positive_scores:
        labels.append(1 if score >= threshold else 0)
    return labels

In [None]:
threshold = change_threshold(X_test, y_test)

In [None]:
def xai(model, X, class_names):
    """Explain a fitted model with SHAP: a summary plot and one waterfall plot.

    Parameters
    ----------
    model : estimator
        Fitted model passed to ``shap.Explainer``.
    X : pandas.DataFrame
        Data used both to build the explainer and as the rows to explain;
        its ``values`` and ``columns`` feed the summary plot.
    class_names : list of str
        Currently unused; kept so existing callers keep working.
    """
    # Removed the unused local `row` and the commented-out plot calls that
    # were the only code referring to it.
    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)
    shap.summary_plot(shap_values, X.values, feature_names=X.columns, max_display=11)
    # Waterfall plot for the first explained row.
    shap.plots.waterfall(shap_values[0], max_display=17)

In [None]:
# Build a SHAP explainer for the fitted classifier (X is passed as the second
# argument to shap.Explainer) and compute explanations for every row of X.
explainer = shap.Explainer(clf, X)
shap_values = explainer(X)

In [None]:
shap.summary_plot(shap_values, X.values, plot_type="bar", class_names=class_names, feature_names=X.columns, max_display=15)

In [None]:
shap.summary_plot(shap_values, X.values, feature_names=X.columns, max_display=11)

In [None]:
shap.plots.waterfall(shap_values[0], max_display=17)