This is a binary classification model built for Logistical Regression.
We are testing age versus annual income to depict two groups of spenders (high and low), 
then using a Train-Test-Split to randomly select a percentage of the data to train our
model with, and the rest of the data to test the model. After we have trained the model 
and tested it, we created evaluation statistics to score the trained model. 

In [131]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score
#this is for the training of the model 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report, r2_score
#to test accuracy
from sklearn.linear_model import LogisticRegression
#this is for binary classification


#reading data
data = pd.read_csv("C:\\Users\\nicke\\Downloads\\Mall_Customers.csv").dropna()

Splitting Data

In [134]:
#train test split for the dataset
x = data["Age"]
y = data["Annual Income (k$)"]
data["Spender Category"] = pd.cut(
    data["Spending Score (1-100)"], 
    bins=[0, 50, 100], 
    labels=["Low", "High"]
)
#for gender, it is two features. male 0 male 1, female 0, female 1
if 'Gender' in data.columns:
    data["Gender Male Num"] = data["Gender"] == "Male"
    data["Gender Female Num"] = data["Gender"] == "Female"
else:
    print("Gender column not found!")

#split age into three features
if 'Age' in data.columns:
    data['Age 1-25'] = (data['Age'] >= 1) & (data['Age'] <= 25)
    data['Age 26-50'] = (data['Age'] >= 26) & (data['Age'] <= 50)
    data['Age 51-100'] = (data['Age'] >= 51) & (data['Age'] <= 100)
else:
    print("Age column not found!")

#drop the columns not in use
data["Spender Category Num"] = data["Spender Category"].map({"Low": 0, "High": 1})

#data["Gender Male Num"] = data["Gender"] == "Male"
#data["Gender Female Num"] = data["Gender"] == "Female" #seperate to two columns 
#data = data.copy().drop(['CustomerID', 'Gender'], axis=1)
data = data[data['Annual Income (k$)'] < 130]

#train, test = train_test_split(data, train_size = 0.7)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2) #random_state = 100 is the seed

Selecting Features (training and testing)

In [176]:
#select features (training)
train_x = data[['Annual Income (k$)', 'Age']]

train_y = data[['Spender Category Num']].values.ravel()

#select features (testing)
test_x = data[['Annual Income (k$)', 'Age 1-25', 'Age 26-50', 'Age 51-100', 'Gender Male Num', 'Gender Female Num']]
test_y = data[['Spender Category Num']].values.ravel()
#this is repeated below....


#scale the data (numeric)
columns = ["Annual Income (k$)"]
data[columns] = (data[columns] - data[columns].min()) / (data[columns].max() - data[columns].min())

#scale the data
age_columns = ["Age 1-25", "Age 26-50", "Age 51-100"]
data[age_columns] = data[age_columns].astype(float) #convert to 1 or 0

#min-max scaling
data[age_columns] = (data[age_columns] - data[age_columns].min()) / (data[age_columns].max() - data[age_columns].min())

model = LogisticRegression(solver = 'liblinear')
#model can also be a different type. 

model_1 = KFold(n_splits = 20, shuffle = True, random_state = None)
#model_1.fit(train_x, train_y)

scores = cross_val_score(model, test_x, test_y, cv=model_1, scoring='accuracy')

#new
X = data[['Annual Income (k$)', 'Age 1-25', 'Age 26-50', 'Age 51-100', "Gender Male Num", "Gender Female Num"]]
y = data[['Spender Category Num']].values.ravel()
kf = KFold(n_splits=10, shuffle=True, random_state=42)

feature_weights = np.zeros((kf.get_n_splits(), X.shape[1]))  # Store weights for each fold

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #.iloc
    model = LogisticRegression(solver = 'liblinear')
    model.fit(X_train, y_train)

    feature_weights[i, :] = model.coef_
    print("Coefficients: ", model.coef_)

# Compute average weights across folds
average_weights = np.mean(feature_weights, axis=0)

# Store in a DataFrame
weights_df = pd.DataFrame({"Feature": X.columns, "Average Weight": average_weights})
print(weights_df)

#-----------------------


# Print the scores for each fold
print("Scores for each fold:", scores)

# Print the average accuracy
print("Average accuracy:", scores.mean())



Coefficients:  [[-0.04936696  0.47101288  0.1317454  -0.6079525   0.11718956 -0.12238378]]
Coefficients:  [[-0.20890901  0.55540132  0.09820327 -0.71468255  0.12416908 -0.18524703]]
Coefficients:  [[-0.14570802  0.34649288  0.18659167 -0.59017568  0.11251294 -0.16960407]]
Coefficients:  [[-0.19236843  0.56835046  0.16865154 -0.75731005  0.08589518 -0.10620323]]
Coefficients:  [[-0.02314511  0.4488639   0.1837713  -0.65534435  0.06212484 -0.08483399]]
Coefficients:  [[-5.71649704e-04  5.16447123e-01  1.26567736e-01 -6.96713291e-01
  -4.90985738e-02 -4.59985951e-03]]
Coefficients:  [[ 0.11183945  0.40629316  0.22739112 -0.75705268  0.05330065 -0.17666905]]
Coefficients:  [[-0.06780785  0.46969986  0.16097952 -0.68175801 -0.00285195 -0.04822668]]
Coefficients:  [[ 0.14948212  0.42106068  0.07187212 -0.60243388  0.04202726 -0.15152835]]
Coefficients:  [[-0.27515478  0.47009815  0.19129868 -0.65733961  0.07226648 -0.06820925]]
              Feature  Average Weight
0  Annual Income (k$)     

Evauluation of the Model

In [103]:
#evaluation of the model 
predicted = model_1.predict(test_x)
cmatrix = confusion_matrix(test_y, predicted)
TN, FP, FN, TP = cmatrix.ravel()
accuracy = (TP + TN)/(TP + TN + FP + FN)
precision = (TP)/(TP + FP)
recall = (TP)/(TP + FN)
F1 = 2 * (precision * recall)/(precision + recall)
r2 = 1
print("accuracy: ", accuracy)
print("precision: ", precision)
print("recall: ", recall)
print("F1 Score: ", F1)
print("R^2 Score: ", r2, ". This is used for Linear Regression.")
print("Confusion Matrix: ")

print(cmatrix)


#visualization
# apply normalization techniques 
'''
for column in data.columns: 
    data[column] = (data[column] - data[column].min()) / (data[column].max() - data[column].min())     
  
# view normalized data 
data.plot(kind = 'bar')
print(data)
'''
#plt.bar(['High Spenders', 'Low Spenders'], data[HighSpenders, LowSpenders]) #fix this 
#plt.title('High Low')
#plt.xlabel('Spenders')
#plt.ylabel('Amount')
#plt.show()

#print("Low Spenders: " + str(LowSpenders))
#print("High Spenders: " + str(HighSpenders))

#Computing ROC Curve

probabilities = model_1.predict_proba(test_x)[:, 1]  # Probabilities for high spending class

false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, probabilities)
roc_auc = auc(false_positive_rate, true_positive_rate)

#plot the curve
plt.figure(figsize=(8, 6))
plt.plot(false_positive_rate, true_positive_rate, color="blue", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal Line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()



#More code


#print("Prediction: " + lr.predict(data, 1))

AttributeError: 'KFold' object has no attribute 'predict'