LDA model to predict whether a customer will purchase a car or not.

In [1]:
# Step 1 - Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Step 2 - Loading the dataset
df = pd.read_csv("D:\\ishita\\college_py\\car_data.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


In [3]:
# Step 3 - Adding a categorical column for 'Purchased'
df['Purchased_status'] = df['Purchased'].map({0: 'Not Purchased', 1: 'Purchased'})
df.head()

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased,Purchased_status
0,385,Male,35,20000,0,Not Purchased
1,681,Male,40,43500,0,Not Purchased
2,353,Male,49,74000,0,Not Purchased
3,895,Male,40,107500,1,Purchased
4,661,Male,25,79000,0,Not Purchased


In [4]:
# Step 4 - Checking the number of rows and columns
num_rows, num_columns = df.shape  
print(f"Number of rows: {num_rows}\nNumber of columns: {num_columns}")

Number of rows: 1000
Number of columns: 6


In [5]:
# Step 5 - Checking for missing values
df.isnull().sum()

User ID             0
Gender              0
Age                 0
AnnualSalary        0
Purchased           0
Purchased_status    0
dtype: int64

In [6]:
# Step 6 - Defining Independent Variables
features = df[['Gender', 'Age', 'AnnualSalary']]
features.head()

Unnamed: 0,Gender,Age,AnnualSalary
0,Male,35,20000
1,Male,40,43500
2,Male,49,74000
3,Male,40,107500
4,Male,25,79000


In [7]:
# Step 7 - Gender dummy variable - Salary has all text data. It needs to be converted to numbers and we will use dummy variable for that.
dummy_df = pd.get_dummies(features, columns = ["Gender"])
dummy_df.head()

Unnamed: 0,Age,AnnualSalary,Gender_Female,Gender_Male
0,35,20000,False,True
1,40,43500,False,True
2,49,74000,False,True
3,40,107500,False,True
4,25,79000,False,True


In [8]:
# Step 8 - Creating X and y dataframes
X = dummy_df
y = df['Purchased']

In [9]:
# Step 9 - Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
x_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [10]:
# Step 10 - Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3, random_state=42)

In [11]:
# Step 11 - Training LDA model for making predictions
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
y_pred = lda.predict(x_test)

In [12]:
# Step 12 - Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", conf_matrix)
print("\nAccuracy:", round(accuracy * 100, 2), "%")


Confusion Matrix:
 [[158  14]
 [ 41  87]]

Accuracy: 81.67 %


In [13]:
# Step 13 - Report actual vs. predicted counts
actual_counts = y_test.value_counts().to_dict()
predicted_counts = pd.Series(y_pred).value_counts().to_dict()

print("\nNumber of actual purchased cars:", actual_counts.get(1, 0))
print("Number of actual unpurchased cars:", actual_counts.get(0, 0))
print("Number of predicted purchased cars:", predicted_counts.get(1, 0))
print("Number of predicted unpurchased cars:", predicted_counts.get(0, 0))


Number of actual purchased cars: 128
Number of actual unpurchased cars: 172
Number of predicted purchased cars: 101
Number of predicted unpurchased cars: 199
