Linear Discriminant Analysis (LDA) Model for Predicting Program Choice During Admissions

In [1]:
# Step 1 - Importing the necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Step 2 - Loading the dataset
# The CSV location defaults to the original local path, but it can be
# overridden by setting the HSB2_CSV_PATH environment variable, so the
# notebook can be re-run on another machine without editing this cell.
DATA_PATH = os.environ.get("HSB2_CSV_PATH", "D:\\ishita\\college_py\\hsb2.csv")
df = pd.read_csv(DATA_PATH)
df.head()  # preview the first five rows

Unnamed: 0,id,Gender,race,ses,schtyp,prog,read,write,math,science,socst
0,70,0,4,1,1,1,57,52,41,47,57
1,121,1,4,2,1,3,68,59,53,63,61
2,86,0,4,3,1,1,44,33,54,58,31
3,141,0,4,3,1,3,63,44,47,53,56
4,172,0,4,2,1,2,47,52,57,53,61


In [3]:
# Step 3 - Adding a readable label column for 'prog'
# Each numeric program code is translated through the lookup below; a code
# that is not a key of the lookup would come out as NaN.
# NOTE(review): the code-to-name pairing is taken as given - confirm it
# against the dataset's data dictionary.
program_labels = {1: 'BBA', 2: 'MBA', 3: 'B tech'}
df['prog_status'] = df['prog'].map(program_labels)
df.head()

Unnamed: 0,id,Gender,race,ses,schtyp,prog,read,write,math,science,socst,prog_status
0,70,0,4,1,1,1,57,52,41,47,57,BBA
1,121,1,4,2,1,3,68,59,53,63,61,B tech
2,86,0,4,3,1,1,44,33,54,58,31,BBA
3,141,0,4,3,1,3,63,44,47,53,56,B tech
4,172,0,4,2,1,2,47,52,57,53,61,MBA


In [4]:
# Step 4 - Checking the number of rows and columns
# Row count comes from the index length, column count from the column labels.
num_rows = len(df.index)
num_columns = len(df.columns)
print(f"Number of rows: {num_rows}\nNumber of columns: {num_columns}")

Number of rows: 200
Number of columns: 12


In [5]:
# Step 5 - Checking for missing values
# Per-column count of missing entries; a column of zeros means nothing needs
# to be imputed or dropped.
df.isna().sum()

id             0
Gender         0
race           0
ses            0
schtyp         0
prog           0
read           0
write          0
math           0
science        0
socst          0
prog_status    0
dtype: int64

In [6]:
# Step 6 - Creating X and y dataframes
# Predictors: the five score columns. Target: the numeric program code.
score_columns = ['read', 'write', 'math', 'science', 'socst']
X = df[score_columns]
y = df['prog']

In [7]:
# Step 7 - Standardizing the features
# With its default settings StandardScaler centres each column and scales it
# to unit variance.
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test split has been made.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
# Re-wrap the scaled values in a DataFrame so the original column names are kept.
x_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)

In [8]:
# Step 8 - Splitting the data, then standardizing without test-set leakage
# Split the RAW features first so the held-out rows never influence the
# scaling statistics. test_size and random_state are unchanged, so the same
# rows land in the train and test partitions as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply that one fitted
# transformation to both partitions. `scaler` is rebound here so that any
# later use of it reflects training-set statistics only.
scaler = StandardScaler().fit(x_train_raw)
x_train = pd.DataFrame(
    scaler.transform(x_train_raw), columns=X.columns, index=x_train_raw.index
)
x_test = pd.DataFrame(
    scaler.transform(x_test_raw), columns=X.columns, index=x_test_raw.index
)

In [9]:
# Step 9 - Training LDA model for making predictions
# fit() returns the estimator itself, so construction and training are chained.
lda = LinearDiscriminantAnalysis().fit(x_train, y_train)
# Predicted program code for every row of the held-out partition.
y_pred = lda.predict(x_test)

In [10]:
# Step 10 - Evaluate the model
# Fraction of test rows whose predicted code equals the actual code.
accuracy = accuracy_score(y_test, y_pred)
# Rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy_percent = round(accuracy * 100, 2)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_percent, "%")


Confusion Matrix:
 [[ 1  8  7]
 [ 0 18  9]
 [ 1  5 11]]

Accuracy: 50.0 %


In [11]:
# Step 11: Display actual vs predicted choices with program names
# Lookup from numeric program code to its display name.
prog_mapping = {1: 'BBA', 2: 'MBA', 3: 'B tech'}

# Translate both code sequences to names. Taking .values drops the pandas
# index, so the two columns line up by position rather than by label.
actual_names = y_test.map(prog_mapping).values
predicted_names = pd.Series(y_pred).map(prog_mapping).values

comparison_df = pd.DataFrame(
    {'Actual Program': actual_names, 'Predicted Program': predicted_names}
)

# Only the first ten comparisons are shown.
print("\nActual vs. Predicted Program Choices:\n", comparison_df.head(10))


Actual vs. Predicted Program Choices:
   Actual Program Predicted Program
0            BBA               MBA
1            BBA            B tech
2            BBA            B tech
3            BBA            B tech
4         B tech               MBA
5         B tech            B tech
6         B tech            B tech
7            BBA               BBA
8            BBA               MBA
9            BBA               MBA
