# Machine Learning Model Building Steps:-

## Campus Placement Prediction

# Step-1.Data Preprocessing:


In [25]:
import pandas as pd
import numpy as np

### 1.1.Load the Dataset

In [26]:
campus_df = pd.read_csv(r'https://raw.githubusercontent.com/ArchanaInsights/Datasets/main/campus_placement.csv')
campus_df.head()

Unnamed: 0,StudentID,CGPA,Internships,Projects,Workshops/Certifications,AptitudeTestScore,SoftSkillsRating,ExtracurricularActivities,PlacementTraining,SSC_Marks,HSC_Marks,PlacementStatus
0,1,7.5,1,1,1,65,4.4,No,No,61,79,NotPlaced
1,2,8.9,0,3,2,90,4.0,Yes,Yes,78,82,Placed
2,3,7.3,1,2,2,82,4.8,Yes,No,79,80,NotPlaced
3,4,7.5,1,1,2,85,4.4,Yes,Yes,81,80,Placed
4,5,8.3,1,2,2,86,4.5,Yes,Yes,74,88,Placed


In [27]:
campus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  10000 non-null  int64  
 1   CGPA                       10000 non-null  float64
 2   Internships                10000 non-null  int64  
 3   Projects                   10000 non-null  int64  
 4   Workshops/Certifications   10000 non-null  int64  
 5   AptitudeTestScore          10000 non-null  int64  
 6   SoftSkillsRating           10000 non-null  float64
 7   ExtracurricularActivities  10000 non-null  object 
 8   PlacementTraining          10000 non-null  object 
 9   SSC_Marks                  10000 non-null  int64  
 10  HSC_Marks                  10000 non-null  int64  
 11  PlacementStatus            10000 non-null  object 
dtypes: float64(2), int64(7), object(3)
memory usage: 937.6+ KB


In [28]:
campus_df.describe()

Unnamed: 0,StudentID,CGPA,Internships,Projects,Workshops/Certifications,AptitudeTestScore,SoftSkillsRating,SSC_Marks,HSC_Marks
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,7.69801,1.0492,2.0266,1.0132,79.4499,4.32396,69.1594,74.5015
std,2886.89568,0.640131,0.665901,0.867968,0.904272,8.159997,0.411622,10.430459,8.919527
min,1.0,6.5,0.0,0.0,0.0,60.0,3.0,55.0,57.0
25%,2500.75,7.4,1.0,1.0,0.0,73.0,4.0,59.0,67.0
50%,5000.5,7.7,1.0,2.0,1.0,80.0,4.4,70.0,73.0
75%,7500.25,8.2,1.0,3.0,2.0,87.0,4.7,78.0,83.0
max,10000.0,9.1,2.0,3.0,3.0,90.0,4.8,90.0,88.0


### 1.2.Handle Missing Values

In [29]:
campus_df.isnull().sum()

StudentID                    0
CGPA                         0
Internships                  0
Projects                     0
Workshops/Certifications     0
AptitudeTestScore            0
SoftSkillsRating             0
ExtracurricularActivities    0
PlacementTraining            0
SSC_Marks                    0
HSC_Marks                    0
PlacementStatus              0
dtype: int64

- There is No null and Missing values in the dataset

### 1.3.Encode Categorical Features:

In [30]:
campus_df = pd.get_dummies(campus_df, drop_first=True)
campus_df

Unnamed: 0,StudentID,CGPA,Internships,Projects,Workshops/Certifications,AptitudeTestScore,SoftSkillsRating,SSC_Marks,HSC_Marks,ExtracurricularActivities_Yes,PlacementTraining_Yes,PlacementStatus_Placed
0,1,7.5,1,1,1,65,4.4,61,79,False,False,False
1,2,8.9,0,3,2,90,4.0,78,82,True,True,True
2,3,7.3,1,2,2,82,4.8,79,80,True,False,False
3,4,7.5,1,1,2,85,4.4,81,80,True,True,True
4,5,8.3,1,2,2,86,4.5,74,88,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7.5,1,1,2,72,3.9,85,66,True,False,False
9996,9997,7.4,0,1,0,90,4.8,84,67,False,False,True
9997,9998,8.4,1,3,0,70,4.8,79,81,True,True,True
9998,9999,8.9,0,3,2,87,4.8,71,85,True,True,True


### 1.4.Feature Selection: 

In [31]:
campus_df.columns # Target Feature (PlacementStatus_Placed)

Index(['StudentID', 'CGPA', 'Internships', 'Projects',
       'Workshops/Certifications', 'AptitudeTestScore', 'SoftSkillsRating',
       'SSC_Marks', 'HSC_Marks', 'ExtracurricularActivities_Yes',
       'PlacementTraining_Yes', 'PlacementStatus_Placed'],
      dtype='object')

In [32]:
X=campus_df.drop("PlacementStatus_Placed",axis=1)
y=campus_df["PlacementStatus_Placed"]

### 1.5.Data Splitting: 

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=45)

### 1.6.Feature Scaling: 

In [35]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2.Logistic Regression - Model Building and Evaluation:

### 2.1.Model Building: 

In [36]:
# Experiment with different values for the max_iter 
from sklearn.linear_model import LogisticRegression
LR_model=LogisticRegression(max_iter=300)
LR_model.fit(X_train_scaled,y_train)

### 2.2.Model Evaluation

In [37]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

LR_y_pred = LR_model.predict(X_test_scaled)
LR_acc = accuracy_score(y_test, LR_y_pred)
print(f"Logistic_acurracy Score : {LR_acc}")
LR_cn=confusion_matrix(y_test, LR_y_pred)
print(LR_cn)
LR_cr=classification_report(y_test, LR_y_pred)
print(LR_cr)

Logistic_acurracy Score : 0.8035
[[976 206]
 [187 631]]
              precision    recall  f1-score   support

       False       0.84      0.83      0.83      1182
        True       0.75      0.77      0.76       818

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000



# Step:3.Decision Tree - Model Building and Evaluation:

### 3.1.Model Building: Build a Decision Tree model 

In [38]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
D_Tree=DecisionTreeClassifier(max_depth=3,random_state=45)
D_Tree.fit(X_train_scaled,y_train)

### 3.2.Model Evaluation:

In [39]:
#Model Evaluation
DT_y_pred = D_Tree.predict(X_test_scaled)
DT_acc = accuracy_score(y_test, DT_y_pred)
print(f"DecisionTree_acurracy Score : {DT_acc}")
DT_cn=confusion_matrix(y_test, DT_y_pred)
print(DT_cn)
DT_cr=classification_report(y_test, DT_y_pred)
print(DT_cr)

DecisionTree_acurracy Score : 0.7795
[[898 284]
 [157 661]]
              precision    recall  f1-score   support

       False       0.85      0.76      0.80      1182
        True       0.70      0.81      0.75       818

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.79      0.78      0.78      2000



# Step:4.K-Nearest Neighbors (KNN) - Model Building and Evaluation:

### 4.1.Model Building: 

In [40]:
from sklearn.neighbors import KNeighborsClassifier

k_values = list(range(1, 11))
accuracy_scores = []

for k in k_values:
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(X_train_scaled, y_train)
    KNN_y_pred = KNN.predict(X_test_scaled)
    KNN_acc = accuracy_score(y_test, KNN_y_pred)
    accuracy_scores.append(KNN_acc)
    print(f"k={k}  Accuracy: {KNN_acc:.4f}")

# Best k value
import numpy as np
best_k = k_values[np.argmax(accuracy_scores)]
print(f"Best k value: {best_k} with Accuracy: {max(accuracy_scores):.4f}")

k=1  Accuracy: 0.7250
k=2  Accuracy: 0.7300
k=3  Accuracy: 0.7610
k=4  Accuracy: 0.7625
k=5  Accuracy: 0.7740
k=6  Accuracy: 0.7775
k=7  Accuracy: 0.7805
k=8  Accuracy: 0.7795
k=9  Accuracy: 0.7820
k=10  Accuracy: 0.7835
Best k value: 10 with Accuracy: 0.7835


### 4.2.	Model Evaluation: 

In [41]:
KNN = KNeighborsClassifier(n_neighbors=10)
KNN.fit(X_train_scaled, y_train)
KNN_y_pred = KNN.predict(X_test_scaled)
KNN_acc = accuracy_score(y_test, KNN_y_pred)
print(f"k={k} Accuracy: {KNN_acc:.4f}")

k=10 Accuracy: 0.7835


### 4.3.	Reporting

- As the value of "k" increases, the model’s accuracy generally improves.
- Lower values of "k" (like 1 or 2) may result in overfitting, while higher values help smooth out predictions.
- The optimal "k" is 10, as it gives the highest accuracy score of 0.7835.

# Step:5.	Comparison and Analysis:

### 5.1.	Compare the accuracy scores of the three models

- Logistic Regression achieved the highest accuracy at 80.35%
- KNN followed closely with 78.35%
- Decision Tree had the lowest accuracy among the three at 77.95%

### 5.2.	Discuss which model performed best and why, based on the accuracy scores and other relevant factors.

- Logistic Regression achieved the highest accuracy of 0.8035, slightly outperforming both KNN (0.7835) and Decision Tree (0.7795).
- Based on the accuracy scores, the Logistic Regression model performed the best.
- The classification report for Logistic Regression shows a good balance between precision, recall, and f1-score for both classes:

       False class: Precision = 0.84, Recall = 0.83, F1 = 0.83

       True class: Precision = 0.75, Recall = 0.77, F1 = 0.76

- This means the model performs consistently well without heavily favoring one class over the other.

### 5.3.	Reflect on the strengths and weaknesses of each model in the context of this dataset.

#### Logistic Regression  
- Strengths:     
     *Simple and easy to interpret, with clear understanding of feature influence.*    
     *Performs well on linearly separable datasets*
- Weaknesses:       
     *Assumes linear relationships between features and target, which may not fully capture complex patterns.*

#### Decision Tree (with max_depth=3)
- Strengths:       
     *Easy to visualize and interpret the decision-making process.*         
     *Handles both numerical and categorical data without the need for scaling.*      
- Weaknesses:           
     *Prone to overfitting, especially with deep trees (controlled here using max_depth=3).*     
     *Slightly lower overall accuracy (77.95%) compared to other models.*

####  K-Nearest Neighbors (k = 10)
- Strengths:        
     *Non-parametric: makes no assumptions about the underlying data distribution.*        
     *Scaling helped to reduce sensitivity to feature ranges.*
- Weaknesses:           
     *Computationally intensive on large datasets, as it stores all training data.*
     *Sensitive to noisy or irrelevant features, though this was managed by scaling the data.*