In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [107]:
df=pd.read_csv("student_data.csv")

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Student ID                   200000 non-null  int64 
 1   Student Name                 200000 non-null  object
 2   Date of Birth                200000 non-null  object
 3   Field of Study               200000 non-null  object
 4   Year of Admission            200000 non-null  int64 
 5   Expected Year of Graduation  200000 non-null  int64 
 6   Current Semester             200000 non-null  int64 
 7   Specialization               200000 non-null  object
 8   Fees                         200000 non-null  int64 
 9   Discount on Fees             200000 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 15.3+ MB


In [109]:
df.isnull().sum()

Student ID                     0
Student Name                   0
Date of Birth                  0
Field of Study                 0
Year of Admission              0
Expected Year of Graduation    0
Current Semester               0
Specialization                 0
Fees                           0
Discount on Fees               0
dtype: int64

In [110]:
df["Graduation"]=df["Current Semester"]/df["Expected Year of Graduation"]
df["Fees"]=df["Year of Admission"]/df["Fees"]
df["Semester"]=df["Current Semester"]/df["Expected Year of Graduation"]

In [111]:
df.head(5)

Unnamed: 0,Student ID,Student Name,Date of Birth,Field of Study,Year of Admission,Expected Year of Graduation,Current Semester,Specialization,Fees,Discount on Fees,Graduation,Semester
0,165527,Bryan Rogers,2006-01-19,Computer Science,2020,2017,3,Web Development,0.013019,19572,0.001487,0.001487
1,635763,James Hogan,1999-05-23,Mechanical Engineering,2020,2020,2,Machine Learning,0.012795,14760,0.00099,0.00099
2,740021,David Robinson,1997-12-02,Civil Engineering,2017,2022,1,Network Security,0.036237,5871,0.000495,0.000495
3,433076,Susan Miller,1999-10-30,Computer Science,2021,2019,1,Data Science,0.014975,17284,0.000495,0.000495
4,441628,Brittany Martin,1998-01-10,Chemical Engineering,2016,2018,1,Network Security,0.016008,14871,0.000496,0.000496


In [112]:
df["Date of Birth"]=pd.to_datetime(df["Date of Birth"])

In [113]:
df["Date of Birth"]

0        2006-01-19
1        1999-05-23
2        1997-12-02
3        1999-10-30
4        1998-01-10
            ...    
199995   2001-06-06
199996   2003-12-15
199997   2000-07-27
199998   2001-02-16
199999   2005-10-17
Name: Date of Birth, Length: 200000, dtype: datetime64[ns]

In [114]:
df["Year of Birth"]=df["Date of Birth"].dt.year
df["Month of Birth"]=df["Date of Birth"].dt.month
df["Day of Birth"]=df["Date of Birth"].dt.day
df["Weekday of Borth"]=df["Date of Birth"].dt.weekday

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   Student ID                   200000 non-null  int64         
 1   Student Name                 200000 non-null  object        
 2   Date of Birth                200000 non-null  datetime64[ns]
 3   Field of Study               200000 non-null  object        
 4   Year of Admission            200000 non-null  int64         
 5   Expected Year of Graduation  200000 non-null  int64         
 6   Current Semester             200000 non-null  int64         
 7   Specialization               200000 non-null  object        
 8   Fees                         200000 non-null  float64       
 9   Discount on Fees             200000 non-null  int64         
 10  Graduation                   200000 non-null  float64       
 11  Semester                  

In [116]:
df.drop("Date of Birth", axis=1,inplace=True)
df.drop("Student Name", axis=1,inplace=True)
df.drop("Student ID", axis=1,inplace=True)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Field of Study               200000 non-null  object 
 1   Year of Admission            200000 non-null  int64  
 2   Expected Year of Graduation  200000 non-null  int64  
 3   Current Semester             200000 non-null  int64  
 4   Specialization               200000 non-null  object 
 5   Fees                         200000 non-null  float64
 6   Discount on Fees             200000 non-null  int64  
 7   Graduation                   200000 non-null  float64
 8   Semester                     200000 non-null  float64
 9   Year of Birth                200000 non-null  int32  
 10  Month of Birth               200000 non-null  int32  
 11  Day of Birth                 200000 non-null  int32  
 12  Weekday of Borth             200000 non-null  int32  
dtyp

In [118]:
df["Field of Study"].value_counts()

Field of Study
Electrical Engineering    40305
Chemical Engineering      40020
Computer Science          39960
Mechanical Engineering    39941
Civil Engineering         39774
Name: count, dtype: int64

In [119]:
df["Specialization"].value_counts()

Specialization
Web Development            40292
Machine Learning           40142
Network Security           39932
Data Science               39870
Artificial Intelligence    39764
Name: count, dtype: int64

In [120]:
encoder=LabelEncoder()
df["Field of Study"]=encoder.fit_transform(df["Field of Study"])
df["Specialization"]=encoder.fit_transform(df["Specialization"])

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Field of Study               200000 non-null  int64  
 1   Year of Admission            200000 non-null  int64  
 2   Expected Year of Graduation  200000 non-null  int64  
 3   Current Semester             200000 non-null  int64  
 4   Specialization               200000 non-null  int64  
 5   Fees                         200000 non-null  float64
 6   Discount on Fees             200000 non-null  int64  
 7   Graduation                   200000 non-null  float64
 8   Semester                     200000 non-null  float64
 9   Year of Birth                200000 non-null  int32  
 10  Month of Birth               200000 non-null  int32  
 11  Day of Birth                 200000 non-null  int32  
 12  Weekday of Borth             200000 non-null  int32  
dtyp

In [122]:
df.astype(int)

Unnamed: 0,Field of Study,Year of Admission,Expected Year of Graduation,Current Semester,Specialization,Fees,Discount on Fees,Graduation,Semester,Year of Birth,Month of Birth,Day of Birth,Weekday of Borth
0,2,2020,2017,3,4,0,19572,0,0,2006,1,19,3
1,4,2020,2020,2,2,0,14760,0,0,1999,5,23,6
2,1,2017,2022,1,3,0,5871,0,0,1997,12,2,1
3,2,2021,2019,1,1,0,17284,0,0,1999,10,30,5
4,0,2016,2018,1,3,0,14871,0,0,1998,1,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3,2017,2021,4,1,0,4246,0,0,2001,6,6,2
199996,3,2017,2019,3,4,0,12729,0,0,2003,12,15,0
199997,0,2022,2023,1,1,0,18237,0,0,2000,7,27,3
199998,1,2017,2018,1,1,0,6946,0,0,2001,2,16,4


In [123]:
x=df.drop(columns=["Specialization"])
y=df["Specialization"]

In [124]:
x_train,x_temp,y_train, y_temp=train_test_split(x,y, test_size=0.3,random_state=42)

x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [125]:
clf=KNeighborsClassifier(n_neighbors=100)

clf.fit(x_train,y_train)

In [126]:
y_val_pred=clf.predict(x_val)

print("Accuracy:", accuracy_score(y_val,y_val_pred))

Accuracy: 0.20093333333333332


In [127]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [128]:
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)
x_val_scaled=scaler.transform(x_val)

In [129]:
x_train, x_temp,y_train ,y_temp=train_test_split(x,y,test_size=0.3,random_state=42)

x_test,x_val,y_test,x_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [130]:
 clf=KNeighborsClassifier(n_neighbors=100)

 clf.fit(x_train_scaled,y_train)

In [131]:
y_val_pred=clf.predict(x_val_scaled)

print("Accuracy:", accuracy_score(y_val,y_val_pred))

Accuracy: 0.2043
