In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the original dataset
df = pd.read_csv("dataset.csv")
print("Original shape:", df.shape)
df.head()


Original shape: (480, 17)


Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


In [3]:
# Drop columns that may not help the prediction
columns_to_drop = ['PlaceofBirth', 'SectionID', 'GradeID']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print("Remaining columns:", df.columns.tolist())
df.head()



Remaining columns: ['gender', 'NationalITy', 'StageID', 'Topic', 'Semester', 'Relation', 'raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction', 'StudentAbsenceDays', 'Class']


Unnamed: 0,gender,NationalITy,StageID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,lowerlevel,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,lowerlevel,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,lowerlevel,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,lowerlevel,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,lowerlevel,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


In [4]:
# Label encode all categorical features
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

df.head()


Unnamed: 0,gender,NationalITy,StageID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,2,7,0,0,15,16,2,20,1,1,1,2
1,1,4,2,7,0,0,20,20,3,25,1,1,1,2
2,1,4,2,7,0,0,10,7,0,30,0,0,0,1
3,1,4,2,7,0,0,30,25,5,35,0,0,0,1
4,1,4,2,7,0,0,40,50,12,50,0,0,0,2


In [5]:
# 'Class' column: 'L' means Low (Dropout) → 1, rest → 0
# NOTE: If 'Class' is already encoded, skip this

# You can uncomment below to verify unique values:
# print(df['Class'].unique())

df['Class'] = df['Class'].apply(lambda x: 1 if x == 0 else 0)
df['Class'].value_counts()


Class
0    338
1    142
Name: count, dtype: int64

In [6]:
# Save the cleaned dataset for training
df.to_csv("cleaned_dataset.csv", index=False)
print("✅ Cleaned dataset saved as 'cleaned_dataset.csv'")


✅ Cleaned dataset saved as 'cleaned_dataset.csv'
