## Importing Necessary Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the student dataset; the path is relative to the current working directory.
df = pd.read_csv('Dataset/Student_Dataset.csv')

In [None]:
# Sanity check: print the type of the object returned by read_csv.
print(type(df))

## Checking the dataframe

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Concise summary of the dataframe (columns, non-null counts, dtypes).
df.info()

In [None]:
df.describe().T  # transposed so that each column's summary statistics read as one row

In [None]:
df.isnull().sum()  # number of null values in each column

In [None]:
df.duplicated().sum()  # number of rows that duplicate an earlier row

In [None]:
# Number of rows for each value of the Target column.
df['Target'].value_counts()

In [None]:
# Drop the rows whose Target is 'Enrolled', since we are only concerned with
# graduate and dropout students.
# .copy() makes the filtered result an independent DataFrame, so the in-place
# modifications applied to df later in the notebook (re-assigning 'Target',
# dropping columns with inplace=True) do not raise SettingWithCopyWarning.
df = df[df.Target != "Enrolled"].copy()

In [None]:
# Display the dataframe after removing the 'Enrolled' rows.
df

## Exploratory Analysis

In [None]:
# from ydata_profiling import ProfileReport
#
# profile = ProfileReport(df, title= 'Dropout Pandas Profiling Report', explorative=True)
# profile.to_file('Dropout_ProfileReport.html')

In [None]:
plt.figure(figsize=(7, 4))  # Optional: Set the figure size
# Pin the category order explicitly so every bar is guaranteed to sit above the
# matching tick label. Relabelling the ticks by hand (as was done before) would
# silently mislabel the bars whenever the categories come out in another order.
plot1 = sns.countplot(data=df, x='Target', palette='Paired', order=['Dropout', 'Graduate'])
plt.xlabel('Student Status')
plt.ylabel('Count')
plt.title('Value Counts of Student Status')
plt.show()

In [None]:
# Distribution of the 1st-semester grade for each value of Target
sns.boxplot(
    data=df,
    x='Target',
    y='Curricular units 1st sem (grade)',
    palette='Paired',
)
plt.show()

In [None]:
# Distribution of the 2nd-semester grade for each value of Target
sns.boxplot(
    data=df,
    x='Target',
    y='Curricular units 2nd sem (grade)',
    palette='Paired',
)
plt.show()

In [None]:

# Age at enrollment per gender, split by student status
# (seaborn's barplot aggregates with the mean by default).
plt.figure(figsize=(7, 4))
plot4 = sns.barplot(x='Gender', y='Age at enrollment', hue='Target', data=df, palette='Blues')
plt.legend(loc=1)
plt.xlabel('Gender')  # the x-axis shows Gender; the old label mentioned courses by mistake
# NOTE(review): assumes Gender is coded 0 = Female, 1 = Male - confirm against the data dictionary
plot4.set_xticklabels(['Female', 'Male'])
plt.show()

In [None]:
# Cross-tabulation plot of student status by gender.
# margins=True adds a 'Total' row and column to the table that is plotted.
pd.crosstab(
    df.Target,
    df.Gender,
    margins=True,
    margins_name='Total',
).plot(
    kind="bar",
    figsize=(7, 4),
    title="Gender vs Status",
    color=['lightblue', 'deepskyblue', 'dodgerblue'],
)
plt.ylabel('Count')
plt.legend(labels=['Female', 'Male', 'Total'])
plt.show()

In [None]:
# Cross-tabulation plot of student status by debtor flag.
pd.crosstab(df.Target, df.Debtor).plot(
    kind="bar",
    figsize=(7, 4),
    title="Debt and Status",
    color=['lightblue', 'deepskyblue'],
)
plt.ylabel('Count')
plt.legend(labels=['Not In Debt', 'In Debt'])
plt.show()


In [None]:
# Cross-tabulation plot of student status by scholarship flag.
pd.crosstab(df['Target'], df['Scholarship holder']).plot(
    kind="bar",
    figsize=(7, 4),
    title="Scholarship Holders vs Status",
    color=['lightblue', 'deepskyblue'],
)
plt.ylabel('Count')
plt.legend(labels=['Non-Holders', 'Holders'])
plt.show()

## Cleaning and Preprocessing

In [None]:
# There are no null or duplicate values, so we skip the removal of these values

In [None]:
# Encode the target label as an integer: Dropout -> 0, Graduate -> 1
df['Target'] = df['Target'].map({'Dropout': 0, 'Graduate': 1})

In [None]:
# Split the columns into thematic groups so that each correlation matrix stays
# small enough to read. Every group also carries the Target column.

# Demographic Data
Demo_df = df[[
    "Marital status",
    "Displaced",
    "Age at enrollment",
    "Gender",
    "Nacionality",
    "International",
    "Target",
]]

# Socio-Economic Data
SE_df = df[[
    "Mother's qualification",
    "Mother's occupation",
    "Father's qualification",
    "Father's occupation",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Scholarship holder",
    "Target",
]]

# Academic Data
Academic_df = df[[
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Target',
]]

# Other Miscellaneous Data
Others_df = df[[
    'Unemployment rate',
    'Inflation rate',
    'GDP',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Target',
]]

In [None]:
# Annotated correlation heatmap of the demographic feature group
fig, ax = plt.subplots(figsize=(7, 4))
sns.heatmap(
    Demo_df.corr(),
    annot=True,
    cmap="magma",
    ax=ax,
)
ax.set_title("Correlation Matrix of Demographic Data")
plt.show()

In [None]:
# International has very high correlation with Nationality.
# So Nationality needs to be removed.

In [None]:
# Annotated correlation heatmap of the socio-economic feature group
fig, ax = plt.subplots(figsize=(7, 4))
sns.heatmap(
    SE_df.corr(),
    annot=True,
    cmap="coolwarm",
    ax=ax,
)
ax.set_title("Correlation Matrix of Socio-Economic Data")
plt.show()

In [None]:
# There is no high correlation in the above plot, so no features are marked to be removed.

In [None]:
# Annotated correlation heatmap of the academic feature group.
# The figure is square (equal width and height) to better visualise the correlations.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(
    Academic_df.corr(),
    annot=True,
    annot_kws={"size": 7.5},
    cmap="viridis",
    ax=ax,
)
ax.set_title("Correlation Matrix for Academic Data")
plt.show()

In [None]:
# There are some high correlations in the above plot

# Curricular units 1st sem (credited) has high correlation with 2nd sem (credited)
# Curricular units 1st sem (enrolled) has high correlation with 2nd sem (enrolled)
# Curricular units 1st sem (evaluations) has high correlation (0.79) with 2nd sem (evaluations)
# Curricular units 1st sem (approved) has high correlation with 2nd sem (approved)
# Curricular units 1st sem (grade) has high correlation with 2nd sem (grade)

# Hence removing these five 1st sem academic features will be enough.

In [None]:
# Annotated correlation heatmap of the miscellaneous feature group
fig, ax = plt.subplots(figsize=(7, 4))
sns.heatmap(
    Others_df.corr(),
    annot=True,
    cmap="plasma",
    ax=ax,
)
ax.set_title("Correlation Matrix for Miscellaneous Data")
plt.show()

In [None]:
# There are no high correlations here, so no features are marked to be removed.

In [None]:
# Columns to discard from df before modelling
Redundant_Features = [
    "Nacionality",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
]

# Remove the columns from df in place
df.drop(columns=Redundant_Features, inplace=True)

In [None]:
# Display the dataframe after dropping the redundant features.
df

## Separating Features from the Target variable

In [None]:
# Select features and target by column name instead of by position: a
# hard-coded column count (":28") silently drops features, or leaks the target
# into X, as soon as the dataset's column layout differs from what was assumed.
X = df.drop(columns='Target')   # every column except the label
Y = df['Target']                # the label column

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
Y

## Model Training and Evaluation

In [None]:
# Importing libraries for Machine Learning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

# Importing libraries for Evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Train-Test Splitting the Dataset (80% train / 20% test).
# random_state fixes the shuffle so the reported metrics and the saved model are
# reproducible across runs; stratify=Y keeps the class proportions of the target
# the same in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Creating a Random Forest Model for training on the best hyperparameters using Grid Search
def RandomForestModel(X, Y, param_grid=None, cv=4, random_state=None):
    """Tune a RandomForestClassifier with an exhaustive grid search.

    Parameters
    ----------
    X : array-like
        Training features, passed unchanged to ``GridSearchCV.fit``.
    Y : array-like
        Training labels, passed unchanged to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Hyperparameter grid to search. ``None`` (the default) uses the
        built-in grid defined below.
    cv : int, default 4
        Number of cross-validation folds used by the grid search.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the
        original unseeded behaviour; pass an integer for reproducible forests.

    Returns
    -------
    GridSearchCV
        The fitted search object (``GridSearchCV.fit`` returns the search
        itself), not a bare RandomForestClassifier.
    """
    if param_grid is None:
        # Default search space for the best hyperparameters
        param_grid = {
            'bootstrap': [True, False],
            'n_estimators': [20, 40, 80, 100],
            'max_depth': [3, 5, 10, None],
            'max_features': ['sqrt', 'log2', None],
            'min_samples_split': [2, 5, 7],
            'criterion': ['gini', 'entropy'],
        }

    rfc = RandomForestClassifier(random_state=random_state)

    # n_jobs=-1 uses all available cores; verbose=1 prints search progress
    clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=1)
    model = clf.fit(X, Y)

    return model

In [None]:
# Creating a scorer function
def scorer(X_Test, Y_Test, Model):
    """Print accuracy, precision, recall and F1 of ``Model`` on the given data.

    Parameters
    ----------
    X_Test : array-like
        Features to predict on.
    Y_Test : array-like
        True labels for ``X_Test``.
    Model : object
        Any fitted object exposing ``predict``.

    Returns
    -------
    None
        The scores are only printed. Precision, recall and F1 are computed
        with scikit-learn's default arguments.
    """
    Y_Pred = Model.predict(X_Test)
    # The scores describe the evaluation data passed in, not the training data,
    # so the accuracy line is labelled as a test score.
    print(f'The Test Accuracy Score is {accuracy_score(Y_Test, Y_Pred)}')
    print(f'Precision Score: {precision_score(Y_Test, Y_Pred)}')
    print(f'Recall Score: {recall_score(Y_Test, Y_Pred)}')
    print(f'f1 Score: {f1_score(Y_Test, Y_Pred)}')

In [None]:
def Cross_Validation_Score(X, Y, Model):
    """Print the mean 10-fold cross-validation score of ``Model`` on (X, Y).

    ``cross_val_score`` is called with ``cv=10`` and the estimator's default
    scoring. Nothing is returned; the mean score is only printed.
    """
    fold_scores = cross_val_score(Model, X, Y, cv=10)
    mean_score = fold_scores.mean()
    print(f'The Cross Validation Score is : {mean_score}')

In [None]:
# Training the Random Forest Model on the training split.
# RandomForestModel returns the fitted GridSearchCV object, so `model` is the
# search wrapper rather than a bare RandomForestClassifier.
model = RandomForestModel(X_train, Y_train)

In [None]:
# Saving the model: serialise `model` with pickle into a file in the current
# working directory.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle this
# file from a trusted location.
import pickle
with open('Dropout_Prediction_Model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Scoring the performance on the held-out test split
scorer(X_test, Y_test, model)

In [None]:
# Getting the Cross Validation Score of the tuned forest.
# `model` is the fitted GridSearchCV wrapper; handing it to cross_val_score
# would clone it and repeat the entire grid search inside each of the 10 folds.
# best_estimator_ is the forest refitted with the best hyperparameters found.
Cross_Validation_Score(X_train, Y_train, model.best_estimator_)