In [1]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import ADASYN
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Read the CSV file
df = pd.read_csv("data.csv", sep=";")

df.head()


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
df.isnull().sum()

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [4]:
df['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df["Target"])
df["Target"] = encoder.transform(df["Target"])

In [7]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,2
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,2


In [8]:
df['Target'].unique()

array([0, 2, 1])

In [9]:
df.shape

(4424, 37)

In [10]:
df['Target'].value_counts()

2    2209
0    1421
1     794
Name: Target, dtype: int64

In [11]:
X = df.drop('Target', axis=1)
y = df['Target']


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=24)

# Create an instance of the ADASYN class
adasyn = ADASYN(random_state=42)

# Oversample the training data
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)


In [12]:
param_grid_abc = {
    'n_estimators': [100, 200, 500, 1000],
    'learning_rate': [0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

param_grid_dtree = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10]
}
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2']
}
param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs', 'sag']
}

param_grid_xbc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}


In [13]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [14]:
# abc = GridSearchCV(AdaBoostClassifier(random_state=1), param_grid_abc)
# dtree = GridSearchCV(DecisionTreeClassifier(random_state=3), param_grid_dtree)
# rfc = GridSearchCV(RandomForestClassifier(random_state=4), param_grid_rfc)
# lr = GridSearchCV(LogisticRegression(random_state=2), param_grid_lr)
# xbc = GridSearchCV(XGBClassifier(tree_method='gpu_hist'), param_grid_xbc)

abc = GridSearchCV(AdaBoostClassifier(random_state=1), param_grid_abc, cv=cv)
dtree = GridSearchCV(DecisionTreeClassifier(random_state=3), param_grid_dtree, cv=cv)
rfc = GridSearchCV(RandomForestClassifier(random_state=4), param_grid_rfc, cv=cv)
lr = GridSearchCV(LogisticRegression(random_state=2), param_grid_lr, cv=cv)
xbc = GridSearchCV(XGBClassifier(tree_method='gpu_hist'), param_grid_xbc, cv=cv)

In [15]:
abc.fit(X_train_resampled, y_train_resampled)
dtree.fit(X_train_resampled, y_train_resampled)
rfc.fit(X_train_resampled, y_train_resampled)
lr.fit(X_train_resampled, y_train_resampled)
xbc.fit(X_train_resampled, y_train_resampled)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTA

In [16]:
abcb = abc.best_estimator_
dtreeb = dtree.best_estimator_
rfcb = rfc.best_estimator_
lrb = lr.best_estimator_
xbcb = xbc.best_estimator_
print('Best parameters for abc:')
print(abcb.get_params())
print('Best parameters for dtree:')
print(dtreeb.get_params())
print('Best parameters for rfc:')
print(rfcb.get_params())
print('Best parameters for lr:')
print(lrb.get_params())
print('Best parameters for xbc:')
print(xbcb.get_params())

Best parameters for abc:
{'algorithm': 'SAMME.R', 'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 0.5, 'n_estimators': 500, 'random_state': 1}
Best parameters for dtree:
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 3, 'splitter': 'best'}
Best parameters for rfc:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 4, 'verbose': 0, 'warm_start': False}
Best parameters for lr:
{'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1

In [17]:
print(classification_report(y_test, abcb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76       300
           1       0.44      0.53      0.48       154
           2       0.82      0.83      0.82       431

    accuracy                           0.74       885
   macro avg       0.69      0.69      0.69       885
weighted avg       0.75      0.74      0.74       885



In [18]:
print(classification_report(y_test, dtreeb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.62      0.72       300
           1       0.39      0.57      0.46       154
           2       0.79      0.81      0.80       431

    accuracy                           0.71       885
   macro avg       0.68      0.67      0.66       885
weighted avg       0.75      0.71      0.72       885



In [19]:
print(classification_report(y_test, rfcb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.72      0.78       300
           1       0.47      0.54      0.50       154
           2       0.82      0.86      0.84       431

    accuracy                           0.76       885
   macro avg       0.71      0.71      0.71       885
weighted avg       0.77      0.76      0.76       885



In [20]:
print(classification_report(y_test, lrb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.69      0.76       300
           1       0.44      0.61      0.51       154
           2       0.83      0.82      0.83       431

    accuracy                           0.74       885
   macro avg       0.71      0.71      0.70       885
weighted avg       0.77      0.74      0.75       885



In [21]:
print(classification_report(y_test, xbcb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78       300
           1       0.47      0.45      0.46       154
           2       0.81      0.87      0.84       431

    accuracy                           0.76       885
   macro avg       0.70      0.69      0.70       885
weighted avg       0.76      0.76      0.75       885

