In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative location of the student-performance CSV that the next cell loads.
dataset_path = '../../datasets/student_performance.csv'

In [None]:
# Read the comma-separated dataset into a DataFrame and preview its first rows.
df = pd.read_csv(dataset_path, delimiter=',')
df.head()

In [None]:
def two_str_value_to_int(df:pd.DataFrame, columns:list):
    """Encode yes/no columns as integers (``no`` -> 0, ``yes`` -> 1).

    Each column named in ``columns`` is converted only when at least one of
    its values is the string ``yes``; the comparison ignores letter case, so
    ``Yes``/``NO`` are recognised as well. Columns without any ``yes`` value
    are left exactly as they are.

    Args:
        df: Frame to encode. It is modified in place.
        columns: Names of the columns to inspect.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If a column contains ``yes`` together with a value that is
            neither ``yes`` nor ``no`` (missing values included).
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    encoding = {'no': 0, 'yes': 1}
    for column in columns:
        # Normalise once so that detection and mapping agree on letter case.
        normalized = df[column].astype(str).str.lower()
        if not normalized.eq('yes').any():
            continue
        # Fail loudly rather than producing NaN or an opaque list.index error.
        unexpected = [value for value in normalized.unique() if value not in encoding]
        if unexpected:
            raise ValueError(
                f"column {column!r} has values other than yes/no: {unexpected}"
            )
        df[column] = normalized.map(encoding)
    return df


In [None]:
# Columns that hold string values in the raw data.
str_columns = [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob", 
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities", 
    "nursery", "higher", "internet", "romantic"
]

# two_str_value_to_int only rewrites the columns that contain the value 'yes';
# the remaining string columns come back unchanged.
df = two_str_value_to_int(df, str_columns)
df[str_columns].head()

In [None]:
# Sanity check after the encoding step: print the distinct values left in each column.
for column in str_columns:
    print(f'column = {column} || unique values = {list(df[column].unique())}')


In [None]:
# One-hot encode the columns that are still non-numeric; each category becomes
# its own `<column>_<value>` indicator column (e.g. school_GP, school_MS).
# NOTE(review): this rebinds `df`, so re-running only this cell is a no-op at best -- run from the top.
df = pd.get_dummies(df)

In [None]:
# Drop exact duplicate rows and report the row count on either side of the operation.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print(
    f'Count before drop duplicates: {before}',
    f'Count after drop duplicates: {after}',
    sep='\n',
)

| Nota (0 a 10) | Nota GPA (Escala de 4.0) | Nota Letra |
|---------------------|--------------------------|------------|
| 9.0 a 10.0          | 4.0                      | A          |
| 8.0 a 8.9           | 3.0 - 3.9                | B          |
| 7.0 a 7.9           | 2.0 - 2.9                | C          |
| 6.0 a 6.9           | 1.0 - 1.9                | D          |
| 0 a 5.9             | 0.0                      | F          |


In [None]:
# Inspect the distinct second-period grades before deriving the PASS/FAIL label from them.
df['G2'].unique()

In [None]:
# Derive the target label from the second-period grade: G2 is divided by 5 and
# the student is labelled PASS when the result reaches 2, FAIL otherwise.
df['GPA_result'] = [
    'PASS' if int(grade) / 5 >= 2 else 'FAIL'
    for grade in df['G2']
]

In [None]:
# df = df.drop(columns=['G1', 'G2', 'G3'])

In [None]:
# Summary statistics of the encoded frame.
df.describe()

In [None]:
# Check the dtype of every column after encoding.
df.dtypes

In [None]:
# Collect the columns whose name starts with 'Fjob' and preview them.
columns = [name for name in df.columns if name.startswith('Fjob')]

df[columns].head()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

def logs(model, y_test, X_test):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    The F1 scores reported per class assume the labels are 0 (FAIL) and 1 (PASS).

    Args:
        model: Fitted estimator exposing ``predict``.
        y_test: True labels for ``X_test`` (any 1-D array-like).
        X_test: Features to predict on.

    Returns:
        None. Results are printed and the confusion matrix is drawn.
    """
    # Predict on the held-out features.
    y_pred = model.predict(X_test)

    # Confusion matrix. The matrix rows/columns follow sorted label order, so
    # the tick labels must be in that same order; `y_test.unique()` yields
    # order of first appearance and could swap the class names on the plot.
    print('Matriz de Confusão')
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print("Acurácia: {:.2f}".format(acc))

    # Weighted-average precision
    prem = precision_score(y_true = y_test, y_pred = y_pred, average = "weighted")
    print("Precisão média ponderada: {:.2f}".format(prem))

    # Weighted-average recall
    recm = recall_score(y_true = y_test, y_pred = y_pred, average = "weighted")
    print("Recall médio ponderado: {:.2f}".format(recm))

    # F1 with class 0 (FAIL) as the positive label
    f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 0)
    print("F1 (FAIL): {:.2f}".format(f1))
    # F1 with class 1 (PASS) as the positive label
    f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 1)
    print("F1 (PASS): {:.2f}".format(f1))

    # Weighted-average F1
    f1m = f1_score(y_true = y_test, y_pred = y_pred, average = "weighted")
    print("F1 médio ponderado: {:.2f}".format(f1m))



In [None]:
# Feature columns for the first experiment. The list order is the column order of X.
# Excluded: Dalc / Walc (workday / weekend alcohol consumption, 1-5) and the
# period grades G1, G2, G3 (0-20; G3 is the dataset's final grade).
columns_to_X = [
    # --- numeric / ordinal attributes ---
    'age',         # student's age (15 to 22)
    'Medu',        # mother's education (0 none, 1 primary/4th grade, 2 5th-9th grade, 3 secondary, 4 higher)
    'Fedu',        # father's education (same coding as Medu)
    'traveltime',  # home-to-school travel time (1 <15 min, 2 15-30 min, 3 30 min-1 h, 4 >1 h)
    'studytime',   # weekly study time (1 <2 h, 2 2-5 h, 3 5-10 h, 4 >10 h)
    'failures',    # number of past class failures (n if 1<=n<3, else 4)

    # --- yes/no attributes ---
    'schoolsup',   # extra educational support
    'famsup',      # family educational support
    'paid',        # extra paid classes within the course subject (Math or Portuguese)
    'activities',  # extra-curricular activities
    'nursery',     # attended nursery school
    'higher',      # wants to take higher education
    'internet',    # Internet access at home
    'romantic',    # with a romantic relationship

    # --- 1-5 ratings and absences ---
    'famrel',      # quality of family relationships (1 very bad to 5 excellent)
    'freetime',    # free time after school (1 very low to 5 very high)
    'goout',       # going out with friends (1 very low to 5 very high)
    'health',      # current health status (1 very bad to 5 very good)
    'absences',    # number of school absences (0 to 93)

    # --- one-hot indicator columns ---
    'school_GP', 'school_MS',      # school: Gabriel Pereira / Mousinho da Silveira
    'sex_F', 'sex_M',              # sex: female / male
    'address_R', 'address_U',      # home address type: rural / urban
    'famsize_GT3', 'famsize_LE3',  # family size: greater than 3 / less or equal to 3
    'Pstatus_A', 'Pstatus_T',      # parents' cohabitation status: apart / living together
    # mother's job: at_home, health care, other, civil services, teacher
    'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher',
    # father's job: same categories as the mother's
    'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher',
    # reason to choose this school: course preference, close to home, other, reputation
    'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
    # student's guardian
    'guardian_father', 'guardian_mother', 'guardian_other',
]


In [None]:
# Feature matrix: the columns listed in columns_to_X, in that order.
X = df[columns_to_X]
# Binary target: PASS -> 1, FAIL -> 0.
y = df['GPA_result'].map({'PASS':1, 'FAIL':0})
# y = y.map({'PASS':1, 'FAIL':0})

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 fixes the seed of the split.
# NOTE(review): the split is not stratified on y -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural-network classifier with three hidden layers (200, 100, 50), ReLU
# activation and the Adam solver, trained for at most max_iter=5000 iterations
# with a fixed random_state.
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used by
# solver='sgd' and `max_fun` only by solver='lbfgs' -- presumably both are
# inert with 'adam'; confirm they are kept intentionally.
mlp_params = {
    'hidden_layer_sizes': (200, 100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'batch_size': 'auto',
    'learning_rate': 'constant',
    'learning_rate_init': 0.001,
    'max_iter': 5000,
    'shuffle': True,
    'random_state': 1,
    'max_fun': 15000,
}
model = MLPClassifier(**mlp_params)

model.fit(X_train, y_train)


In [None]:
# Evaluate the model fitted above on the held-out split.
logs(model=model, y_test=y_test, X_test=X_test)

In [None]:
from sklearn.neural_network import MLPClassifier

# Second configuration: a smaller network (84, 42, 21) with ReLU activation and
# the Adam solver, trained for at most max_iter=5000 iterations with a fixed
# random_state.
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used by
# solver='sgd' and `max_fun` only by solver='lbfgs' -- presumably
# learning_rate='adaptive' has no effect with 'adam'; confirm.
mlp_params = {
    'hidden_layer_sizes': (84, 42, 21),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'batch_size': 'auto',
    'learning_rate': 'adaptive',
    'learning_rate_init': 0.001,
    'max_iter': 5000,
    'shuffle': True,
    'random_state': 1,
    'max_fun': 10000,
}
model = MLPClassifier(**mlp_params)

model.fit(X_train, y_train)


In [None]:
# Evaluate the second configuration on the same held-out split.
logs(model=model, y_test=y_test, X_test=X_test)

In [None]:
# Reduced feature set for the second experiment. The list order is the column order of X.
# Compared with the full attribute list, the sex_*, address_*, famsize_*,
# Mjob_*, Fjob_* and guardian_* indicator columns are excluded, as are
# Dalc / Walc (alcohol consumption, 1-5) and the period grades G1, G2, G3 (0-20).
columns_to_X = [
    # --- numeric / ordinal attributes ---
    'age',         # student's age (15 to 22)
    'Medu',        # mother's education (0 none, 1 primary/4th grade, 2 5th-9th grade, 3 secondary, 4 higher)
    'Fedu',        # father's education (same coding as Medu)
    'traveltime',  # home-to-school travel time (1 <15 min, 2 15-30 min, 3 30 min-1 h, 4 >1 h)
    'studytime',   # weekly study time (1 <2 h, 2 2-5 h, 3 5-10 h, 4 >10 h)
    'failures',    # number of past class failures (n if 1<=n<3, else 4)

    # --- yes/no attributes ---
    'schoolsup',   # extra educational support
    'famsup',      # family educational support
    'paid',        # extra paid classes within the course subject (Math or Portuguese)
    'activities',  # extra-curricular activities
    'nursery',     # attended nursery school
    'higher',      # wants to take higher education
    'internet',    # Internet access at home
    'romantic',    # with a romantic relationship

    # --- 1-5 ratings and absences ---
    'famrel',      # quality of family relationships (1 very bad to 5 excellent)
    'freetime',    # free time after school (1 very low to 5 very high)
    'goout',       # going out with friends (1 very low to 5 very high)
    'health',      # current health status (1 very bad to 5 very good)
    'absences',    # number of school absences (0 to 93)

    # --- one-hot indicator columns kept in this experiment ---
    'school_GP', 'school_MS',  # school: Gabriel Pereira / Mousinho da Silveira
    'Pstatus_A', 'Pstatus_T',  # parents' cohabitation status: apart / living together
    # reason to choose this school: course preference, close to home, other, reputation
    'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
]


In [None]:
# Rebuild the feature matrix from the reduced columns_to_X list.
X = df[columns_to_X]
# Binary target: PASS -> 1, FAIL -> 0.
y = df['GPA_result'].map({'PASS':1, 'FAIL':0})
# y = y.map({'PASS':1, 'FAIL':0})

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, now applied to the reduced feature set.
# NOTE(review): the split is not stratified on y -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.neural_network import MLPClassifier

# Third configuration: hidden layers (64, 32, 16) with the Adam solver, trained
# for at most max_iter=5000 iterations with a fixed random_state. `activation`
# is not passed, so the estimator's default applies.
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used by
# solver='sgd' and `max_fun` only by solver='lbfgs' -- presumably both are
# inert with 'adam'; confirm they are kept intentionally.
mlp_params = {
    'hidden_layer_sizes': (64, 32, 16),
    'solver': 'adam',
    'alpha': 0.0001,
    'batch_size': 'auto',
    'learning_rate': 'constant',
    'learning_rate_init': 0.001,
    'max_iter': 5000,
    'shuffle': True,
    'random_state': 1,
    'max_fun': 15000,
}
model = MLPClassifier(**mlp_params)

model.fit(X_train, y_train)


In [None]:
# Evaluate the reduced-feature model on its held-out split.
logs(model=model, y_test=y_test, X_test=X_test)

In [None]:
# Hard-coded record of the metrics noted for the best model version.
# NOTE(review): these figures are string literals, not recomputed from `model`,
# so they will not follow later changes to the data or the configuration.
print("""
** Melhor versão do Modelo.

Matriz de Confusão
Acurácia: 0.82
Precisão média ponderada: 0.80
Recall médio ponderado: 0.82
F1 (FAIL): 0.52
F1 (PASS): 0.89
F1 médio ponderado: 0.80
""")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# # Predição
# y_pred = model.predict(X_test)

# # Matriz de confusão
# print('Matriz de Confusão')
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=y_test.unique())
# disp.plot()

# # Acurácia
# acc = accuracy_score(y_test, y_pred)
# print("Acurácia: {:.2f}".format(acc))

# # Precisão média ponderada
# prem = precision_score(y_true = y_test, y_pred = y_pred, average = "weighted")
# print("Precisão média ponderada: {:.2f}".format(prem))

# # Recall médio ponderado
# recm = recall_score(y_true = y_test, y_pred = y_pred, average = "weighted")
# print("Recall médio ponderado: {:.2f}".format(recm))

# # F1 
# f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 0)
# print("F1 (FAIL): {:.2f}".format(f1))
# # F1 
# f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 1)
# print("F1 (PASS): {:.2f}".format(f1))

# # F1 médio ponderado
# f1m = f1_score(y_true = y_test, y_pred = y_pred, average = "weighted")
# print("F1 médio ponderado: {:.2f}".format(f1m))


In [None]:
# import joblib

In [None]:
# joblib.dump(model, 'house_price_linear_regression_model.pkl')