In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to seaborn's default theme for the figures drawn in this notebook.
sns.set_theme()

In [None]:
import os

import kagglehub

# Download the latest version of the dataset; the returned value is used below
# as the directory that holds the downloaded files.
path = kagglehub.dataset_download("dipam7/student-grade-prediction")

print("Path to dataset files:", path)
# List the downloaded files with the standard library rather than `!ls {path}`:
# the shell form only works on POSIX systems and breaks when the path contains spaces.
print("\n".join(sorted(os.listdir(path))))

In [None]:
import pandas as pd

# Load student-mat.csv from the directory the dataset was downloaded to.
data = pd.read_csv(f"{path}/student-mat.csv")
# Last expression of the cell, so the notebook renders a preview of the frame.
data

Attribute Information:

school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)

sex - student's sex (binary: 'F' - female or 'M' - male)

age - student's age (numeric: from 15 to 22)

address - student's home address type (binary: 'U' - urban or 'R' - rural)

famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)

Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)

Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education)

Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education)

Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')

Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')

reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')

guardian - student's guardian (nominal: 'mother', 'father' or 'other')

traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)

studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

failures - number of past class failures (numeric: n if 1<=n<3, else 4)

schoolsup - extra educational support (binary: yes or no)

famsup - family educational support (binary: yes or no)

paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

activities - extra-curricular activities (binary: yes or no)

nursery - attended nursery school (binary: yes or no)

higher - wants to take higher education (binary: yes or no)

internet - Internet access at home (binary: yes or no)

romantic - with a romantic relationship (binary: yes or no)

famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

freetime - free time after school (numeric: from 1 - very low to 5 - very high)

goout - going out with friends (numeric: from 1 - very low to 5 - very high)

Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

health - current health status (numeric: from 1 - very bad to 5 - very good)

absences - number of school absences (numeric: from 0 to 93)




In [None]:
# Show summary information about the features (columns) of the dataset
data.info()

In [None]:
# Get descriptive statistics for the numeric features of the data.
# This is a dataset about students; the target features we care about are G1, G2 and G3:
# per the author, the grades for the first period, the second period and the final result.
data.describe()

In [None]:
print("Уникальные значения нечисловых прихзнаков")
# Maps each text column name to the array of distinct values found in that column.
column_uniques = {}
for c in data.columns:
    col = data[c]
    # Text columns are `object` dtype in older pandas, but may be stored with the
    # dedicated pandas string dtype; accept both so no text column is skipped.
    if col.dtype == 'object' or isinstance(col.dtype, pd.StringDtype):
        column_uniques[c] = col.unique()
        print(f"{c}:\n{column_uniques[c]}")

In [None]:
# Draw one histogram of the grades per grading period (G1, G2, G3),
# each as its own figure with its own title.
for grade_column, period_title in (
    ("G1", "Первый период"),
    ("G2", "Второй период"),
    ("G3", "Итоговая оценка"),
):
    plt.title(period_title)
    plt.hist(data[grade_column])
    plt.show()

In [76]:
# To simplify the visualisations, bucket the final grade G3 into four groups.
# include_lowest=True closes the first bin on the left: with the default
# right-closed bins the first interval is (0, 5], so any row with G3 == 0 would
# get NaN instead of a group.
data["G3_group"] = pd.cut(x=data['G3'], bins=[0,5,10,15,20], include_lowest=True)

In [None]:
# Stacked histograms of several features, coloured by the final-grade group.
# The remark above each entry is the author's reading of the resulting chart.
for feature, chart_title in (
    # No strong effect.
    ("romantic", "Зависиммость оценок от наличия отношений"),
    # Students without internet at home can hardly study properly at all:
    # their grade groups are shifted to the left.
    ("internet", "Зависиммость оценок от наличия интернета дома"),
    # The more free time after school, the better the student does, as a rule.
    ("freetime", "Зависиммость оценок от наличия свободного времени после учебы"),
    # The more time spent studying, the better the student does, as a rule.
    ("studytime", "Зависиммость оценок от наличия свободного времени потраченного на учебу"),
):
    sns.histplot(data, x=feature, hue="G3_group", multiple="stack")
    plt.title(chart_title)
    plt.show()



In [None]:
# Box plots of the final grade G3, one figure per binary feature.
for binary_feature in (
    "internet",
    "sex",  # author's observation: boys do slightly better
):
    sns.boxplot(data=data, x=binary_feature, y='G3')
    plt.show()


In [None]:
# Violin plots of the final grade G3 grouped by each parent's job.
# Author's observations: students do better when the mother works in health
# and when the father is a teacher.
for job_column, chart_title in (
    ("Mjob", "Distribution of G3 Scores by mother job"),
    ("Fjob", "Distribution of G3 Scores by father job"),
):
    sns.violinplot(data=data, x=job_column, y='G3')
    plt.title(chart_title)
    plt.show()


In [None]:
# Columns handed to the pair plot; G3_group is included because it is used as the hue.
cols = [
    'school', 'sex', 'age', 'famsize',
    'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
    'internet', 'romantic',
    'famrel', 'freetime', 'goout',
    'Dalc', 'Walc',
    'health', 'absences',
    'G3_group',
]
# Scatter-plot matrix of the selected columns, coloured by final-grade group.
sns.pairplot(data=data[cols], hue="G3_group", kind="scatter")

In [122]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
def _is_text_column(column):
    """Return True when the column holds text (object dtype or pandas string dtype)."""
    return column.dtype == object or isinstance(column.dtype, pd.StringDtype)


def _encode_two_level(column):
    """Encode a two-level column as floats: 0.0 / 1.0 in sorted value order, NaN for missing."""
    codes, _ = pd.factorize(column, sort=True)
    encoded = pd.Series(codes, dtype=float)
    # factorize marks missing values with -1; keep them missing rather than
    # turning them into a fake level.
    return encoded.mask(encoded < 0).to_numpy()


def one_hot_encode(data):
    """Replace string data with numeric encodings.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        A Series is one-hot encoded directly. For a DataFrame every text column
        (object or pandas string dtype) is encoded and all other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        For a Series: its dummy columns, prefixed with the Series name.
        For a DataFrame: a new frame in which
        * a text column with exactly two distinct values becomes a single float
          column (0.0 / 1.0, assigned in sorted value order), and
        * any other text column is replaced, at the same position, by one float
          dummy column per distinct value, named ``<column>_<value>``.
        The input frame is never modified.

    Raises
    ------
    ValueError
        If ``data`` is neither a Series nor a DataFrame.
    """
    if isinstance(data, pd.Series):
        return pd.get_dummies(data, prefix=data.name, dtype=float)
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame or Series")

    # Collect the output column by column and concatenate once at the end,
    # instead of slicing and re-concatenating the frame for every text column.
    pieces = []
    # Iterate by position so duplicated column labels cannot break the lookup.
    for position in range(data.shape[1]):
        column = data.iloc[:, position]
        if not _is_text_column(column):
            pieces.append(column)
        elif len(column.unique()) == 2:
            pieces.append(
                pd.Series(_encode_two_level(column), index=column.index, name=column.name)
            )
        else:
            pieces.append(pd.get_dummies(column, prefix=column.name, dtype=float))

    if not pieces:
        # A frame without columns has nothing to encode (and concat rejects an empty list).
        return data.copy()
    return pd.concat(pieces, axis=1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd

print("Заменим нечисловые признаки")
# Encode the text columns of `data` numerically with one_hot_encode.
encoded_data = one_hot_encode(data)
# Last expression of the cell, so the notebook renders the encoded frame.
encoded_data

In [None]:
# Pairwise correlations between the numeric columns of the encoded frame
# (numeric_only=True leaves non-numeric columns out), drawn as a heat map.
correlation_matrix = encoded_data.corr(numeric_only=True)
sns.heatmap(correlation_matrix)