In [1]:
#Load Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Show every column when a DataFrame is rendered (None disables the column cap)
pd.options.display.max_columns = None

In [2]:
# One-time setup: uncomment on a fresh environment to install the UCI dataset client.
# %pip install ucimlrepo==0.0.7

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo #from <https://archive.ics.uci.edu/dataset/320/student+performance>

**Dataset Description:**
This dataset examines student achievement in secondary education at two Portuguese schools. It includes student grades together with demographic, social, and school-related attributes, collected through school reports and questionnaires. Two datasets are provided, each covering performance in a different subject: Mathematics (mat) and Portuguese language (por).

In the study by Cortez and Silva (2008), the datasets were modeled using binary/five-level classification and regression tasks.

**Important Note:**
The target attribute G3 (final year grade) has a strong correlation with attributes G2 (2nd period grade) and G1 (1st period grade). This is because G3 is the final year grade issued at the 3rd period, while G1 and G2 correspond to the 1st and 2nd period grades, respectively. Predicting G3 without considering G2 and G1 is more challenging but also more practical (for further details, refer to the paper).

*Source: UCI Machine Learning Repository - Student Performance Dataset*

In [3]:
# Download the Student Performance dataset from the UCI repository (dataset id 320)
student_performance = fetch_ucirepo(id=320)

# Unpack the feature and target tables (both are pandas DataFrames)
X, Y = student_performance.data.features, student_performance.data.targets

{'uci_id': 320, 'name': 'Student Performance', 'repository_url': 'https://archive.ics.uci.edu/dataset/320/student+performance', 'data_url': 'https://archive.ics.uci.edu/static/public/320/data.csv', 'abstract': 'Predict student performance in secondary education (high school). ', 'area': 'Social Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 649, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': ['Sex', 'Age', 'Other', 'Education Level', 'Occupation'], 'target_col': ['G1', 'G2', 'G3'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2008, 'last_updated': 'Fri Jan 05 2024', 'dataset_doi': '10.24432/C5TG7T', 'creators': ['Paulo Cortez'], 'intro_paper': {'title': 'Using data mining to predict secondary school student performance', 'authors': 'P. Cortez, A. M. G. Silva', 'published_in': 'Proceedings of 5th Annual Future Business Technology Conference', 'year'

In [6]:
# Summarize the target frame: column names, non-null counts and dtypes
Y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   G1      649 non-null    int64
 1   G2      649 non-null    int64
 2   G3      649 non-null    int64
dtypes: int64(3)
memory usage: 15.3 KB


In [4]:
# Preview the first five rows of the targets (G1, G2, G3)
Y.head()

Unnamed: 0,G1,G2,G3
0,0,11,11
1,9,11,11
2,12,13,12
3,14,14,14
4,11,13,13


| Variable Name | Role    | Type        | Demographic       | Description                                                                                               | Units | Missing Values |
|---------------|---------|-------------|-------------------|-----------------------------------------------------------------------------------------------------------|-------|----------------|
| school        | Feature | Categorical |                   | Student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)                          |       | no             |
| sex           | Feature | Binary      | Sex               | Student's sex (binary: 'F' - female or 'M' - male)                                                         |       | no             |
| age           | Feature | Integer     | Age               | Student's age (numeric: from 15 to 22)                                                                     |       | no             |
| address       | Feature | Categorical |                   | Student's home address type (binary: 'U' - urban or 'R' - rural)                                           |       | no             |
| famsize       | Feature | Categorical | Other             | Family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)                                 |       | no             |
| Pstatus       | Feature | Categorical | Other             | Parent's cohabitation status (binary: 'T' - living together or 'A' - apart)                                |       | no             |
| Medu          | Feature | Integer     | Education Level   | Mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education, 4 - higher education) |       | no             |
| Fedu          | Feature | Integer     | Education Level   | Father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education, 4 - higher education) |       | no             |
| Mjob          | Feature | Categorical | Occupation        | Mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')                  |       | no             |
| Fjob          | Feature | Categorical | Occupation        | Father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')                  |       | no             |

In [7]:
# Summarize the feature frame: column names, non-null counts and dtypes
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

In [5]:
# Preview the first five rows of the features
X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,4
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,2
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,6
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,0
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,0


In [8]:
# Count missing entries per column before any imputation is applied
print("Missing values in X before imputation:\n", X.isna().sum())
print("Missing values in Y before imputation:\n", Y.isna().sum())

Missing values in X before imputation:
 school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
dtype: int64
Missing values in Y before imputation:
 G1    0
G2    0
G3    0
dtype: int64


In [None]:
# Impute missing values.
# X mixes numeric and categorical (object) columns. SimpleImputer(strategy='mean')
# raises a ValueError on non-numeric data, so each dtype group gets its own strategy.
# NOTE: the imputers are fit on the full dataset, i.e. before the train/test split.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

imputer_X = SimpleImputer(strategy='mean')  # numeric features; 'median' is a drop-in alternative
imputer_X_cat = SimpleImputer(strategy='most_frequent')  # categorical features: fill with the mode
imputer_Y = SimpleImputer(strategy='most_frequent')  # G1-G3 are integer grades; the mode keeps them on the grade scale

# Start from a copy so the column order and row index of X are preserved
X_imputed = X.copy()
if len(numeric_cols) > 0:
    X_imputed[numeric_cols] = imputer_X.fit_transform(X[numeric_cols])
if len(categorical_cols) > 0:
    X_imputed[categorical_cols] = imputer_X_cat.fit_transform(X[categorical_cols])

Y_imputed = pd.DataFrame(imputer_Y.fit_transform(Y), columns=Y.columns, index=Y.index)

In [None]:
# Confirm that no missing entries remain once imputation has run
print("Missing values in X after imputation:\n", X_imputed.isna().sum())
print("Missing values in Y after imputation:\n", Y_imputed.isna().sum())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X_imputed,
    Y_imputed,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the numeric features.
# StandardScaler only accepts numeric input, and the feature frame also holds
# categorical (object) columns, so those are passed through unscaled.
numeric_cols = X_train.select_dtypes(include='number').columns

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
if len(numeric_cols) > 0:
    # Fit on the training split only, then reuse those statistics for the test split
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Optional: wrap the scaled features in DataFrames labelled with the original column names
feature_names = X.columns
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)

# Print the first rows of every split as a quick sanity check
for label, frame in (
    ("X_train_scaled:\n", X_train_scaled),
    ("X_test_scaled:\n", X_test_scaled),
    ("Y_train:\n", Y_train),
    ("Y_test:\n", Y_test),
):
    print(label, frame.head())