## Import necessary libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the matplotlib figures drawn later on
sb.set()

## Correcting the column delimiter of the dataset

In [2]:
# In the original CSV file given for the dataset, all attributes were written
# in one column separated by the delimiter ';'.
# Purpose of this cell: rewrite the CSV with the default ',' delimiter so that
# each attribute has its own column when the file is loaded.

import csv

# newline='' hands line-ending handling to the csv module, which keeps quoted
# fields that contain line breaks intact.
with open('student-por.csv', 'r', newline='') as source:
    reader = csv.reader(source, delimiter=';')

    # The first row holds the attribute names. Fail with a clear message
    # instead of a bare StopIteration when the file is empty.
    header = next(reader, None)
    if header is None:
        raise ValueError("'student-por.csv' is empty: no header row found")

    # Only create the output file once the input is known to have a header.
    with open('aligned_student_data.csv', 'w', newline='') as target:
        writer = csv.writer(target)
        writer.writerow(header)

        # Stream the remaining records straight through. Each record is
        # written exactly as it was read, so its values stay together even if
        # a record has missing or extra fields, and the whole file never has
        # to be held in memory.
        writer.writerows(reader)

## Import the Dataset

In [3]:
# Load the comma-delimited copy of the dataset into a DataFrame and display it
aligned_csv_path = 'aligned_student_data.csv'
studentData = pd.read_csv(aligned_csv_path)
studentData

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


## Filter out irrelevant columns

In [4]:
# Columns to keep; filter() returns them in this order and silently skips any
# label that is not present in the frame.
columns_to_keep = [
    "famsize", "Pstatus", "Medu", "Fedu", "Mjob", "Fjob",
    "famsup", "famrel", "G3",
]
studentData = studentData.filter(items=columns_to_keep, axis="columns")

# Preview the first rows of the reduced frame
studentData.head()

Unnamed: 0,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,famsup,famrel,G3
0,GT3,A,4,4,at_home,teacher,no,4,11
1,GT3,T,1,1,at_home,other,yes,5,11
2,LE3,T,1,1,at_home,other,no,4,12
3,GT3,T,4,2,health,services,yes,3,14
4,GT3,T,3,3,other,other,yes,4,13


In [5]:
# Summarise the filtered frame: column names, non-null counts and dtypes
studentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   famsize  649 non-null    object
 1   Pstatus  649 non-null    object
 2   Medu     649 non-null    int64 
 3   Fedu     649 non-null    int64 
 4   Mjob     649 non-null    object
 5   Fjob     649 non-null    object
 6   famsup   649 non-null    object
 7   famrel   649 non-null    int64 
 8   G3       649 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 45.8+ KB
