## Import necessary libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Apply seaborn's default theme so that plots created later in the notebook use its styling
sb.set()

## Fixing the column layout of the dataset CSV

In [4]:
# In the original CSV file given for the dataset, all attributes were written in one column separated by the delimiter ';'
# Purpose of this cell: Rewrite the CSV such that each attribute has its own column
# Safe to re-run: a file that has already been converted is detected and left untouched

import csv

csv_path = 'student-por.csv'

# Read every non-empty row, splitting on ';'.
# newline='' is what the csv module expects so it can handle line endings itself.
with open(csv_path, 'r', newline='') as file:
    rows = [row for row in csv.reader(file, delimiter=';') if row]

# The first row holds the attribute names
header = rows[0] if rows else []

# Once converted, the file is comma-delimited, so splitting on ';' yields a single
# field that still contains the commas. Rewriting in that state would quote every
# whole line into one column and corrupt the file, so skip the write instead.
already_converted = len(header) == 1 and ',' in header[0]

if rows and not already_converted:
    # Write the rows back with the csv module's default ',' delimiter.
    # Rows are written as read, so no per-column buffering (and no index
    # errors on rows whose length differs from the header) is involved.
    with open(csv_path, 'w', newline='') as file:
        csv.writer(file).writerows(rows)

## Helper functions

In [28]:
# Determine if a student performed well
def performed_well(value, threshold=15):
    """Flag whether a grade counts as a good performance.

    Parameters
    ----------
    value : number
        Grade to evaluate (the notebook applies this to the 'G3' column).
    threshold : number, default 15
        Minimum grade, inclusive, that counts as performing well.

    Returns
    -------
    int
        1 if ``value >= threshold``, otherwise 0.
    """
    return 1 if value >= threshold else 0

# Encode the parents' cohabitation status as a binary flag
def cohabitation_status(value):
    """Return 1 when ``value`` equals 'T', and 0 for any other value.

    NOTE(review): 'T' presumably means "together" in the 'Pstatus' column;
    confirm against the dataset description.
    """
    living_together = value == 'T'
    return 1 if living_together else 0
    
# Encode a parent's job as a binary employment flag
def employed(value):
    """Return 0 when ``value`` is exactly "at_home", and 1 for any other value."""
    if value != "at_home":
        return 1
    return 0

## Appending necessary attributes

In [29]:
# Load the CSV and derive the binary indicator columns in a single chain.
# Each new column is computed from one existing column with a helper function:
#   PerformedWell          <- performed_well applied to 'G3'
#   ParentsLivingTogether  <- cohabitation_status applied to 'Pstatus'
#   M_employed, F_employed <- employed applied to 'Mjob' and 'Fjob' respectively
df = pd.read_csv("student-por.csv").assign(
    PerformedWell=lambda frame: frame['G3'].apply(performed_well),
    ParentsLivingTogether=lambda frame: frame['Pstatus'].apply(cohabitation_status),
    M_employed=lambda frame: frame['Mjob'].apply(employed),
    F_employed=lambda frame: frame['Fjob'].apply(employed),
)

# Write the frame, including the new columns, back to the same CSV file (without the index)
df.to_csv("student-por.csv", index=False)

## Import the Dataset

In [30]:
# Load the CSV (written back with the derived columns by the previous cell) into studentData
studentData = pd.read_csv('student-por.csv')
# Bare last expression so the notebook renders the DataFrame as the cell output
studentData

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,Walc,health,absences,G1,G2,G3,PerformedWell,ParentsLivingTogether,M_employed,F_employed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,1,3,4,0,11,11,0,0,0,1
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,1,3,2,9,11,11,0,1,0,1
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,3,6,12,13,12,0,1,0,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,1,5,0,14,14,14,0,1,1,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,2,5,0,11,13,13,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,2,5,4,10,11,10,0,1,1,1
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,1,1,4,15,15,16,1,1,1,1
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,5,6,11,12,9,0,1,1,1
647,MS,M,17,U,LE3,T,3,1,services,services,...,4,2,6,10,10,10,0,1,1,1


## Filter out irrelevant columns

In [31]:
# Keep only the columns listed below; every other column is dropped from studentData
columns_to_keep = [
    "PerformedWell",
    "famsize",
    "ParentsLivingTogether",
    "Medu",
    "Fedu",
    "M_employed",
    "F_employed",
    "guardian",
    "schoolsup",
    "famsup",
    "paid",
]
studentData = studentData.filter(items=columns_to_keep, axis=1)
studentData

Unnamed: 0,PerformedWell,famsize,ParentsLivingTogether,Medu,Fedu,M_employed,F_employed,guardian,schoolsup,famsup,paid
0,0,GT3,0,4,4,0,1,mother,yes,no,no
1,0,GT3,1,1,1,0,1,father,no,yes,no
2,0,LE3,1,1,1,0,1,mother,yes,no,no
3,0,GT3,1,4,2,1,1,mother,no,yes,no
4,0,GT3,1,3,3,1,1,father,no,yes,no
...,...,...,...,...,...,...,...,...,...,...,...
644,0,GT3,1,2,3,1,1,mother,no,no,no
645,1,LE3,1,3,1,1,1,mother,no,yes,no
646,0,GT3,1,1,1,1,1,mother,no,no,no
647,0,LE3,1,3,1,1,1,mother,no,no,no


In [32]:
# Print a summary of the filtered frame: column names, non-null counts and dtypes
studentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   PerformedWell          649 non-null    int64 
 1   famsize                649 non-null    object
 2   ParentsLivingTogether  649 non-null    int64 
 3   Medu                   649 non-null    int64 
 4   Fedu                   649 non-null    int64 
 5   M_employed             649 non-null    int64 
 6   F_employed             649 non-null    int64 
 7   guardian               649 non-null    object
 8   schoolsup              649 non-null    object
 9   famsup                 649 non-null    object
 10  paid                   649 non-null    object
dtypes: int64(6), object(5)
memory usage: 55.9+ KB
