# Data Preparation


In [5]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os

In [6]:
# Load environment variables from .env file
load_dotenv()

# Get directory paths from environment variables.
# The notebooks directory used to be looked up under the misspelled key
# 'NOOTEBOOKS_DIR_PATH'. The correctly spelled key is tried first and the old
# spelling is kept as a fallback so an existing .env file keeps working.
NOTEBOOKS_DIR_PATH = os.getenv('NOTEBOOKS_DIR_PATH',
                               os.getenv('NOOTEBOOKS_DIR_PATH'))
DATASETS_DIR_PATH = os.getenv('DATASETS_DIR_PATH')
REPORTS_DIR_PATH = os.getenv('REPORTS_DIR_PATH')

# Fail early with an actionable message: without this check a missing variable
# surfaces later as an opaque TypeError raised from os.path.join.
if DATASETS_DIR_PATH is None:
    raise RuntimeError(
        "Environment variable 'DATASETS_DIR_PATH' is not set; "
        "define it in the environment or in the .env file."
    )

# Define file paths (relative to the parent of the current working directory)
raw_data_path = os.path.join('..', DATASETS_DIR_PATH, 'student-data.csv')
processed_data_path = os.path.join('..', DATASETS_DIR_PATH, 'processed_data.csv')

# Read the raw data and preview the first rows
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


## Data Description

The dataset contains student achievement data in secondary education from two Portuguese schools. It includes demographic, social, and school-related features.

- **Number of Instances**: 395
- **Number of Attributes**: 31 (including the target variable)
- **Missing Values**: None

**Target Variable**:

- `passed`: Did the student pass the final exam? (binary: yes or no)


In [7]:
# Function to convert non-numeric data to numeric
def numerical_data(df):
    """Encode the categorical columns of the student dataset as integers.

    Every column listed in the mapping table below is replaced by integer
    codes. The input frame is not modified; the encoding is applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named in the mapping table.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the mapped columns hold their integer
        codes. Values that were already missing remain missing.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``df``.
    ValueError
        If a column contains a non-missing value that its mapping does not
        cover. ``Series.map`` would otherwise turn such values into NaN
        silently, which is what happens when already-encoded data is passed
        in a second time.
    """
    yes_no = {'no': 0, 'yes': 1}
    job = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}

    # Column name -> category-to-code mapping.
    mappings = {
        'school': {'GP': 0, 'MS': 1},
        'sex': {'M': 0, 'F': 1},
        'address': {'U': 0, 'R': 1},
        'famsize': {'LE3': 0, 'GT3': 1},
        'Pstatus': {'T': 0, 'A': 1},
        'Mjob': job,
        'Fjob': job,
        'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
        'guardian': {'mother': 0, 'father': 1, 'other': 2},
        'schoolsup': yes_no,
        'famsup': yes_no,
        'paid': yes_no,
        'activities': yes_no,
        'nursery': yes_no,
        'higher': yes_no,
        'internet': yes_no,
        'romantic': yes_no,
        'passed': yes_no,
    }

    # Work on a copy so the caller's frame is never altered.
    encoded = df.copy()
    for column, mapping in mappings.items():
        original = encoded[column]
        mapped = original.map(mapping)

        # NaN that appears only after mapping marks a value with no code.
        unmapped = original[mapped.isna() & original.notna()]
        if not unmapped.empty:
            unknown = sorted(repr(value) for value in unmapped.unique())
            raise ValueError(
                f"Column '{column}' contains values without a numeric "
                f"mapping: {', '.join(unknown)}"
            )
        encoded[column] = mapped
    return encoded


In [8]:
# Encode the categorical columns by piping the frame through numerical_data
df = df.pipe(numerical_data)

In [9]:
# Function for feature scaling
def feature_scaling(df):
    """Min-max scale every numeric column into the range [0, 1].

    Each numeric column is transformed as ``(x - min) / (max - min)``.
    Non-numeric columns are passed through unchanged. The input frame is not
    modified; scaling is applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be rescaled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the numeric columns rescaled. A column whose
        values are all identical is mapped to 0.0 instead of NaN.
    """
    # Work on a copy so the caller's frame is never altered.
    scaled = df.copy()
    numeric_columns = scaled.select_dtypes(include=[np.number]).columns.tolist()
    for column in numeric_columns:
        col = scaled[column]
        lowest = col.min()
        span = col.max() - lowest
        # A constant column has zero range; dividing by it would give 0/0 = NaN
        # for every row. Dividing by 1 keeps such columns at 0.0 and leaves
        # pre-existing missing values as NaN.
        if span == 0:
            span = 1
        scaled[column] = (col - lowest) / span
    return scaled

In [10]:
# Rescale the numeric columns by piping the frame through feature_scaling
df = df.pipe(feature_scaling)

In [11]:
# Preview the first five rows of the processed DataFrame
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,0.0,1.0,0.428571,0.0,1.0,1.0,1.0,1.0,0.75,0.0,...,0.0,0.0,0.75,0.5,0.75,0.0,0.0,0.5,0.08,0.0
1,0.0,1.0,0.285714,0.0,1.0,0.0,0.25,0.25,0.75,1.0,...,1.0,0.0,1.0,0.5,0.5,0.0,0.0,0.5,0.053333,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.25,0.25,0.75,1.0,...,1.0,0.0,0.75,0.5,0.25,0.25,0.5,0.5,0.133333,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.5,0.25,0.5,...,1.0,1.0,0.5,0.25,0.25,0.0,0.0,1.0,0.026667,1.0
4,0.0,1.0,0.142857,0.0,1.0,0.0,0.75,0.75,1.0,1.0,...,0.0,0.0,0.75,0.5,0.25,0.0,0.25,1.0,0.053333,1.0


## Save Processed Data

We will save the processed data to a CSV file for use in subsequent notebooks.


In [12]:
# Write the processed DataFrame to CSV (row index omitted) and report the path
df.to_csv(path_or_buf=processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")

Processed data saved to ..\datasets\processed_data.csv
