## DAIE Project Part 1: Data Acquisition and Data Exploration


### 1. Data Acquisition and Data Exploration

#### 1.1. Preparing the tools by importing libraries
- (Add code here to load your dataset.)

In [21]:
# 1.1 Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

#### 1.2. Loading the data
- (Add code here to load your dataset.)

In [22]:
# Load Dataset
data = pd.read_csv('./data/student-merge(uncleaned).csv')

In [23]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,15,U,GT3,A,3,3,other,health,...,4,3,3,1,1,4,10.0,10,10,10
1,GP,F,16,U,LE3,T,2,2,other,at_home,...,4,3,3,2,2,5,14.0,10,11,11
2,MS,F,16,U,GT3,T,1,2,other,services,...,1,3,2,1,2,4,3.0,9,8,8
3,GP,M,17,R,LE3,T,1,1,other,services,...,5,3,5,1,5,5,,8,8,8
4,GP,M,15,U,GT3,T,3,4,services,services,...,5,5,5,3,2,5,0.0,13,13,12


#### 1.3. Clean the data for modeling
- (Add code here to clean and preprocess your dataset.)


In [24]:
# Check for missing values
print(data.isnull().sum())

school                0
sex                   0
age                   0
address             171
famsize               0
Pstatus               0
Medu                  0
Fedu                  0
Mjob                  0
Fjob                  0
reason                0
friendship_scale    967
guardian            172
traveltime            0
studytime             0
failures              0
schoolsup             0
famsup                0
paid                  0
activities            0
nursery               0
higher                0
internet              0
romantic            129
famrel                0
freetime              0
goout                 0
Dalc                  0
Walc                  0
health                0
absences            171
G1                    0
G2                    0
G3                    0
dtype: int64


In [25]:
# Check for missing values in each column
missing_values = data.isnull().sum()

# Display columns with missing values only
missing_values[missing_values > 0]

address             171
friendship_scale    967
guardian            172
romantic            129
absences            171
dtype: int64

In [26]:
data_cleaned = data.copy()
data['address'] = data['address'].fillna(data['address'].mode()[0])
data['guardian'] = data['guardian'].fillna(data['guardian'].mode()[0])
data['romantic'] = data['romantic'].fillna(data['romantic'].mode()[0])
data['absences'] = data['absences'].fillna(data['absences'].median)
data['friendship_scale'] = data['friendship_scale'].fillna(data['friendship_scale'].median)

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   school            1074 non-null   object
 1   sex               1074 non-null   object
 2   age               1074 non-null   int64 
 3   address           1074 non-null   object
 4   famsize           1074 non-null   object
 5   Pstatus           1074 non-null   object
 6   Medu              1074 non-null   int64 
 7   Fedu              1074 non-null   int64 
 8   Mjob              1074 non-null   object
 9   Fjob              1074 non-null   object
 10  reason            1074 non-null   object
 11  friendship_scale  1074 non-null   object
 12  guardian          1074 non-null   object
 13  traveltime        1074 non-null   int64 
 14  studytime         1074 non-null   int64 
 15  failures          1074 non-null   int64 
 16  schoolsup         1074 non-null   object
 17  famsup        

In [28]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,15,U,GT3,A,3,3,other,health,...,4,3,3,1,1,4,10.0,10,10,10
1,GP,F,16,U,LE3,T,2,2,other,at_home,...,4,3,3,2,2,5,14.0,10,11,11
2,MS,F,16,U,GT3,T,1,2,other,services,...,1,3,2,1,2,4,3.0,9,8,8
3,GP,M,17,R,LE3,T,1,1,other,services,...,5,3,5,1,5,5,<bound method Series.median of 0 10.0\n1...,8,8,8
4,GP,M,15,U,GT3,T,3,4,services,services,...,5,5,5,3,2,5,0.0,13,13,12


In [29]:
# Step 2: Encode categorical variables
# Identifying categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Apply One-Hot Encoding to categorical columns
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Define numerical columns to scale (excluding target variable 'G3')
numerical_cols = ['age', 'G1', 'G2']

# Initialize the scaler
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [30]:
data.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,absences_30.0,absences_24.0,absences_17.0,absences_15.0,absences_40.0,absences_20.0,absences_25.0,absences_28.0,absences_23.0,absences_26.0
0,-1.390664,3,3,1,4,0,4,3,3,1,...,False,False,False,False,False,False,False,False,False,False
1,-0.582454,2,2,2,2,1,4,3,3,2,...,False,False,False,False,False,False,False,False,False,False
2,-0.582454,1,2,1,3,1,1,3,2,1,...,False,False,False,False,False,False,False,False,False,False
3,0.225757,1,1,4,2,0,5,3,5,1,...,False,False,False,False,False,False,False,False,False,False
4,-1.390664,3,4,1,1,0,5,5,5,3,...,False,False,False,False,False,False,False,False,False,False


### 2. Modeling

#### 2.1. Train, test, and split
- (Add code here to split the data into training and testing sets.)

In [31]:
# Define the feature matrix X and the target variable y (assuming G3 is the target variable)
target = 'G3'
X = data.drop(columns=[target])  # Drop target variable from feature set
y = data[target]               # Define the target variable

In [32]:
# Split the dataset into training and testing sets with a 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 2.2. Model choices
- (Add code here to define different models.)

In [33]:
# Define the models
linear_reg = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
random_forest = RandomForestRegressor(random_state=42)

# Store the models in a dictionary for easy iteration later
models = {
    'Linear Regression': linear_reg,
    'Decision Tree': decision_tree,
    'Random Forest': random_forest,
}

# Display the model names to confirm
list(models.keys())

['Linear Regression', 'Decision Tree', 'Random Forest']

#### 2.3. Model comparison
- (Add code here to compare the performance of different models.)

In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Dictionary to store model performance results
model_performance = {}

# Select the models to compare
models_to_compare = {
    'Linear Regression': models['Linear Regression'],
    'Decision Tree': models['Decision Tree'],
    'Random Forest': models['Random Forest']
}

# Train and evaluate each selected model
for name, model in models_to_compare.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store the performance in the dictionary
    model_performance[name] = {
        'MAE': mae,
        'MSE': mse,
        'R2 Score': r2
    }

# Convert the performance dictionary to a DataFrame for easy viewing
performance_df = pd.DataFrame(model_performance).T
performance_df

Unnamed: 0,MAE,MSE,R2 Score
Linear Regression,2564128000.0,9.655651e+20,-5.849138e+19
Decision Tree,0.972093,3.167442,0.8081248
Random Forest,0.7688372,1.505353,0.9088097


#### 2.4. Hyperparameter tuning
- (Add code here for tuning hyperparameters of your model.)

In [35]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

rf_model = RandomForestRegressor(
    max_depth=20,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100
)


dt_model = DecisionTreeRegressor(
    max_depth=20,
    min_samples_leaf=4,
    min_samples_split=2
)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


#### 2.5. Cross-validation
- (Add code here to implement cross-validation.)

In [36]:
cross_val_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_squared_error')
mean_cv_score = -cross_val_scores.mean()

print("Cross-Validation MSE (Random Forest):", mean_cv_score)


cross_val_scores = cross_val_score(dt_model, X, y, cv=5, scoring='neg_mean_squared_error')
mean_cv_score = -cross_val_scores.mean()

print("Cross-Validation MSE (Decision Tree):", mean_cv_score)

cross_val_scores = cross_val_score(linear_reg, X, y, cv=5, scoring='neg_mean_squared_error')
mean_cv_score = -cross_val_scores.mean()

print("Cross-Validation MSE (Linear Regression):", mean_cv_score)


Cross-Validation MSE (Random Forest): 2.245927765613893
Cross-Validation MSE (Decision Tree): 2.97613197340517
Cross-Validation MSE (Linear Regression): 5.345079062518983e+22


### 3. Evaluation and Deployment (Depending on Classification or Regression)

#### 3.1. Evaluation
- (Add code here to generate evaluations.)

In [38]:
# Compare the models based on MSE, R², and MAE
models = {
    "Random Forest": {"MSE": mse_M0, "R²": r2_M0, "MAE": mae_M0},
    "Decision Tree": {"MSE": mse_M1, "R²": r2_M1, "MAE": mae_M1},
    "Linear Regression": {"MSE": mse_M2, "R²": r2_M2, "MAE": mae_M2}
}

# Initialize variables to store best models
best_mse_model = None
best_r2_model = None
best_mae_model = None

# Compare for MSE (lower is better)
best_mse_value = float('inf')  # Start with a high value
for model, metrics in models.items():
    if metrics["MSE"] < best_mse_value:
        best_mse_value = metrics["MSE"]
        best_mse_model = model

# Compare for R² (higher is better)
best_r2_value = -float('inf')  # Start with a low value
for model, metrics in models.items():
    if metrics["R²"] > best_r2_value:
        best_r2_value = metrics["R²"]
        best_r2_model = model

# Compare for MAE (lower is better)
best_mae_value = float('inf')  # Start with a high value
for model, metrics in models.items():
    if metrics["MAE"] < best_mae_value:
        best_mae_value = metrics["MAE"]
        best_mae_model = model

# Output results
print(f"Best model by MSE: {best_mse_model}")
print(f"Best model by R²: {best_r2_model}")
print(f"Best model by MAE: {best_mae_model}")

Best model by MSE: Random Forest
Best model by R²: Random Forest
Best model by MAE: Random Forest
