In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
file_path = '../student-mat.csv'
data = pd.read_csv(file_path, delimiter=';')


In [2]:
# Binary classification problem 1: Predicting high/low grade based on median G3 value
median_g3 = data['G3'].median()
data['high_performance'] = (data['G3'] > median_g3).astype(int)

X = data.drop('high_performance', axis=1)
y = data['high_performance']

# Split dataset into training set and validation set (adjust test_size and random_state as needed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('train_data_problem1.csv', index=False)

# Save validation set
val_data = pd.concat([X_val, y_val], axis=1)
val_data.to_csv('val_data_problem1.csv', index=False)

Training set shape: (355, 33) (355,)
Validation set shape: (40, 33) (40,)


In [12]:
data = pd.read_csv(file_path, delimiter=';')

# One-hot encoding for categorical variables except for our binary targets
features = pd.get_dummies(data.drop(['G3', 'school', 'internet'], axis=1))
# Binary classification problem 2: Predicting school (already binary, just encode directly)
data['school_binary'] = (data['school'] == 'GP').astype(int)

X = data.drop('school_binary', axis=1)
y = data['school_binary']

# Split dataset into training set and validation set (adjust test_size and random_state as needed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('train_data_problem2.csv', index=False)

# Save validation set
val_data = pd.concat([X_val, y_val], axis=1)
val_data.to_csv('val_data_problem2.csv', index=False)

Training set shape: (355, 33) (355,)
Validation set shape: (40, 33) (40,)


In [13]:
 # Binary classification problem 3: Predicting internet access
data['internet_binary'] = (data['internet'] == 'yes').astype(int)
# One-hot encoding for categorical variables except for our binary targets
features = pd.get_dummies(data.drop(['G3', 'school', 'internet'], axis=1))


X = data.drop('internet_binary', axis=1)
y = data['internet_binary']

# Split dataset into training set and validation set (adjust test_size and random_state as needed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('train_data_problem3.csv', index=False)

# Save validation set
val_data = pd.concat([X_val, y_val], axis=1)
val_data.to_csv('val_data_problem3.csv', index=False)

Training set shape: (355, 34) (355,)
Validation set shape: (40, 34) (40,)


In [4]:
# Ridge Regression

import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# Load the dataset
file_path = './train_data_problem2.csv'
data = pd.read_csv(file_path, delimiter=',')
print(data.head())
# # One-hot encoding for categorical variables except for our binary targets
# features = pd.get_dummies(data.drop(['G3', 'school', 'internet'], axis=1))

X = data.drop('school_binary', axis=1)
y = data['school_binary']

# Identifying categorical columns for one-hot encoding
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)

# Creating the Ridge regression model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge_regression', Ridge())
])

# Setting up parameter grid for hyperparameter tuning
param_grid = {'ridge_regression__alpha': [0.01, 0.1, 1, 10, 100]}  # alpha is equivalent to λ

# Setting up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

  school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0     GP   F   15       R     GT3       T     1     1     other     other   
1     GP   F   15       U     GT3       T     1     1     other  services   
2     GP   F   17       U     GT3       T     4     4   teacher   teacher   
3     GP   M   16       U     LE3       T     2     2  services  services   
4     GP   M   16       R     GT3       T     2     2   at_home     other   

   ... freetime goout  Dalc  Walc  health absences  G1  G2  G3 school_binary  
0  ...        3     4     2     4       5        2   8   6   5             1  
1  ...        4     2     1     2       5        0   8  11  11             1  
2  ...        3     3     1     2       4        4  14  14  14             1  
3  ...        3     3     2     2       2        8   9   9   9             1  
4  ...        2     2     1     2       3        2  17  15  15             1  

[5 rows x 34 columns]
Best parameters: {'ridge_regression__alp

In [5]:
# SVR
from sklearn.svm import SVR  # Using SVR for regression

# Load the dataset with the correct delimiter
data = pd.read_csv('./train_data_problem2.csv', delimiter=',')


# Separate the features and the target variable
X = data.drop('school_binary', axis=1)
y = data['school_binary']

# Identifying categorical columns for one-hot encoding
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)

# Creating the SVM model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', SVR(C=1.0))
])

# Setting up parameter grid for hyperparameter tuning (C values to test)
param_grid = {'svm__C': [0.1, 1, 10, 100, 1000]}

# Setting up GridSearchCV for 5-fold cross-validation and hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'svm__C': 100}
Best cross-validation score: -0.009191198986158142


In [6]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Load the dataset
data = pd.read_csv('./train_data_problem2.csv', delimiter=',')


# Separate the features and the target variable
X = data.drop('school_binary', axis=1)
y = data['school_binary']


# Identifying categorical columns for one-hot encoding
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)

# Creating the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor())
])

# Define the parameter grid to test different hyperparameter values
param_grid = {
    'random_forest__n_estimators': [10, 50, 100, 200, 500]
}

# Setting Up GridSearchCV to find the best model configuration using 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'random_forest__n_estimators': 10}
Best cross-validation score: 0.0


In [7]:
# KNN
from sklearn.neighbors import KNeighborsRegressor 

# Load the dataset
data = pd.read_csv('./train_data_problem2.csv', delimiter=',')

X = data.drop('school_binary', axis=1)
y = data['school_binary']

# Identifying categorical columns for one-hot encoding
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)

# Creating a modeling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor())
])

# Define the parameter grid to test different values for n_neighbors
param_grid = {'knn__n_neighbors': [1, 3, 5, 7, 9]}

# Setting up GridSearchCV for 5-fold cross-validation and hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'knn__n_neighbors': 5}
Best cross-validation score: -0.07549295774647888


In [10]:
# 
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load the dataset
data = pd.read_csv('./train_data_problem2.csv', delimiter=',')

X = data.drop('school_binary', axis=1)
y = data['school_binary']


# Identifying categorical columns and numerical columns for preprocessing
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
numerical_columns = X.select_dtypes(include=['int', 'float']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('scaler', StandardScaler(), numerical_columns)
    ],
    remainder='passthrough'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ann', MLPRegressor(random_state=42, max_iter=1000, verbose=True, early_stopping=True))
])

# Define the parameter grid to test different learning rates
param_grid = {
    'ann__learning_rate_init': [0.1, 0.01, 0.001]
}

# Setting up GridSearchCV for 5-fold cross-validation and hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
grid_search.fit(X, y)

# Printing the best cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Iteration 1, loss = 33.24129597
Validation score: -119.769549
Iteration 2, loss = 5.88594222
Validation score: -6.071318
Iteration 3, loss = 0.60612405
Validation score: -7.999409
Iteration 4, loss = 0.71396912
Validation score: -8.544018
Iteration 5, loss = 0.75300629
Validation score: -8.829304
Iteration 6, loss = 0.77191917
Validation score: -8.629606
Iteration 7, loss = 0.78613807
Validation score: -8.306687
Iteration 8, loss = 0.75936833
Validation score: -7.927530
Iteration 9, loss = 0.71556695
Validation score: -7.263875
Iteration 10, loss = 0.65013388
Validation score: -5.939352
Iteration 11, loss = 0.54195711
Validation score: -3.724543
Iteration 12, loss = 0.38393698
Validation score: -2.220179
Iteration 13, loss = 0.25707614
Validation score: -3.165592
Iteration 14, loss = 0.23665719
Validation score: -0.571419
Iteration 15, loss = 0.13506934
Validation score: 0.008036
Iteration 16, loss = 0.07574562
Validation score: -0.073360
Iteration 17, loss = 0.05188290
Validation scor

In [9]:
# XG Boost
from xgboost import XGBRegressor

# Load the dataset
data = pd.read_csv('./train_data_problem2.csv', delimiter=',')

X = data.drop('school_binary', axis=1)
y = data['school_binary']

# Identifying categorical and numerical columns for preprocessing
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
numerical_columns = X.select_dtypes(include=['int', 'float']).columns

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('scaler', StandardScaler(), numerical_columns)
    ],
    remainder='passthrough'
)

# Creating a modeling pipeline with XGBoost
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42))
])

# If you want to fine-tune hyperparameters, define a parameter grid
param_grid = {
    'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
}

# Setting up GridSearchCV for hyperparameter tuning (optional)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
grid_search.fit(X, y)

# Printing the best parameters and cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'xgb__learning_rate': 0.4}
Best cross-validation score: -1.6095606377774851e-09
