In [170]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from time import perf_counter
from contextlib import contextmanager

def load_and_preprocess_data(csv_path='student_data.csv', target_column='G3',
                             train_size=0.8, random_state=271):
    """Load the dataset, one-hot encode it and split it into train/test sets.

    Rows containing missing values and exact duplicate rows (first occurrence
    kept) are removed before encoding.

    Args:
        csv_path: Path of the CSV file to read.
        target_column: Name of the column to predict; it is removed from the
            feature matrix.
        train_size: Size of the training set, forwarded to
            ``train_test_split``.
        random_state: Seed for the shuffled split, forwarded to
            ``train_test_split``.

    Returns:
        The ``train_test_split`` result, in the order
        ``X_train, X_test, y_train, y_test``.

    Raises:
        KeyError: If ``target_column`` is not a column of the encoded frame.
    """
    df = pd.read_csv(csv_path).dropna().drop_duplicates(keep="first")

    # With ``columns`` left unset, get_dummies itself encodes every
    # object/string/category column, so text columns are not skipped when
    # pandas stores them with a dedicated string dtype instead of ``object``.
    df = pd.get_dummies(df, drop_first=True)

    y = df[target_column]
    X = df.drop(target_column, axis=1)
    return train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

@contextmanager
def timing():
    """Context manager that prints the wall-clock time spent in its ``with`` body.

    Yields:
        A zero-argument callable returning the seconds elapsed since the
        block was entered.

    The ``Time: ...`` line is printed on exit even when the body raises; the
    exception then propagates unchanged.
    """
    start_time = perf_counter()
    try:
        yield lambda: perf_counter() - start_time
    finally:
        # Report the duration on both normal and exceptional exit.
        print(f'Time: {perf_counter() - start_time:.3f} seconds')

def train_random_forest_regressor(X_train, Y_train):
    """Fit a random forest regressor with fixed hyperparameters.

    Args:
        X_train: Training features passed to ``fit``.
        Y_train: Training targets passed to ``fit``.

    Returns:
        The value returned by ``RandomForestRegressor.fit``.
    """
    hyperparameters = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 10,
        'random_state': 271,
    }
    forest = RandomForestRegressor(**hyperparameters)
    return forest.fit(X_train, Y_train)

def evaluate_model_predictions(model, X, Y, dataset_name):
    """Print and return the R2 score of ``model`` on one dataset.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        Y: True target values compared against the predictions.
        dataset_name: Label inserted into the printed line (e.g. ``'train'``).

    Returns:
        The value computed by ``r2_score(Y, predictions)``, so callers can
        compare or store it instead of parsing the printed output.
    """
    predictions = model.predict(X)
    r2_result = r2_score(Y, predictions)
    print(f'R2 {dataset_name}: {r2_result}')
    return r2_result

def main():
    """Run the random forest experiment end to end.

    Loads and splits the data, fits the model inside the ``timing`` context,
    then reports the score on the training split followed by the test split.
    """
    features_train, features_test, target_train, target_test = load_and_preprocess_data()

    with timing():
        forest_model = train_random_forest_regressor(features_train, target_train)

    # Same evaluation for both splits; training split first.
    splits = (
        (features_train, target_train, 'train'),
        (features_test, target_test, 'test'),
    )
    for features, target, label in splits:
        evaluate_model_predictions(forest_model, features, target, label)

# Entry point: run the random forest pipeline when this code is executed
# directly. The recorded output under this cell shows the guard also holds
# when the cell is run in the notebook.
if __name__ == "__main__":
    main()


Time: 0.299 seconds
R2 train: 0.951781674694076
R2 test: 0.8321989640495713


In [185]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from time import perf_counter
from contextlib import contextmanager

def load_and_preprocess_data(csv_path='student_data.csv', target_column='G3',
                             train_size=0.8, random_state=271):
    """Load the dataset, one-hot encode it and split it into train/test sets.

    Rows containing missing values and exact duplicate rows (first occurrence
    kept) are removed before encoding.

    Args:
        csv_path: Path of the CSV file to read.
        target_column: Name of the column to predict; it is removed from the
            feature matrix.
        train_size: Size of the training set, forwarded to
            ``train_test_split``.
        random_state: Seed for the shuffled split, forwarded to
            ``train_test_split``.

    Returns:
        The ``train_test_split`` result, in the order
        ``X_train, X_test, y_train, y_test``.

    Raises:
        KeyError: If ``target_column`` is not a column of the encoded frame.
    """
    df = pd.read_csv(csv_path).dropna().drop_duplicates(keep="first")

    # With ``columns`` left unset, get_dummies itself encodes every
    # object/string/category column, so text columns are not skipped when
    # pandas stores them with a dedicated string dtype instead of ``object``.
    df = pd.get_dummies(df, drop_first=True)

    y = df[target_column]
    X = df.drop(target_column, axis=1)
    return train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

@contextmanager
def timing():
    """Context manager that prints the wall-clock time spent in its ``with`` body.

    Yields:
        A zero-argument callable returning the seconds elapsed since the
        block was entered.

    The ``Time: ...`` line is printed on exit even when the body raises; the
    exception then propagates unchanged.
    """
    start_time = perf_counter()
    try:
        yield lambda: perf_counter() - start_time
    finally:
        # Report the duration on both normal and exceptional exit.
        print(f'Time: {perf_counter() - start_time:.3f} seconds')

def train_gradient_boosting_regressor(X_train, Y_train):
    """Fit a gradient boosting regressor with fixed hyperparameters.

    Args:
        X_train: Training features passed to ``fit``.
        Y_train: Training targets passed to ``fit``.

    Returns:
        The value returned by ``GradientBoostingRegressor.fit``.
    """
    booster = GradientBoostingRegressor(
        n_estimators=5000,
        learning_rate=0.01,
        max_depth=10,
        min_samples_split=10,
        random_state=271,
    )
    return booster.fit(X_train, Y_train)

def evaluate_model_predictions(model, X, Y, dataset_name):
    """Print and return the R2 score of ``model`` on one dataset.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        Y: True target values compared against the predictions.
        dataset_name: Label inserted into the printed line (e.g. ``'train'``).

    Returns:
        The value computed by ``r2_score(Y, predictions)``, so callers can
        compare or store it instead of parsing the printed output.
    """
    predictions = model.predict(X)
    r2_result = r2_score(Y, predictions)
    print(f'R2 {dataset_name}: {r2_result}')
    return r2_result

def main():
    """Run the gradient boosting experiment end to end.

    Loads and splits the data, fits the model inside the ``timing`` context,
    then reports the score on the training split followed by the test split.
    """
    features_train, features_test, target_train, target_test = load_and_preprocess_data()

    with timing():
        boosted_model = train_gradient_boosting_regressor(features_train, target_train)

    # Same evaluation for both splits; training split first.
    splits = (
        (features_train, target_train, 'train'),
        (features_test, target_test, 'test'),
    )
    for features, target, label in splits:
        evaluate_model_predictions(boosted_model, features, target, label)

# Entry point: run the gradient boosting pipeline when this code is executed
# directly. The recorded output under this cell shows the guard also holds
# when the cell is run in the notebook.
if __name__ == "__main__":
    main()


Time: 9.172 seconds
R2 train: 1.0
R2 test: 0.8355770090300072
