In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the labelled training set from disk.
train = pd.read_csv("train.csv")

# Feature columns used as model inputs; 'Age' is the value being predicted.
cols2use = [
    'Length',
    'Height',
    'Weight',
    'Diameter',
    'Shucked Weight',
    'Viscera Weight',
    'Shell Weight',
]
X, y = train[cols2use], train['Age']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:

# Seed shared by every stochastic estimator in this cell so that the printed
# scores (and the model picked by the grid search) are repeatable across
# Restart & Run All. Matches the seed used for the train/test split.
SEED = 42


def fit_and_report(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, score it on the evaluation split and print the MAE.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    label : str
        Text printed in front of the score, e.g. ``'Ridge Regression MAE'``.
    X_fit, y_fit : features and target used for fitting.
    X_eval, y_eval : features and target used for scoring.

    Returns
    -------
    tuple
        ``(predictions, mae)``: the predictions for ``X_eval`` and their mean
        absolute error against ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    mae = mean_absolute_error(y_eval, predictions)
    print(f'{label}: {mae}')
    return predictions, mae


# Linear Regression
linear_model = LinearRegression()
y_pred_linear, mae_linear = fit_and_report(
    linear_model, 'Linear Regression MAE', X_train, y_train, X_test, y_test
)

# Ridge Regression
ridge_model = Ridge()
y_pred_ridge, mae_ridge = fit_and_report(
    ridge_model, 'Ridge Regression MAE', X_train, y_train, X_test, y_test
)

# Lasso Regression
lasso_model = Lasso()
y_pred_lasso, mae_lasso = fit_and_report(
    lasso_model, 'Lasso Regression MAE', X_train, y_train, X_test, y_test
)

# Tree-Based Models: Random Forest
# random_state pins the bootstrap samples and feature sub-sampling.
rf_model = RandomForestRegressor(random_state=SEED)
y_pred_rf, mae_rf = fit_and_report(
    rf_model, 'Random Forest MAE', X_train, y_train, X_test, y_test
)

# Gradient Boosting
gb_model = GradientBoostingRegressor(random_state=SEED)
y_pred_gb, mae_gb = fit_and_report(
    gb_model, 'Gradient Boosting MAE', X_train, y_train, X_test, y_test
)

# Neural network (MLP) on standardised features.
# The scaler is fitted on the training split only and then applied to the
# evaluation split, so no statistics from the held-out rows reach the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# random_state pins the weight initialisation and batch shuffling.
mlp_model = MLPRegressor(hidden_layer_sizes=(500, 10), max_iter=1000, random_state=SEED)
y_pred_mlp, mae_mlp = fit_and_report(
    mlp_model, 'StandardScaler (MLPRegressor) MAE',
    X_train_scaled, y_train, X_test_scaled, y_test
)


# Hyperparameter Tuning using GridSearchCV (Example with Random Forest)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# A seeded base estimator makes the cross-validated ranking of the candidates
# repeatable. Scoring is negated MAE because GridSearchCV maximises its score.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=SEED),
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)

# Evaluate the best candidate on the held-out split.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)
print(f'Best Random Forest MAE after Grid Search: {mae_best_rf}')


Linear Regression MAE: 1.3898479708860119
Ridge Regression MAE: 1.3891756017621029
Lasso Regression MAE: 1.6348103867676258
Random Forest MAE: 1.392796523809524
Gradient Boosting MAE: 1.35001728862648
StandardScaler (MLPRegressor) MAE: 1.3142534854021763
Best Random Forest MAE after Grid Search: 1.3402113719723412


In [4]:
# Load the rows to predict on and the sample submission file.
test, subm = (
    pd.read_csv(csv_name) for csv_name in ("test.csv", "sample_submission.csv")
)

In [5]:
# Keep only the feature columns the models were trained on, then predict with
# the fitted grid-search object.
test_data = test.loc[:, cols2use]
yy_pred = grid_search.predict(test_data)
yy_pred

array([ 6.56536141,  9.09176775,  5.40128461, ..., 10.91293237,
        9.53105076,  7.79015334])

In [6]:
# Put the predictions into the submission frame's 'Age' column and display it.
subm = subm.assign(Age=yy_pred)
subm

Unnamed: 0,id,Age
0,15000,6.565361
1,15001,9.091768
2,15002,5.401285
3,15003,8.732391
4,15004,6.637547
...,...,...
9995,24995,8.921372
9996,24996,8.306004
9997,24997,10.912932
9998,24998,9.531051
