In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the dataset
test_data = pd.read_csv('/Users/kamayanirai/Downloads/test.csv (1).zip')
train_data = pd.read_csv('/Users/kamayanirai/Downloads/train.csv (1).zip')

In [49]:
train_data.isnull().sum()

id          0
cont1       0
cont2       0
cont3       0
cont4       0
           ..
cat116_U    0
cat116_V    0
cat116_W    0
cat116_X    0
cat116_Y    0
Length: 1192, dtype: int64

In [51]:
test_data.shape

(125546, 1192)

In [52]:
train_data.shape

(188318, 1192)

In [50]:
test_data.isnull().sum()

id          0
cont1       0
cont2       0
cont3       0
cont4       0
           ..
cat116_U    0
cat116_V    0
cat116_W    0
cat116_X    0
cat116_Y    0
Length: 1192, dtype: int64

In [4]:
# Prepare data
X = train_data.drop(['loss', 'id'], axis=1)
y = train_data['loss']

In [5]:
train_data.dtypes

id          int64
cat1       object
cat2       object
cat3       object
cat4       object
           ...   
cont11    float64
cont12    float64
cont13    float64
cont14    float64
loss      float64
Length: 132, dtype: object

In [6]:
test_data.dtypes

id          int64
cat1       object
cat2       object
cat3       object
cat4       object
           ...   
cont10    float64
cont11    float64
cont12    float64
cont13    float64
cont14    float64
Length: 131, dtype: object

In [7]:
# Get all columns with dtype 'object'
object_columns = train_data.select_dtypes(include=['object']).columns
print("Columns with object dtype:", object_columns)

Columns with object dtype: Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10',
       ...
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116'],
      dtype='object', length=116)


In [8]:
# Combine train and test data for consistent one-hot encoding
combined_data = pd.concat([train_data, test_data], keys=[0, 1])

# List of categorical columns for one-hot encoding
categorical_features = [col for col in combined_data.columns if "cat" in col]

# Perform one-hot encoding
combined_data = pd.get_dummies(combined_data, columns=categorical_features)

In [9]:
# Separate combined data back into train and test sets
train_data = combined_data.xs(0)
test_data = combined_data.xs(1)

# Extract target variable and drop unnecessary columns
y = train_data['loss']
X = train_data.drop(['loss', 'id'], axis=1)

In [None]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for DecisionTreeRegressor using RandomizedSearchCV
param_dist_dt = {
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
   # 'min_samples_leaf': [2, 4],
   # 'criterion': ['squared_error']
}
random_search_dt = RandomizedSearchCV(DecisionTreeRegressor(random_state=42), param_dist_dt,n_iter=5, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search_dt.fit(X_train, y_train)

# Best parameters and model for DecisionTreeRegressor
print(f"Best parameters for Decision Tree: {random_search_dt.best_params_}")
best_dt = random_search_dt.best_estimator_

# Evaluate the best DecisionTreeRegressor model
dt_predict_Train = best_dt.predict(X_train)
rmse_train_dt = np.sqrt(mean_squared_error(y_train, dt_predict_Train))
print("RMSE (training) for Decision Tree: {0:10f}".format(rmse_train_dt))

dt_predict_Test = best_dt.predict(X_val)
rmse_test_dt = np.sqrt(mean_squared_error(y_val, dt_predict_Test))
print("RMSE (Test Data) for Decision Tree: {0:10f}".format(rmse_test_dt))



Best parameters for Decision Tree: {'min_samples_split': 10, 'max_depth': 10}
RMSE (training) for Decision Tree: 1991.962836
RMSE (Test Data) for Decision Tree: 2073.090850


In [12]:
X_test = test_data.drop(['id'], axis=1)  # Drop the 'id' column from the test data
X_test = X_test[X_train.columns]  # Align columns of X_test with X_train columns

# Make predictions for best models on test data
dt_predict_Test = best_dt.predict(X_test)

# Save predictions for the model to CSV files
df_dt = pd.DataFrame({'ID': test_data['id'], 'loss': dt_predict_Test})
df_dt.to_csv('/Users/kamayanirai/Downloads/output1/dt_predictions_tuned.csv', index=False)
print("Decision Tree Predictions saved to 'dt_predictions_tuned.csv'.")

Decision Tree Predictions saved to 'dt_predictions_tuned.csv'.


In [13]:
# Stratified Sampling
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, val_idx in split.split(train_data, pd.qcut(train_data['loss'], q=5, duplicates='drop')):  
    feature_selection_data = train_data.iloc[train_idx]
    validation_data = train_data.iloc[val_idx]

# Step 2: Feature and Target Extraction
# For feature selection
X_train_fs = feature_selection_data.drop(['loss', 'id'], axis=1)
y_train_fs = feature_selection_data['loss']

# For validation
X_val = validation_data.drop(['loss', 'id'], axis=1)
y_val = validation_data['loss']

##Feature Selection for Decision Tre

In [14]:
 #Training a Decision Tree Regressor for Feature Selection
clf = DecisionTreeRegressor(max_depth=5, random_state=42)
clf.fit(X_train_fs, y_train_fs)

# Feature Importances
importances = clf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]  # Descending order
top_n = 10 
top_features = X_train_fs.columns[sorted_indices[:top_n]]

print(f"\nTop {top_n} features by importance:\n", top_features)

# Select Top Features for Training and Validation
X_train_fs = X_train_fs[top_features]
X_val_fs = X_val[top_features]

# Train and Evaluate Final Model
clf = DecisionTreeRegressor(random_state=42)
clf.fit(X_train_fs, y_train_fs)

# Predictions on training data
clf_predict_train = clf.predict(X_train_fs)
rmse_train = np.sqrt(mean_squared_error(y_train_fs, clf_predict_train))
print(f"RMSE (Training) for Decision Tree with top features: {rmse_train:.6f}")

# Predictions on validation data
clf_predict_val = clf.predict(X_val_fs)
rmse_val = np.sqrt(mean_squared_error(y_val, clf_predict_val))
print(f"RMSE (Validation) for Decision Tree with top features: {rmse_val:.6f}")



Top 10 features by importance:
 Index(['cat80_B', 'cat57_A', 'cont7', 'cat79_D', 'cat12_B', 'cont2', 'cat81_D',
       'cat1_A', 'cont12', 'cat53_A'],
      dtype='object')
RMSE (Training) for Decision Tree with top features: 1323.339804
RMSE (Validation) for Decision Tree with top features: 2754.754259


In [15]:
# Defining and train the Random Forest Regressor
rfc = RandomForestRegressor(
    n_estimators=300, 
    max_depth=10,  
    max_features='sqrt',
    n_jobs=-1,  
    random_state=42
)
rfc.fit(X_train, y_train)

# Predictions and RMSE on training data
rfc_predict_Train = rfc.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, rfc_predict_Train))
print(f"RMSE (Training) for Random Forest: {rmse_train:.6f}")

# Predictions and RMSE on validation data
rfc_predict_Val = rfc.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, rfc_predict_Val))
print(f"RMSE (Validation) for Random Forest: {rmse_val:.6f}")

# Predictions on test data
X_test_fs = X_test[X_train.columns] 
rfc_predict_Test = rfc.predict(X_test_fs)

# Save predictions for test data
df_rf = pd.DataFrame({'ID': test_data['id'], 'loss': rfc_predict_Test})
df_rf.to_csv('/Users/kamayanirai/Downloads/output1/rf_predictions_tuned.csv', index=False)
print("Random Forest Predictions saved to 'rf_predictions_tuned.csv'.")

RMSE (Training) for Random Forest: 2068.710445
RMSE (Validation) for Random Forest: 2102.489282
Random Forest Predictions saved to 'rf_predictions_tuned.csv'.


##Feature Selection for Random Forest

In [16]:
#Training a random forest Regressor for Feature Selection
rfc = RandomForestRegressor()
rfc.fit(X_train_fs, y_train_fs)

# Feature Importances
importances = rfc.feature_importances_
sorted_indices = np.argsort(importances)[::-1]  # Descending order
top_n = 10 
top_features = X_train_fs.columns[sorted_indices[:top_n]]

print(f"\nTop {top_n} features by importance:\n", top_features)

#Select Top Features for Training and Validation
X_train_fs = X_train_fs[top_features]
X_val_fs = X_val[top_features]

#Train and Evaluate Final Model
rfc = RandomForestRegressor(random_state=42)
rfc.fit(X_train_fs, y_train_fs)

# Predictions on training data
rfc_predict_train = rfc.predict(X_train_fs)
rmse_train = np.sqrt(mean_squared_error(y_train_fs, rfc_predict_train))
print(f"RMSE (Training) for Random Forest with top features: {rmse_train:.6f}")

# Predictions on validation data
rfc_predict_val = rfc.predict(X_val_fs)
rmse_val = np.sqrt(mean_squared_error(y_val, rfc_predict_val))
print(f"RMSE (Validation) for Random Forest with top features: {rmse_val:.6f}")


Top 10 features by importance:
 Index(['cont7', 'cat80_B', 'cont12', 'cont2', 'cat57_A', 'cat79_D', 'cat12_B',
       'cat81_D', 'cat1_A', 'cat53_A'],
      dtype='object')
RMSE (Training) for Random Forest with top features: 1438.122217
RMSE (Validation) for Random Forest with top features: 2289.821045


In [17]:
# Define and train the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=100, 
    max_depth=3, 
    learning_rate=0.1, 
    random_state=42
)
gbr.fit(X_train, y_train)

# Predictions and RMSE on training data
gbr_predict_Train = gbr.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, gbr_predict_Train))
print(f"RMSE (Training) for Gradient Boosting: {rmse_train:.6f}")

# Predictions and RMSE on validation data
gbr_predict_Val = gbr.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, gbr_predict_Val))
print(f"RMSE (Validation) for Gradient Boosting: {rmse_val:.6f}")

# Predictions on test data
X_test_fs = X_test[X_train.columns] 
gbr_predict_Test = gbr.predict(X_test_fs)

# Save predictions for test data
df_gbr = pd.DataFrame({'ID': test_data['id'], 'loss': gbr_predict_Test})
df_gbr.to_csv('/Users/kamayanirai/Downloads/output1/gbr_predictions_tuned.csv', index=False)
print("Gradient Boosting Predictions saved to 'gbr_predictions_tuned.csv'.")


RMSE (Training) for Gradient Boosting: 1923.054091
RMSE (Validation) for Gradient Boosting: 1937.852621
Gradient Boosting Predictions saved to 'gbr_predictions_tuned.csv'.


In [None]:
##Feature Selection for Gradient Boosting

In [18]:
#Training a Gradient Boosting Regressor for Feature Selection
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train_fs, y_train_fs)  # Fit the model first

# Feature Importances
importances = gbr.feature_importances_  # Access feature importances after fitting
sorted_indices = np.argsort(importances)[::-1]  # Descending order
top_n = 10  # Number of top features to select
top_features = X_train_fs.columns[sorted_indices[:top_n]]

print(f"\nTop {top_n} features by importance:\n", top_features)

# Select Top Features for Training and Validation
X_train_fs = X_train_fs[top_features]
X_val_fs = X_val[top_features]

# Train and Evaluate Final Model
final_gbr = GradientBoostingRegressor(random_state=42)
final_gbr.fit(X_train_fs, y_train_fs)

# Predictions on training data
gbr_predict_train = final_gbr.predict(X_train_fs)
rmse_train = np.sqrt(mean_squared_error(y_train_fs, gbr_predict_train))
print(f"RMSE (Training) for Gradient Boosting with top features: {rmse_train:.6f}")

# Predictions on validation data
gbr_predict_val = final_gbr.predict(X_val_fs)
rmse_val = np.sqrt(mean_squared_error(y_val, gbr_predict_val))
print(f"RMSE (Validation) for Gradient Boosting with top features: {rmse_val:.6f}")



Top 10 features by importance:
 Index(['cat80_B', 'cat79_D', 'cont7', 'cat57_A', 'cat12_B', 'cont2', 'cat81_D',
       'cont12', 'cat1_A', 'cat53_A'],
      dtype='object')
RMSE (Training) for Gradient Boosting with top features: 2038.500568
RMSE (Validation) for Gradient Boosting with top features: 2144.156510


In [28]:
from vecstack import stacking
models = [ GradientBoostingRegressor(), RandomForestRegressor(), DecisionTreeRegressor() ]
      
S_Train, S_Test = stacking(models,                   
                           X_train, y_train, X_test,   
                           regression=True, mode='oof_pred_bag', needs_proba=False, save_dir=None, shuffle=True, 
                           random_state=42, n_folds=3, verbose=2)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [GradientBoostingRegressor]
    fold  0:  [1259.90416894]
    fold  1:  [1256.31732515]
    fold  2:  [1256.14936832]
    ----
    MEAN:     [1257.45695414] + [1.73180014]
    FULL:     [1257.45695414]

model  1:     [RandomForestRegressor]
    fold  0:  [1255.20610070]
    fold  1:  [1243.36057840]
    fold  2:  [1248.08351498]
    ----
    MEAN:     [1248.88339803] + [4.86887799]
    FULL:     [1248.88339803]

model  2:     [DecisionTreeRegressor]
    fold  0:  [1744.58055179]
    fold  1:  [1737.58924330]
    fold  2:  [1740.87878211]
    ----
    MEAN:     [1741.01619240] + [2.85584311]
    FULL:     [1741.01619240]



In [39]:
#STACKING - CONTRUCT A GRADIENT BOOSTING MODEL
model = GradientBoostingRegressor()
    
model = model.fit(S_Train, y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)

In [None]:

# Predictions and RMSE on training data
y_pred_train = model.predict(S_Train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(f"RMSE (Training) for Gradient Boosting (stacked model): {rmse_train:.6f}")

X_test_fs = X_test.iloc[:, :S_Train.shape[1]]  # Select the same number of features as S_Train
gbr_predict_Test = model.predict(X_test_fs)

# Save predictions for test data
df_gbr = pd.DataFrame({'ID': test_data['id'], 'loss': gbr_predict_Test})
df_gbr.to_csv('/Users/kamayanirai/Downloads/output1/gbr_predictions_tuned_stacked.csv', index=False)
print("Gradient Boosting Predictions saved to 'gbr_predictions_tuned_stacked.csv'.")

RMSE (Training) for Gradient Boosting (stacked model): 1900.183562
Gradient Boosting Predictions saved to 'gbr_predictions_tuned_stacked.csv'.


