In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [39]:
# Function to apply log transformation to the COSMIC features
def transform_cosmic_features(df):
    """Return a new frame with the four COSMIC columns log1p-transformed.

    Each of 'COSMIC Read', 'COSMIC Write', 'COSMIC Entry' and 'COSMIC Exit'
    is clipped at 0 and then passed through ``np.log1p``. All other columns
    are carried over unchanged and the column order is preserved.

    The input frame is NOT modified. The previous implementation wrote into
    ``df`` via ``df.loc[:, col] = ...``, which mutated the caller's data,
    made repeated calls apply the log twice, and raised
    SettingWithCopyWarning when ``df`` was a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four COSMIC columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed COSMIC columns.

    Raises
    ------
    KeyError
        If any of the four COSMIC columns is missing from ``df``.
    """
    cosmic_columns = ('COSMIC Read', 'COSMIC Write', 'COSMIC Entry', 'COSMIC Exit')

    # Clip negatives to 0 first so log1p never sees a value below 0;
    # log1p (log(1 + x)) keeps zero counts finite, unlike a plain log.
    transformed = {
        column: np.log1p(df[column].clip(lower=0))
        for column in cosmic_columns
    }

    # assign() builds a new DataFrame, leaving the caller's frame untouched.
    return df.assign(**transformed)

# Load data (the real column headers sit on the 4th row of the sheet, hence header=3)
ISBG = pd.read_excel("./data/ISBSG-whole.xlsx", header=3)

# Column groups, defined once and reused below
cosmic_cols = ["COSMIC Read", "COSMIC Write", "COSMIC Entry", "COSMIC Exit"]
tech_cols = ['Max Team Size', 'Functional Size', 'Project Elapsed Time', 'Development Platform', 'Primary Programming Language']

# Select the required columns
cols_needed = ['Max Team Size', 'COSMIC Read', 'COSMIC Write', 'COSMIC Entry', 'COSMIC Exit', 
               'Functional Size', 'Project Elapsed Time', 'Development Platform', 'Primary Programming Language', 
               'Summary Work Effort']
ISBG_interest = ISBG[cols_needed]

# Drop rows where essential COSMIC features have missing values.
# Rows with a missing target are dropped as well: none of the regressors
# fitted later can train on a NaN target.
df_clean = ISBG_interest.dropna(subset=cosmic_cols + ["Summary Work Effort"])

# Separate features and target
X = df_clean[cosmic_cols + tech_cols]
y = df_clean["Summary Work Effort"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Log-transform the COSMIC features AFTER the split. The transform is
# element-wise, so applying it to each partition separately leaks nothing.
# Explicit .copy() hands the helper its own frame so X_train / X_test are
# never written through and no SettingWithCopyWarning is raised.
X_train_cosmic = transform_cosmic_features(X_train[cosmic_cols].copy())
X_test_cosmic = transform_cosmic_features(X_test[cosmic_cols].copy())

# Apply StandardScaler to the transformed COSMIC features.
# The scaler is fitted on the training partition only and then reused on the
# test partition. Note that both names are rebound from DataFrame to ndarray.
scaler = StandardScaler()
X_train_cosmic = scaler.fit_transform(X_train_cosmic)
X_test_cosmic = scaler.transform(X_test_cosmic)

# No imputation yet for the technical features; do this later
X_train_tech = X_train[tech_cols]
X_test_tech = X_test[tech_cols]

In [44]:
# Search space for the MLP regressor
param_grid_ann = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],  # one or two hidden layers
    activation=['tanh', 'relu'],
    solver=['adam'],
    alpha=[0.0001, 0.001, 0.01],  # L2 penalty strength
    learning_rate=['constant', 'adaptive'],
    max_iter=[1000, 2000],  # generous iteration budgets
)

# Search space for the XGBoost regressor
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],  # boosting rounds
    max_depth=[3, 6, 9],  # depth limit per tree
    learning_rate=[0.01, 0.1, 0.2],  # shrinkage per round
    subsample=[0.8, 0.9, 1.0],  # row fraction sampled per tree
    colsample_bytree=[0.8, 0.9, 1.0],  # column fraction sampled per tree
    gamma=[0, 0.1, 0.2],  # minimum loss reduction needed to split
)

# Search space for the decision tree regressor
param_grid_tree = dict(
    max_depth=[3, 5, 10],  # depth limit
    min_samples_split=[2, 5, 10],  # samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],  # samples needed at a leaf
    max_features=[None, 'sqrt', 'log2'],  # features considered per split
)

In [45]:
# Base estimators, all created with the same fixed seed
mlp = MLPRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
tree = DecisionTreeRegressor(random_state=42)

# Settings shared by every search: 5-fold CV, negative MSE scoring, all cores
search_settings = dict(cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# One GridSearchCV per model, each paired with its own parameter grid
grid_search_ann = GridSearchCV(estimator=mlp, param_grid=param_grid_ann, **search_settings)
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, **search_settings)
grid_search_tree = GridSearchCV(estimator=tree, param_grid=param_grid_tree, **search_settings)

# Run the searches on the scaled COSMIC training features (ANN, XGBoost, tree — in that order)
for search in (grid_search_ann, grid_search_xgb, grid_search_tree):
    search.fit(X_train_cosmic, y_train)

# Keep a handle on the best estimator from each search
best_mlp, best_xgb, best_tree = (
    grid_search_ann.best_estimator_,
    grid_search_xgb.best_estimator_,
    grid_search_tree.best_estimator_,
)

In [48]:
# Collect the winning hyper-parameter combination from each grid search
best_params_ann, best_params_xgb, best_params_tree = (
    grid_search_ann.best_params_,
    grid_search_xgb.best_params_,
    grid_search_tree.best_params_,
)

# Report them, one model per line
for label, params in (
    ("Best parameters for ANN:", best_params_ann),
    ("Best parameters for XGBoost:", best_params_xgb),
    ("Best parameters for Decision Tree:", best_params_tree),
):
    print(label, params)

Best parameters for ANN: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'max_iter': 2000, 'solver': 'adam'}
Best parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best parameters for Decision Tree: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
# Retrieve the best estimator selected by each grid search
best_mlp, best_xgb_model, best_tree_model = (
    grid_search_ann.best_estimator_,
    grid_search_xgb.best_estimator_,
    grid_search_tree.best_estimator_,
)

# Show each selected model's repr
for heading, fitted_model in (
    ("Best MLP model:", best_mlp),
    ("Best XGBoost model:", best_xgb_model),
    ("Best Decision Tree model:", best_tree_model),
):
    print(heading, fitted_model)

Best MLP model: MLPRegressor(alpha=0.001, hidden_layer_sizes=(50, 50), max_iter=2000,
             random_state=42)
Best XGBoost model: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=200,
             n_jobs=None, num_parallel_tree=None, ...)
Best Decision Tree model: DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                   