#### importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

#### DATASET WITH DELETED NULL ROWS

In [2]:
# Read the dataset and drop the 'Effort (Actual)' column
df = pd.read_csv('Effort estimation data set.csv')

# Store the 'Effort (Actual)' column separately
effort_actual_column = df['Effort (Actual)']

# Drop the 'Effort (Actual)' column
df.drop(['Effort (Actual)'], axis=1, inplace=True)

# Calculate the number of cells to delete (10% of total cells)
total_cells = df.size
cells_to_delete = int(0.1 * total_cells)

# Generate random row and column indices to select cells for deletion
indices_to_delete = np.random.choice(df.index, size=cells_to_delete, replace=True)

# Set the selected cells as NaN
for idx in indices_to_delete:
    row_idx, col_idx = np.random.randint(0, df.shape[0]), np.random.randint(0, df.shape[1])
    df.iat[row_idx, col_idx] = np.nan

# Append the 'Effort (Actual)' column to the modified DataFrame
df['Effort (Actual)'] = effort_actual_column

# Display the resulting DataFrame
df

Unnamed: 0,NOA,NEM,NSR,CP2,Effort (Actual)
0,170.0,142.0,97.0,110.55,286.00
1,292.0,409.0,295.0,242.54,396.00
2,929.0,821.0,,446.60,471.00
3,755.0,975.0,723.0,760.96,1016.00
4,,997.0,,1242.60,1261.00
...,...,...,...,...,...
67,94.0,52.0,28.0,49.86,100.85
68,,37.0,,29.26,47.15
69,34.0,23.0,17.0,20.68,44.83
70,110.0,67.0,36.0,,128.27


In [3]:
from sklearn.impute import SimpleImputer

# Define a function to calculate normalized mean squared error
def normalized_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) / np.var(y_true)

# Define a function to calculate performance metrics
def evaluate_performance(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    nmse = normalized_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, mape, nmse, r2

# Copy the original DataFrame to preserve the original data
df_imputed = df.copy()

# Initialize empty DataFrames for each imputation strategy
df_mean = pd.DataFrame()
df_median = pd.DataFrame()
df_mode = pd.DataFrame()

# Impute missing values with mean, median, and mode
impute_strategies = ['mean', 'median', 'most_frequent']
for strategy in impute_strategies:
    imputer = SimpleImputer(strategy=strategy)
    df_imputed_filled = pd.DataFrame(imputer.fit_transform(df_imputed), columns=df_imputed.columns)
    
    # Split the data into training and testing sets
    X_imputed = df_imputed_filled.drop(columns=['Effort (Actual)'])
    y_imputed = df_imputed_filled['Effort (Actual)']
    X_train_imputed, X_test_imputed, y_train_imputed, y_test_imputed = train_test_split(X_imputed, y_imputed, test_size=0.2, random_state=42)
    
    # Train models on imputed data
    lr_imputed = LinearRegression()
    lr_imputed.fit(X_train_imputed, y_train_imputed)
    lr_pred_imputed = lr_imputed.predict(X_test_imputed)

    dt_imputed = DecisionTreeRegressor()
    dt_imputed.fit(X_train_imputed, y_train_imputed)
    dt_pred_imputed = dt_imputed.predict(X_test_imputed)

    mlp_imputed = MLPRegressor()
    mlp_imputed.fit(X_train_imputed, y_train_imputed)
    mlp_pred_imputed = mlp_imputed.predict(X_test_imputed)

    svr_imputed = SVR()
    svr_imputed.fit(X_train_imputed, y_train_imputed)
    svr_pred_imputed = svr_imputed.predict(X_test_imputed)

    svr_sigmoid_imputed = SVR(kernel='sigmoid')
    svr_sigmoid_imputed.fit(X_train_imputed, y_train_imputed)
    svr_sigmoid_pred_imputed = svr_sigmoid_imputed.predict(X_test_imputed)

    svr_poly_imputed = SVR(kernel='poly')
    svr_poly_imputed.fit(X_train_imputed, y_train_imputed)
    svr_poly_pred_imputed = svr_poly_imputed.predict(X_test_imputed)

    svr_rbf_imputed = SVR(kernel='rbf')
    svr_rbf_imputed.fit(X_train_imputed, y_train_imputed)
    svr_rbf_pred_imputed = svr_rbf_imputed.predict(X_test_imputed)
    
    # Evaluate performance metrics for imputed data
    models_imputed = [lr_imputed, dt_imputed, mlp_imputed, svr_imputed, svr_sigmoid_imputed, svr_poly_imputed, svr_rbf_imputed]
    predictions_imputed = [lr_pred_imputed, dt_pred_imputed, mlp_pred_imputed, svr_pred_imputed, svr_sigmoid_pred_imputed, svr_poly_pred_imputed, svr_rbf_pred_imputed]
    model_names = ['Linear Regression', 'Decision Tree Regressor', 'MLP Regressor', 'SVR', 'SMO with Sigmoid Kernel', 'SMO with polynomial Kernel', 'SMO with RBF Kernel']
    
    results_imputed = []
    for model_imputed, pred_imputed, name in zip(models_imputed, predictions_imputed, model_names):
        mae_imputed, mse_imputed, mape_imputed, nmse_imputed, r2_imputed = evaluate_performance(y_test_imputed, pred_imputed)
        results_imputed.append([name, mae_imputed, mse_imputed, mape_imputed, nmse_imputed, r2_imputed])
    
    # Create DataFrame from imputed results
    df_results_imputed = pd.DataFrame(results_imputed, columns=["Model", "Mean Absolute Error", "Mean Squared Error", "Mean Absolute Percentage Error", "Normalized Mean Squared Error", "R^2 Score"])
    
    # Store the results in the corresponding DataFrames
    if strategy == 'mean':
        df_mean = df_results_imputed
    elif strategy == 'median':
        df_median = df_results_imputed
    elif strategy == 'most_frequent':
        df_mode = df_results_imputed

#### Print results for each imputation strategy

In [4]:
print("Mean Imputation:")
df_mean

Mean Imputation:


Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,Mean Absolute Percentage Error,Normalized Mean Squared Error,R^2 Score
0,Linear Regression,73.410476,8386.925532,0.859164,0.085716,0.914284
1,Decision Tree Regressor,117.043333,47986.63438,0.35483,0.490432,0.509568
2,MLP Regressor,81.138888,10260.330711,0.900344,0.104862,0.895138
3,SVR,236.606571,95782.269642,3.775699,0.978911,0.021089
4,SMO with Sigmoid Kernel,239.068228,97215.055397,3.774764,0.993555,0.006445
5,SMO with polynomial Kernel,170.553974,43643.573004,1.377371,0.446045,0.553955
6,SMO with RBF Kernel,236.606571,95782.269642,3.775699,0.978911,0.021089


In [5]:
print("Median Imputation:")
df_median

Median Imputation:


Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,Mean Absolute Percentage Error,Normalized Mean Squared Error,R^2 Score
0,Linear Regression,91.212161,12356.578293,0.816397,0.126286,0.873714
1,Decision Tree Regressor,139.954667,64276.26452,0.416957,0.656915,0.343085
2,MLP Regressor,70.690877,8714.950962,0.611477,0.089068,0.910932
3,SVR,236.438787,95624.910222,3.752556,0.977303,0.022697
4,SMO with Sigmoid Kernel,239.224126,96916.2646,3.78015,0.990501,0.009499
5,SMO with polynomial Kernel,182.121442,55834.66085,1.389526,0.57064,0.42936
6,SMO with RBF Kernel,236.438787,95624.910222,3.752556,0.977303,0.022697


In [6]:
print("Mode Imputation:")
df_mode

Mode Imputation:


Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,Mean Absolute Percentage Error,Normalized Mean Squared Error,R^2 Score
0,Linear Regression,103.603905,18825.040996,0.677538,0.192395,0.807605
1,Decision Tree Regressor,151.488,104029.643973,0.364987,1.063201,-0.063201
2,MLP Regressor,97.17788,17450.40258,0.294951,0.178346,0.821654
3,SVR,236.44922,95588.241715,3.741271,0.976928,0.023072
4,SMO with Sigmoid Kernel,239.033778,96587.294111,3.765789,0.987139,0.012861
5,SMO with polynomial Kernel,187.790762,64647.825043,1.520016,0.660712,0.339288
6,SMO with RBF Kernel,236.44922,95588.241715,3.741271,0.976928,0.023072


Linear Regression and MLP Regressor show relatively low errors (MAE, MSE) and high R^2 scores, indicating better performance in predicting the target variable compared to other models.

Decision Tree Regressor performs moderately well but shows higher errors and lower R^2 score compared to Linear Regression and MLP Regressor.

Support Vector Regression (SVR) and SMO with Sigmoid Kernel exhibit significantly higher errors and very low R^2 scores, indicating poor performance in predicting the target variable.

SMO with Polynomial Kernel shows moderate performance with lower errors compared to SVR and SMO with Sigmoid Kernel, but still not as good as Linear Regression and MLP Regressor.