In [62]:
#### importing necessary libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

#### DATASET WITH DELETED NULL ROWS

# Load the effort-estimation data set from disk
df = pd.read_csv('Effort estimation data set.csv')

# Detach the target: pop() removes 'Effort (Actual)' from df in place and
# returns it as a Series, so the NaN injection that follows can only touch
# the remaining (feature) columns
effort_actual_column = df.pop('Effort (Actual)')

# Calculate the number of cells to delete (10% of total cells)
total_cells = df.size
cells_to_delete = int(0.1 * total_cells)

# Draw distinct flat (row-major) cell positions. Sampling WITHOUT replacement
# guarantees that exactly `cells_to_delete` different cells are blanked;
# drawing (row, col) pairs independently can hit the same cell more than once
# and therefore removes fewer cells than requested.
# NOTE: this uses NumPy's global random state; call np.random.seed(...)
# beforehand if the experiment has to be reproducible.
indices_to_delete = np.random.choice(total_cells, size=cells_to_delete, replace=False)

# Translate the flat positions into a boolean mask shaped like df
# (True marks a cell whose value is removed)
rows_to_blank, cols_to_blank = np.unravel_index(indices_to_delete, df.shape)
cells_mask = np.zeros(df.shape, dtype=bool)
cells_mask[rows_to_blank, cols_to_blank] = True

# Set the selected cells as NaN. mask() returns a new DataFrame and takes care
# of dtype upcasting, instead of writing NaN cell by cell into columns that
# may hold integers.
df = df.mask(cells_mask)

# Re-attach the target column that was set aside before the NaN injection
df['Effort (Actual)'] = effort_actual_column

# Notebook display of the frame with the injected NaN values
df

#### Dropping rows with NaN values

# Keep only the rows in which every column is populated
df = df.dropna(axis=0, how='any')
print(df.shape)
df

from sklearn.impute import SimpleImputer

# NOTE(review): every row containing a NaN was dropped just above, so there is
# nothing left for the interpolation to fill - this call effectively only
# produces a copy of df under the name df_interpolated. Confirm whether the
# "interpolation" naming/labels are intended for this experiment.
df_interpolated = df.interpolate(axis=0, method='linear')

# Show the frame that is used for modelling below
print("DataFrame after linear interpolation:")
print(df_interpolated)

#### Splitting the data into training and testing sets

# Target is the actual effort; every remaining column is used as a predictor
y = df_interpolated['Effort (Actual)']
X = df_interpolated.drop('Effort (Actual)', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Linear Regression

# Ordinary least squares on the raw (unscaled) features
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

#### Decision Tree Regressor

# random_state is pinned (same seed as the train/test split) so that repeated
# runs on the same split build the same tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

#### MLP Regressor

# random_state fixes the weight initialisation and sample shuffling, which
# otherwise make the reported metrics change from run to run
mlp = MLPRegressor(random_state=42)
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)

#### SVR

# Default SVR trained on the raw (unscaled) features; the kernel-specific
# variants further down are trained on standardised features instead
svr = SVR()
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test)

#### SMO with sigmoid kernel (SVR with sigmoid kernel)

# Standardise the features; the scaling statistics are learned from the
# training rows only and then re-used for the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained
svr_sigmoid = SVR(kernel='sigmoid').fit(X_train_scaled, y_train)
svr_sigmoid_pred = svr_sigmoid.predict(X_test_scaled)

#### SMO with polynomial kernel (SVR with polynomial kernel)

svr_poly = SVR(kernel='poly').fit(X_train_scaled, y_train)
svr_poly_pred = svr_poly.predict(X_test_scaled)

#### SMO with RBF kernel (SVR with RBF kernel)

svr_rbf = SVR(kernel='rbf').fit(X_train_scaled, y_train)
svr_rbf_pred = svr_rbf.predict(X_test_scaled)

#### Evaluate performance metrics for the models

# Normalized mean squared error: MSE expressed relative to the target's spread
def normalized_mean_squared_error(y_true, y_pred):
    """Return the mean squared error divided by the variance of ``y_true``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        ``mean_squared_error(y_true, y_pred) / np.var(y_true)``. No guard is
        applied for a zero variance (constant ``y_true``).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    # np.var is called with its default ddof=0 (population variance); keep the
    # NumPy call rather than Series.var(), whose default is ddof=1
    target_variance = np.var(y_true)
    return squared_error / target_variance

# Bundle all reported regression metrics for one set of predictions
def evaluate_performance(y_true, y_pred):
    """Compute the regression metrics reported for each model.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        A 5-tuple ``(mae, mse, mape, nmse, r2)``: mean absolute error, mean
        squared error, mean absolute percentage error, normalized mean squared
        error and R^2 score, in that order.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        normalized_mean_squared_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Fitted estimators, their test-set predictions and their display names,
# listed in the same order
models = [lr, dt, mlp, svr, svr_sigmoid, svr_poly, svr_rbf]
predictions = [lr_pred, dt_pred, mlp_pred, svr_pred, svr_sigmoid_pred, svr_poly_pred, svr_rbf_pred]
model_names = ['Linear Regression', 'Decision Tree Regressor', 'MLP Regressor', 'SVR', 'SMO with Sigmoid Kernel', 'SMO with polynomial Kernel', 'SMO with RBF Kernel']

# One row per model: the name followed by (mae, mse, mape, nmse, r2).
# Only the predictions are needed for scoring, so the estimators themselves
# are not part of the loop.
results = []
for name, pred in zip(model_names, predictions):
    results.append([name, *evaluate_performance(y_test, pred)])

# Tabulate the rows with human-readable column headers
df_results_interpolated = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Mean Absolute Error",
        "Mean Squared Error",
        "Mean Absolute Percentage Error",
        "Normalized Mean Squared Error",
        "R^2 Score",
    ],
)

# Print the caption, then let the notebook render the table
print("Performance metrics after linear interpolation:")
df_results_interpolated

(46, 5)
DataFrame after linear interpolation:
       NOA    NEM     NSR      CP2  Effort (Actual)
0    170.0  142.0    97.0   110.55           286.00
1    292.0  409.0   295.0   242.54           396.00
3    755.0  975.0   723.0   760.96          1016.00
5    400.0  225.0   181.0   180.84           261.00
6    402.0  589.0   944.0   645.60           993.00
7    260.0  262.0   167.0   208.56           552.00
8    385.0  697.0   929.0   905.00           998.00
11   682.0  789.0   362.0   766.29          1083.00
14   770.0  701.0   635.0   743.49           840.00
16    65.0   97.0   387.0    74.26           279.00
17   293.0  382.0   654.0   481.66           621.00
21   637.0  944.0   421.0   627.60           947.00
23   520.0  531.0   401.0   590.42           812.00
24   812.0  387.0   297.0   428.18           685.00
25   788.0  373.0   278.0   280.84           638.00
26  1633.0  724.0  1167.0  1719.25          1803.00
27   177.0  192.0   126.0   104.50           369.00
30   444.0  363.0 

Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,Mean Absolute Percentage Error,Normalized Mean Squared Error,R^2 Score
0,Linear Regression,77.181889,7978.330541,0.813411,0.055292,0.944708
1,Decision Tree Regressor,43.989,6093.53819,0.178196,0.04223,0.95777
2,MLP Regressor,48.937193,6646.435149,0.181819,0.046061,0.953939
3,SVR,340.166357,138794.938543,4.693324,0.961879,0.038121
4,SMO with Sigmoid Kernel,334.952873,133629.756728,4.658261,0.926083,0.073917
5,SMO with polynomial Kernel,296.430157,113470.898142,3.851487,0.786378,0.213622
6,SMO with RBF Kernel,340.095823,138811.044965,4.690375,0.961991,0.038009


Decision Tree Regressor:
It has the lowest Mean Absolute Error (MAE) and Mean Squared Error (MSE) among all models, indicating better accuracy in predicting the target variable.
It also has the highest R^2 Score, indicating the highest goodness of fit among all models.

MLP Regressor:
While having slightly higher errors compared to Decision Tree Regressor, MLP Regressor still performs well with relatively low errors and high R^2 Score.

Linear Regression:
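Linear Regression shows good performance with a high R^2 Score (0.945), although its Mean Absolute Error (77.18) and Mean Absolute Percentage Error (0.81) are noticeably higher than those of the Decision Tree and MLP regressors; it is still effective in predicting the target variable.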

SMO with Polynomial Kernel:
SMO with Polynomial Kernel performs better than the Support Vector Regression (SVR) and SMO with Sigmoid Kernel in terms of errors and R^2 Score, placing it in the fourth position.

SMO with RBF Kernel:
Its performance metrics are almost identical to those of SVR: the Mean Absolute Error (MAE) is marginally lower (340.10 vs. 340.17), while the Mean Squared Error (MSE) is marginally higher and the R^2 Score marginally lower. It is outperformed by SMO with Sigmoid Kernel on MAE, MSE and R^2 Score, so it is placed below the sigmoid kernel.

SMO with Sigmoid Kernel:
It performs better than both SVR and SMO with RBF Kernel (MAE 334.95, R^2 Score 0.074) but still exhibits relatively high errors and a low R^2 Score.

SVR (Support Vector Regression):
SVR is effectively tied with SMO with RBF Kernel for the poorest performance: it has the highest MAE of all models, with significantly higher errors and a much lower R^2 Score than the three best models.

So, the order of models by performance from best to worst is:

Decision Tree Regressor

MLP Regressor

Linear Regression

SMO with Polynomial Kernel

SMO with Sigmoid Kernel

SMO with RBF Kernel

SVR (Support Vector Regression)