# data_before_CV_addFeature.ipynb

In [1]:
from functools import reduce

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer



# Data Loading

In [2]:
# Two feature tables stored under ./train/.
mean_std_skew_df = pd.read_csv('./train/new_meanStdSkew.csv')
median_kurtosis_entropy_area_df = pd.read_csv('./train/medianKurtosisEntropyArea.csv')

# Additional result tables in the working directory. The list order is
# significant: it lines up one-to-one with feature1_df ... feature11_df below.
result_csv_paths = [
    './24_1_result_#4.csv',
    './24_2_result_#6.csv',
    './24_3_result_#4.csv',
    './24_4_result_#8.csv',
    './25_1_result_#4.csv',
    './26_1_result_#7.csv',
    './29_1_result#2+4.csv',
    './1_Apr_result_#4.csv',
    './2_Apr_result_#5.csv',
    './3_Apr_result_#9.csv',
    './5_Apr_result_#9.csv',
]
(
    feature1_df,
    feature2_df,
    feature3_df,
    feature4_df,
    feature5_df,
    feature6_df,
    feature7_df,
    feature8_df,
    feature9_df,
    feature10_df,
    feature11_df,
) = map(pd.read_csv, result_csv_paths)

# Data Merge & Split

In [3]:
# Build the modelling table: join every feature table on 'id', attach the
# target from train_data.csv, then carve out a hold-out split.
#
# The merge is done exactly once over the full list of tables. (A previous
# partial merge of only the first nine tables was computed and then discarded,
# including a second read of train_data.csv; it has been removed.)

# Every per-id feature table, in the order they are joined.
dfs = [
    mean_std_skew_df,
    median_kurtosis_entropy_area_df,
    feature1_df,
    feature2_df,
    feature3_df,
    feature4_df,
    feature5_df,
    feature6_df,
    feature7_df,
    feature8_df,
    feature9_df,
    feature10_df,
    feature11_df,
]

# Fold the list left-to-right with pd.merge on 'id'. pd.merge defaults to an
# inner join, so only ids present in every table survive.
combined_features_df = reduce(lambda left, right: pd.merge(left, right, on='id'), dfs)

# Attach the target column and separate features from target.
train_df = pd.read_csv('./train/train_data.csv')
merged_df = pd.merge(combined_features_df, train_df, on='id')
X = merged_df.drop(columns=['id', 'OSmonth'])
y = merged_df['OSmonth']

# Split the data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((157, 10414), (68, 10414), (157,), (68,))

# Model Training

In [4]:
# Compare several regressors with 5-fold cross-validation on MAE, MSE and R2.

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    # SVR and MLP are wrapped in a pipeline so scaling is fitted per training fold.
    "SVM": make_pipeline(StandardScaler(), SVR(max_iter=300)),
    "MLP": make_pipeline(StandardScaler(), MLPRegressor(max_iter=100,random_state=42)),
    "KNN": KNeighborsRegressor()
}

# Shuffled folds with a fixed seed, shared by every model so scores are comparable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Metric label -> scikit-learn scorer name. The two error metrics are exposed
# by scikit-learn as negated scores, hence the sign flip when aggregating.
scoring = {
    "MAE": 'neg_mean_absolute_error',
    "MSE": 'neg_mean_squared_error',
    "R2": 'r2',
}

# Perform cross-validation and gather results.
# NOTE(review): CV runs on the full X / y; the X_train / X_test split created
# in the previous cell is not used here — confirm this is intended.
cv_results = {}
for name, model in models.items():
    # One cross_validate call scores all three metrics from a single fit per
    # fold, instead of refitting the model separately for each metric.
    fold_scores = cross_validate(model, X, y, cv=kf, scoring=scoring)
    cv_results[name] = {
        "MAE": -fold_scores["test_MAE"].mean(),
        "MSE": -fold_scores["test_MSE"].mean(),
        "R2": fold_scores["test_R2"].mean(),
    }

# Printing cross-validation results
print("Model performance using Cross-Validation:")
for model_name, scores in cv_results.items():
    print(f"{model_name} - MAE: {scores['MAE']}, MSE: {scores['MSE']}, R2: {scores['R2']}")



Model performance using Cross-Validation:
Linear Regression - MAE: 379.80248312113014, MSE: 2990500.0453650295, R2: -1074.1584331502727
Decision Tree - MAE: 51.36, MSE: 4385.786666666667, R2: -0.8715320978971747
Random Forest - MAE: 37.09768888888889, MSE: 2204.369411555556, R2: 0.05212561592204936
Gradient Boosting - MAE: 37.36212537315545, MSE: 2252.5402828526007, R2: 0.03176539471460453
SVM - MAE: 37.87179730700804, MSE: 2376.387460705425, R2: -0.01173371446054845
MLP - MAE: 4616.594382103597, MSE: 851274380.8464074, R2: -437461.13782417
KNN - MAE: 42.254222222222225, MSE: 2902.632533333333, R2: -0.25758137945403436
