In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import time

df = pd.read_excel("satisverileri.xlsx")
df.dropna(inplace=True)

X = df.drop(["Sales"], axis=1)
y = df["Sales"]

start_time = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2Score = r2_score(y_test, y_pred)
meanAbsScore = mean_absolute_error(y_test, y_pred)
meanSqScore = mean_squared_error(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)

end_time = time.time()

print("R2:" + str(r2Score))
print("Mean Absolute Error:" + str(meanAbsScore))
print("Mean Squared Error:" + str(meanSqScore))
print("MAPE:" + str(MAPE))
elapsed_time = end_time - start_time
print("Elapsed Time: ", elapsed_time, " seconds")

R2:0.702199240198245
Mean Absolute Error:495.0571784039604
Mean Squared Error:477792.757941697
MAPE:0.3286184339881124
Elapsed Time:  0.032193899154663086  seconds


In [51]:
# K-fold

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import time

df = pd.read_excel("satisverileri.xlsx")
df.dropna(inplace=True)

X = df.drop(["Sales"], axis = 1)
y = df["Sales"]

start_time = time.time()

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True)

r2_scores = []
mae_scores = []
mse_scores = []
mape_scores = []
elapsed_times = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model = GradientBoostingRegressor().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2Score = r2_score(y_test, y_pred)
    meanAbsScore = mean_absolute_error(y_test, y_pred)
    meanSqScore = mean_squared_error(y_test, y_pred)
    MAPE = mean_absolute_percentage_error(y_test, y_pred)
    
    r2_scores.append(r2Score)
    mae_scores.append(meanAbsScore)
    mse_scores.append(meanSqScore)
    mape_scores.append(MAPE)
    

mean_r2 = np.mean(r2_scores)
mean_mae = np.mean(mae_scores)
mean_mse = np.mean(mse_scores)
mean_mape = np.mean(mape_scores)

end_time = time.time()

print("R2 Scores:", mean_r2)
print("Mean Absolute Error Scores:", mean_mae)
print("Mean Squared Error Scores:", mean_mse)
print("MAPE Scores:", mean_mape)

elapsed_time = end_time - start_time
print("Elapsed Time:", elapsed_time, "seconds")

R2 Scores: 0.6737021891353526
Mean Absolute Error Scores: 484.4362132666573
Mean Squared Error Scores: 470364.2992729878
MAPE Scores: 0.3686926727326375
Elapsed Time: 0.3452916145324707 seconds


In [1]:
# K-fold and parameter optimizations

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import time

df = pd.read_excel("satisverileri.xlsx")
df.dropna(inplace=True)

X = df.drop(["Sales"], axis=1)
y = df["Sales"]

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True)

max_leaf_nodes_values = [10, 20, 30, 40, 60]
max_depth_values = [2, 3, 4, 5, 10, 20]
min_samples_split_values = [2, 5, 10, 20, 30]
min_samples_leaf_values = [1, 5, 10, 20, 25]

max_mean_r2 = -np.inf
best_MaxLeafNodes = -1
best_Depth = -1
best_MinsamplesSplit = -1
best_MinSamplesLeaf = -1

start_time = time.time()

for leafNodes in max_leaf_nodes_values:
    for depth in max_depth_values:
        for split in min_samples_split_values:
            for leaf in min_samples_leaf_values:
                
                r2_scores = []
                mae_scores = []
                mse_scores = []
                mape_scores = []

                for train_idx, test_idx in kf.split(X):
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                    model = GradientBoostingRegressor(max_leaf_nodes=leafNodes, max_depth=depth, min_samples_split=split, min_samples_leaf=leaf)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    r2Score = r2_score(y_test, y_pred)
                    meanAbsScore = mean_absolute_error(y_test, y_pred)
                    meanSqScore = mean_squared_error(y_test, y_pred)
                    MAPE = mean_absolute_percentage_error(y_test, y_pred)

                    r2_scores.append(r2Score)
                    mae_scores.append(meanAbsScore)
                    mse_scores.append(meanSqScore)
                    mape_scores.append(MAPE)

                mean_r2 = np.mean(r2_scores)

                if mean_r2 > max_mean_r2:
                    max_mean_r2 = mean_r2
                    best_MaxLeafNodes = leafNodes
                    best_Depth = depth
                    best_MinsamplesSplit = split
                    best_MinSamplesLeaf = leaf
                
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time:", elapsed_time, "seconds")
print("Best Max Leaf Nodes:", best_MaxLeafNodes)
print("Best Depth:", best_Depth)
print("Best Min Samples Split:", best_MinsamplesSplit)
print("Best Min Samples Leaf:", best_MinSamplesLeaf)

print("Max Mean R2 Score:", max_mean_r2)

Elapsed Time: 299.599956035614 seconds
Best Max Leaf Nodes: 30
Best Depth: 2
Best Min Samples Split: 20
Best Min Samples Leaf: 20
Max Mean R2 Score: 0.7208099882076363


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import time

df = pd.read_excel("satisverileri.xlsx")
df.dropna(inplace=True)

X = df.drop(["Sales"], axis=1)
y = df["Sales"]

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True)

max_leaf_nodes_values = [10, 20, 30, 40, 60]
max_depth_values = [2, 3, 4, 5, 10, 20]
min_samples_split_values = [2, 5, 10, 20, 30]
min_samples_leaf_values = [1, 5, 10, 20, 25]

max_mean_r2 = -np.inf
best_MaxLeafNodes = -1
best_Depth = -1
best_MinsamplesSplit = -1
best_MinSamplesLeaf = -1

best_mae = np.inf
best_mse = np.inf
best_rmse = np.inf

start_time = time.time()

for leafNodes in max_leaf_nodes_values:
    for depth in max_depth_values:
        for split in min_samples_split_values:
            for leaf in min_samples_leaf_values:
                r2_scores = []
                mae_scores = []
                mse_scores = []
                rmse_scores = []

                for train_idx, test_idx in kf.split(X):
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                    model = GradientBoostingRegressor(max_leaf_nodes=leafNodes, max_depth=depth, min_samples_split=split, min_samples_leaf=leaf)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    r2Score = r2_score(y_test, y_pred)
                    meanAbsScore = mean_absolute_error(y_test, y_pred)
                    meanSqScore = mean_squared_error(y_test, y_pred)
                    rmseScore = np.sqrt(meanSqScore)

                    r2_scores.append(r2Score)
                    mae_scores.append(meanAbsScore)
                    mse_scores.append(meanSqScore)
                    rmse_scores.append(rmseScore)

                mean_r2 = np.mean(r2_scores)
                mean_mae = np.mean(mae_scores)
                mean_mse = np.mean(mse_scores)
                mean_rmse = np.mean(rmse_scores)

                if mean_r2 > max_mean_r2:
                    max_mean_r2 = mean_r2
                    best_MaxLeafNodes = leafNodes
                    best_Depth = depth
                    best_MinsamplesSplit = split
                    best_MinSamplesLeaf = leaf
                
                if mean_mae < best_mae:
                    best_mae = mean_mae

                if mean_mse < best_mse:
                    best_mse = mean_mse

                if mean_rmse < best_rmse:
                    best_rmse = mean_rmse
                
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time:", elapsed_time, "seconds")
print("Best Max Leaf Nodes:", best_MaxLeafNodes)
print("Best Depth:", best_Depth)
print("Best Min Samples Split:", best_MinsamplesSplit)
print("Best Min Samples Leaf:", best_MinSamplesLeaf)

print("Max Mean R2 Score:", max_mean_r2)
print("Best MAE:", best_mae)
print("Best MSE:", best_mse)
print("Best RMSE:", best_rmse)

Elapsed Time: 344.0132255554199 seconds
Best Max Leaf Nodes: 60
Best Depth: 2
Best Min Samples Split: 2
Best Min Samples Leaf: 20
Max Mean R2 Score: 0.7173273417767154
Best MAE: 459.1826414720278
Best MSE: 416302.3570900453
Best RMSE: 636.139947701092
