In [22]:
import pandas as pd

from math import sqrt

from mlxtend.regressor import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, BayesianRidge
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [25]:
final_dataset_path = "/home/magnus9102/Mostafa/Py/Github/data-science/mostafa_vahdani_bachelor_project/data/interim/1-2-final_flight_tickets_dataset.csv"

In [26]:
df = pd.read_csv(final_dataset_path)

In [27]:
# scaler = StandardScaler()
scaler = MinMaxScaler()

In [28]:
# X_t = scaler.fit_transform(X)

df[['duration_sec']] = scaler.fit_transform(df[['duration_sec']])
df[['day']] = scaler.fit_transform(df[['day']])
df[['month']] = scaler.fit_transform(df[['month']])
df[['year']] = scaler.fit_transform(df[['year']])
df[['flight_capacity']] = scaler.fit_transform(df[['flight_capacity']])
df[['ticket_price_T']] = scaler.fit_transform(df[['ticket_price_T']])

In [29]:
y = df['ticket_price_T']
X = df.drop('ticket_price_T', axis=1)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [41]:
linear_model = LinearRegression()
bayesian_ridge = BayesianRidge(max_iter=250, alpha_1=0.01)
elastic_net = ElasticNet(alpha=0.8, l1_ratio=0.7, max_iter=300)

model = StackingRegressor(regressors=[linear_model, bayesian_ridge], 
                          meta_regressor=linear_model)
model.fit(x_train, y_train)

In [42]:
y_train_pred = model.predict(x_train)
y_train_pred




array([0.1710419 , 0.1500318 , 0.07956402, ..., 0.16004439, 0.07647108,
       0.01312851])

In [37]:
y_train_pred_df = pd.DataFrame(y_train_pred, columns=["ticket_price_T"])
x_train_df = pd.DataFrame(x_train, columns=X.columns)
final_df = pd.concat([y_train_pred_df, x_train_df], axis=1)

final_df[['duration_sec']] = scaler.inverse_transform(final_df[['duration_sec']])
final_df[['day']] = scaler.inverse_transform(final_df[['day']])
final_df[['month']] = scaler.inverse_transform(final_df[['month']])
final_df[['year']] = scaler.inverse_transform(final_df[['year']])
final_df[['ticket_price_T']] = scaler.inverse_transform(final_df[['ticket_price_T']])


In [38]:
final_df["ticket_price_T"]

0        7.284436e+06
1        7.284436e+06
2        7.284436e+06
3        7.284436e+06
4        7.284436e+06
             ...     
13964             NaN
13965             NaN
13967             NaN
13968             NaN
13970             NaN
Name: ticket_price_T, Length: 13397, dtype: float64

In [11]:
# We have a regression problem so we have to use regression metrics to evaluate our prediction.
# We can't use roc_auc_score, classification_report or confusion_matrix bcz these are for classification problem not regression.
# The skill or performance of a regression model must be reported as an "error" in those predictions.


In [12]:
# As regression involves predicting continuous values rather than discrete class, in otherwise
# ‌regression refers to predictive modeling problems that involve predicting a numeric value.


In [13]:
# Evaluating Regression Models: 
# You have to calculate 'error metrics' for regression
# Calculating and reporting mean squared error, root mean squared error, and mean absolute error.


In [33]:
# The best possible score is 1 which is obtained when the predicted values are the same as the actual values.
"R-squared: ", round(r2_score(y_train, y_train_pred), 3)


('R-squared: ', 0.0)

In [34]:
# A perfect "mean squared error" value is 0.0, which means that all predictions matched the expected values exactly.
"Mean squared error: ", round(mean_squared_error(y_train, y_train_pred), 3)


('Mean squared error: ', 0.007)

In [35]:
# A perfect "root mean squared error" value is 0.0, which means that all predictions matched the expected values exactly.
"Root mean squared error: ", round(sqrt(mean_squared_error(y_train, y_train_pred)), 3)


('Root mean squared error: ', 0.083)

In [36]:
# A perfect "mean absolute error" value is 0.0, which means that all predictions matched the expected values exactly.
"Mean absolute error: ", round(mean_absolute_error(y_train, y_train_pred), 3)


('Mean absolute error: ', 0.058)