In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

In [None]:
# Read the dataset CSV into a DataFrame, then print a per-column summary
# (dtype and non-null count) so missing values are visible up front.
csv_path = 'M1_final.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Correlation heatmap: pairwise correlation between the numeric columns of
# `data` (non-numeric columns are skipped via numeric_only=True).
corr_jfk = data.corr(numeric_only=True)

# Draw on an explicit Axes so the title and the heatmap are guaranteed to
# land on the same plot.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_jfk, cmap="coolwarm", linewidths=0.5, annot=True, ax=ax)
ax.set_title('Correlation of JFK data features')

# Render explicitly so the cell does not end on a bare Text(...) repr.
plt.show()


In [None]:

# Names of every float64 / int64 column; these are the histogram candidates
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# A single 15x10 figure that will hold a 4x4 grid of panels
hist_fig = plt.figure(figsize=(15, 10))

# One histogram per panel for (at most) the first 16 numerical columns;
# missing values are dropped before binning.
for position, column in enumerate(numerical_columns[:16], start=1):
    panel = hist_fig.add_subplot(4, 4, position)
    observed = data[column].dropna()
    panel.hist(observed, bins=30, color='skyblue', edgecolor='black')
    panel.set_title(f'Histogram of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')

hist_fig.tight_layout()
plt.show()


In [None]:

# Compare several linear models, each trained on ONE feature at a time, at
# predicting the target column. For every (feature, model) pair the test-set
# MSE, R^2 and MAPE are recorded and then plotted.

# Define features and the target
features = ['DEP_DELAY', 'DISTANCE', 'CRS_DEP_M', 'Wind Speed', 'Humidity', 'sch_dep', 'sch_arr']
target = 'TAXI_OUT'

# One {model_name: score} dictionary per feature, for each metric
mse_results = {feature: {} for feature in features}
r2_results = {feature: {} for feature in features}
mapr_results = {feature: {} for feature in features}

# Define models to use
models = {
    'Linear Regression': LinearRegression(),
    # SGD shuffles the training data each epoch; seed it so the reported
    # scores are identical on every Restart & Run All.
    'Gradient Descent': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(alpha=0.1),
}

# The target does not depend on the feature, so select it once
y = data[target]

# Iterate over each feature and train each model
for feature in features:
    X = data[[feature]]

    # Split BEFORE scaling so the scaler never sees the test rows.
    # The fixed seed yields the same row partition for every feature.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training partition only, then apply the same
    # mean / std to the test partition (avoids train-test leakage).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    for model_name, model in models.items():
        # fit() re-estimates the model from scratch, so reusing the same
        # estimator instance across features is safe.
        model.fit(X_train, y_train)

        # Predict and evaluate on the held-out rows
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        absolute_per_error = mean_absolute_percentage_error(y_test, y_pred)

        # Store all three metrics for this (feature, model) pair
        mse_results[feature][model_name] = mse
        r2_results[feature][model_name] = r2
        mapr_results[feature][model_name] = absolute_per_error

# Convert results to DataFrames (rows = features, columns = models)
mse_df = pd.DataFrame(mse_results).T
r2_score_df = pd.DataFrame(r2_results).T * 100  # expressed as a percentage
mapr_df = pd.DataFrame(mapr_results).T


def _plot_metric(metric_df, kind, title, ylabel):
    """Plot one metric table and show the figure.

    Parameters
    ----------
    metric_df : pandas.DataFrame
        Rows are features, columns are models, values are the metric.
    kind : str
        Plot kind forwarded to ``DataFrame.plot`` (e.g. ``'line'``, ``'bar'``).
    title : str
        Figure title.
    ylabel : str
        Label for the y axis.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    metric_df.plot(kind=kind, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Features")
    ax.set_ylabel(ylabel)
    ax.legend(title="Model")
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    plt.show()


# Plot MSE for each feature and model
_plot_metric(
    mse_df,
    kind='line',
    title="Model Evaluation (MSE) for Each Feature",
    ylabel="Mean Squared Error (MSE)",
)

# Plot r2 for each feature and model (values were scaled by 100 above)
_plot_metric(
    r2_score_df,
    kind='bar',
    title="Model Evaluation (r2_score) for Each Feature",
    ylabel="r2 score (%)",
)

# Plot mean absolute percentage error for each feature and model
_plot_metric(
    mapr_df,
    kind='line',
    title="Model Evaluation (Mean_absolute_perc_error ) for Each Feature",
    ylabel="map_error",
)