In [None]:
# Every dependency used by the notebook is imported in this one cell.
try:
    import os

    import requests
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import category_encoders as ce

    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
except ImportError as e:
    # Report the missing package and stop here. Carrying on would leave every
    # import after the failing one undefined, which only shows up cells later
    # as a confusing NameError.
    print(e)
    raise

# read data

In [None]:
# Load the flight dataset; the first CSV column becomes the index.
# The path is assembled with os.path.join so it resolves on every OS: a
# backslash-separated literal is only a directory path on Windows.
df = pd.read_csv(os.path.join("Datasets", "flight_data.csv"), index_col=0)
df

In [None]:
# Summary of the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Walk over the object-dtype columns, printing how many distinct values each
# one has and what they are. The column names are collected in col_lst.
col_lst = []
for name, series in df.items():
    if series.dtype != object:
        continue
    print(name, ":", series.nunique())
    print(name, ":", series.unique())
    col_lst.append(name)

In [None]:
# One shared label -> integer lookup, applied to every column listed below.
# The line grouping is only for readability (presumably one line per source
# column — verify against the data); the dict itself is flat.
mapping_values = {
    'zero': 0, 'one': 1, 'two_or_more': 2,
    "Economy": 0, "Business": 1,
    'Delhi': 0, 'Mumbai': 1, 'Bangalore': 2, 'Kolkata': 3, 'Hyderabad': 4, 'Chennai': 5,
    'SpiceJet': 0, 'AirAsia': 1, 'Vistara': 2, 'GO_FIRST': 3, 'Indigo': 4, 'Air_India': 5,
    'Evening': 1, 'Early_Morning': 2, 'Morning': 0, 'Afternoon': 3, 'Night': 4, 'Late_Night': 5,
}

# Columns whose labels are swapped for their integer code and cast to int.
columns_to_encode = [
    'stops',
    'class',
    'source_city',
    'destination_city',
    'airline',
    'departure_time',
    'arrival_time',
]
for column in columns_to_encode:
    df[column] = df[column].replace(mapping_values).astype(int)
df

In [None]:
# Re-check the dtypes after the label -> integer replacement above.
df.info()

In [None]:
# Replace each distinct 'flight' label with an integer id. The fitted encoder
# is kept so the ids can be translated back with inverse_transform.
label_encoder = LabelEncoder().fit(df['flight'])
df['flight'] = label_encoder.transform(df['flight'])
df

In [None]:
# Distinct encoded flight ids in ascending order, drawn as a line.
values = sorted(df['flight'].unique())
plt.plot(values)

In [None]:
# Translate the integer ids back to the labels the encoder was fitted on and
# keep them next to the ids.
# NOTE(review): the decoded labels are presumably strings, so this adds a
# non-numeric column to df; numeric-only steps on df (correlation, model
# fitting) need to exclude 'flight_decoded'.
df['flight_decoded'] = label_encoder.inverse_transform(df['flight'])

df

In [None]:
# JSON string of the id / decoded-label columns (to_json default orientation),
# built from every row of df.
encoding_decoding_pair = df[['flight',"flight_decoded"]].to_json()

In [None]:
# Display the JSON string built from the two flight columns.
# NOTE(review): it covers every row of df, so this output can be very large.
encoding_decoding_pair

In [None]:
# Build the id -> flight-label lookup that gets persisted for later decoding.
# Only the distinct pairs are kept: serialising the two columns as-is writes one
# record per dataset row, i.e. the same pair over and over.
flight_lookup = (
    df[['flight', "flight_decoded"]]
    .drop_duplicates()
    .sort_values('flight')
)
json_data = flight_lookup.to_json(orient='records')

# Save the JSON data to a file (explicit encoding so the result does not depend
# on the platform default).
with open('encoding_decoding_pair.json', 'w', encoding='utf-8') as file:
    file.write(json_data)

In [None]:
# Two rescaled copies of the encoded flight id are stored as extra columns:
# one from StandardScaler, one from MinMaxScaler.
flight_ids = df[['flight']]

standardscaler = StandardScaler()
minmaxscaler = MinMaxScaler()

df["standardized_data"] = standardscaler.fit_transform(flight_ids)
df["normalized_data"] = minmaxscaler.fit_transform(flight_ids)

In [None]:
# Show the frame with the two scaled columns added.
df

In [None]:
# Line plot of the target column against the row index.
df['price'].plot()

In [None]:
# Pairwise correlations between the numeric columns only. df also carries the
# decoded flight labels, and recent pandas versions raise on non-numeric
# columns in corr() instead of silently dropping them.
corr_matrix = df.select_dtypes(include='number').corr()
# Plot the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Hold out 0.1% of the rows as a test set (fixed seed for a repeatable split).
# Only numeric columns go into the split: 'flight_decoded' holds the decoded
# flight labels for the JSON lookup, and leaving it in would put a non-numeric
# feature into X_train / X_test and make every estimator's fit() fail.
model_df = df.select_dtypes(include='number')
train_set, test_set = train_test_split(model_df, test_size=0.001, random_state=42)
print("train",len(train_set))
print("test",len(test_set))

In [None]:
# Renumber the training rows 0..n-1, discarding the shuffled original index.
train_set = train_set.reset_index(drop=True)
train_set

In [None]:
# Training features: every column of the training split except the target.
X_train = train_set.drop(columns=['price'])
X_train


In [None]:
# Training target: the 'price' column of the training split.
y_train = train_set['price']
y_train

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; fit() returns the estimator itself, which is
# then shown as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# Renumber the held-out rows, then separate the features from the target.
test_set = test_set.reset_index(drop=True)
y_test = test_set['price']
X_test = test_set.drop(columns=['price'])

In [None]:
# Linear-model price predictions for the held-out rows.
price_pred = lin_reg.predict(X_test)

In [None]:


# Score the linear model's predictions against the held-out prices.
mae, mse, r2 = (
    metric(y_test, price_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

# Print the metrics
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{label}: {value:.2f}")

In [None]:
# Mean actual price versus mean predicted price on the held-out rows.
print(y_test.mean())
print(price_pred.mean())

In [None]:
# Actual and predicted prices for each held-out sample, overlaid on one axis.
fig, ax = plt.subplots(figsize=(10, 6))
for series, label, color, marker in (
    (y_test, 'Actual Values', 'b', 'o'),
    (price_pred, 'Predicted Values', 'r', 'x'),
):
    ax.scatter(range(len(series)), series, label=label, color=color, marker=marker)
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree (same seed as the train/test split) so that re-running the
# notebook reproduces the same model and the same metrics.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)


In [None]:
# Predict with the decision tree and score it on the held-out rows.
tree_pred = tree_reg.predict(X_test)

mae, mse, r2 = (
    metric(y_test, tree_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

# Print the metrics
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{label}: {value:.2f}")

# Tree-specific names for the same MSE / RMSE; the RMSE is the cell output.
tree_mse = mse
tree_rmse = rmse
tree_rmse

In [None]:
# Actual and tree-predicted prices for each held-out sample, on one axis.
fig, ax = plt.subplots(figsize=(10, 6))
for series, label, color, marker in (
    (y_test, 'Actual Values', 'b', 'o'),
    (tree_pred, 'Predicted Values', 'r', 'x'),
):
    ax.scatter(range(len(series)), series, label=label, color=color, marker=marker)
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree on the TRAINING data. The held-out test
# set is reserved for the final check: cross-validating on it would refit the
# model on test rows and, with only 0.1% of the data, give very noisy folds.
scores = cross_val_score(tree_reg, X_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-scores)


In [None]:
def display_scores(scores):
    """Print the given scores, then their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    for label, stat in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat)())
# Per-fold RMSE of the decision tree, with mean and standard deviation.
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model on the TRAINING data; the
# held-out test set stays untouched for the final evaluation.
lin_scores = cross_val_score(lin_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the train/test split): bootstrapping and
# feature sub-sampling are random, so an unseeded forest gives different
# metrics on every run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)


In [None]:
# Predict with the random forest and score it on the held-out rows.
forest_pred = forest_reg.predict(X_test)

mae, mse, r2 = (
    metric(y_test, forest_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

# Print the metrics
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{label}: {value:.2f}")


In [None]:
# 10-fold cross-validation of the random forest on the TRAINING data; the
# held-out test set stays untouched for the final evaluation.
# The results get their own names so they no longer overwrite the linear
# model's lin_scores / lin_rmse_scores.
forest_scores = cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)


In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 = 12 combinations with the default bootstrap setting,
# plus 2 x 3 = 6 with bootstrapping disabled. With cv=5 that is 18 x 5 = 90 fits.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Seeded base estimator, so candidates differ only in their hyper-parameters
# and the search result is reproducible. GridSearchCV fits clones of it.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# The estimator refitted by the grid search with the winning parameters.
grid_search.best_estimator_


In [None]:
# List every candidate's cross-validated RMSE next to its parameters. The
# stored scores are negated MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
for params, neg_mse in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-neg_mse), params)

In [None]:
# Feature importances of the best forest found by the grid search
# (presumably in the column order of X_train — confirm before labelling them).
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances