In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the labelled training table and the held-out test table in one pass.
train_df, test_df = map(
    pd.read_csv,
    ('/content/train_data.csv', '/content/test_data.csv'),
)

# Preview the first rows of both frames (shown as a tuple by the notebook).
(train_df.head(), test_df.head())

(   id   airline   flight source_city departure_time stops arrival_time  \
 0   1   Vistara   UK-810   Bangalore  Early_Morning   one        Night   
 1   2  SpiceJet  SG-5094   Hyderabad        Evening  zero        Night   
 2   3   Vistara   UK-846   Bangalore        Morning   one      Evening   
 3   4   Vistara   UK-706     Kolkata        Morning   one      Evening   
 4   5    Indigo  6E-5394     Chennai  Early_Morning  zero      Morning   
 
   destination_city     class  duration  days_left  price  
 0           Mumbai   Economy     14.25         21   7212  
 1          Kolkata   Economy      1.75          7   5292  
 2            Delhi  Business      9.58          5  60553  
 3        Hyderabad   Economy      6.75         28   5760  
 4           Mumbai   Economy      2.00          4  10712  ,
    id    airline  flight source_city departure_time stops   arrival_time  \
 0   1  Air_India  AI-765     Kolkata        Evening   one          Night   
 1   2    Vistara  UK-747       D

In [3]:
# Encoding the labels
# The raw frames are left untouched: encoding is applied to copies so this cell
# is idempotent (re-running it never re-encodes already-encoded columns) and the
# preview displayed by the loading cell stays accurate.
label_encoders = {}
categorical_attribs = ["airline", "source_city", "departure_time", "stops", "arrival_time",
                        "destination_city", "class"]

train_encoded = train_df.copy()
test_encoded = test_df.copy()

for col in categorical_attribs:
  le = LabelEncoder()
  # LabelEncoder.transform raises ValueError on labels it was not fitted on, so
  # fit on the values of BOTH frames. Only feature vocabularies are shared here
  # (no target information), and when the test set contains no new category the
  # resulting codes are identical to fitting on the training column alone.
  le.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
  train_encoded[col] = le.transform(train_df[col])
  test_encoded[col] = le.transform(test_df[col])
  label_encoders[col] = le  # kept so codes can be mapped back with inverse_transform

# Preparing data for splitting
# "id" and "flight" are dropped as identifiers; "price" is the target.
X = train_encoded.drop(["id", "flight", "price"], axis = 1)
y = train_encoded['price']

# Splitting data into train and validation
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.2, random_state = 12)

# Creating and checking model regressor
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# 27 parameter combinations x 3 folds = 81 fits; n_jobs=-1 spreads them over all
# cores. Results are unchanged because the forest's random_state is fixed.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3,
                           scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Testing model
# grid_search.predict delegates to the best estimator refitted on X_train.
y_pred = grid_search.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_valid, y_pred)

# Testing model on main test data
# Select exactly the training feature columns, in training order, instead of
# relying on the test file having the same layout; a missing feature now fails
# here with a clear KeyError rather than inside the model.
X_test = test_encoded[X.columns]
model_predict = grid_search.predict(X_test)

# Reporting validation metrics

print(f"MAE: {mae}, RMSE: {rmse}, R² Score: {r2}")

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 2046.7564895208334, RMSE: 3894.2192376290145, R² Score: 0.9699769377775228


In [4]:
# Pair each test-set id with its predicted price (columns: id, price).
submission_file = test_df[['id']].assign(price=model_predict)

In [5]:
# Persist the predictions without the DataFrame index column.
submission_file.to_csv(path_or_buf='my_submission_012925.csv', index=False)