## Importing the data from the `artifact/data` folder

In [68]:
import pandas as pd 
import numpy as np
import os

# Both CSV splits live in <parent of the working directory>/artifact/data.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'artifact', 'data')
path_to_train_data = os.path.join(data_dir, 'train.csv')
path_to_test_data = os.path.join(data_dir, 'test.csv')

train_df = pd.read_csv(path_to_train_data)
test_df = pd.read_csv(path_to_test_data)

# Separate the label column from the feature columns in each split.
target_column = 'price'
X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

# Cell output: the shapes of the four pieces, in train-then-test order.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))


((8010, 9), (8010,), (2671, 9), (2671,))

## Transforming data
**Creating column transformer object for preprocessing**

In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder

def extract_day_and_month(X: pd.Series) -> pd.DataFrame:
    """Split a journey-date column into day-of-month and month features.

    The dates are parsed with the explicit day-first format ``DD/MM/YYYY``
    (e.g. ``'21/03/2019'``). Casting the strings with
    ``astype('datetime64[ns]')`` lets the parser decide the day/month order,
    which swaps day and month for ambiguous values such as ``'05/01/2019'``.

    NOTE(review): artifacts fitted with the previous parsing may have been
    trained on swapped day/month values; consider refitting them.

    Parameters
    ----------
    X : pd.Series
        Journey dates as ``DD/MM/YYYY`` strings.

    Returns
    -------
    pd.DataFrame
        Columns ``journey_day`` and ``journey_month``, indexed like ``X``.

    Raises
    ------
    ValueError
        If a value does not match the ``DD/MM/YYYY`` format.
    """
    journey_dates = pd.to_datetime(X, format='%d/%m/%Y')
    return pd.DataFrame(
        {
            'journey_day': journey_dates.dt.day,
            'journey_month': journey_dates.dt.month,
        }
    )

def calculate_duration_in_minutes(duration: str) -> int:
    """Convert a duration such as ``'2h 50m'`` into a total number of minutes.

    Each whitespace-separated token is read by its unit suffix: ``h`` for
    hours, ``m`` for minutes. Hours-only (``'19h'``) and minutes-only
    (``'5m'``) values are both accepted.

    Parameters
    ----------
    duration : str
        Duration text made of ``<number>h`` and/or ``<number>m`` tokens.

    Returns
    -------
    int
        Total duration in minutes.

    Raises
    ------
    ValueError
        If the text is empty, a token has an unknown unit, or the numeric
        part of a token is not an integer.
    """
    tokens = duration.split()
    if not tokens:
        raise ValueError(f"Empty duration: {duration!r}")

    total_minutes = 0
    for token in tokens:
        unit = token[-1].lower()
        if unit == 'h':
            total_minutes += int(token[:-1]) * 60
        elif unit == 'm':
            # A single-token duration may be minutes-only; the unit suffix
            # decides, not the token's position.
            total_minutes += int(token[:-1])
        else:
            raise ValueError(f"Unrecognised duration token {token!r} in {duration!r}")

    return total_minutes

def convert_duration(X: pd.Series):
    """Turn a column of duration strings into an (n, 1) array of minutes.

    Every element is passed through ``calculate_duration_in_minutes``; the
    resulting values are reshaped into a single column.
    """
    minutes = X.apply(calculate_duration_in_minutes)
    single_column = minutes.values.reshape(-1, 1)
    return single_column

def convert_total_stops(X: pd.Series):
    """Encode the stop-count labels as numbers in an (n, 1) array.

    Labels from ``'non-stop'`` to ``'4 stops'`` map to 0-4. Any label that is
    not in the lookup table comes out as NaN, as ``Series.map`` does.
    """
    stops_by_label = {
        'non-stop': 0,
        '1 stop': 1,
        '2 stops': 2,
        '3 stops': 3,
        '4 stops': 4,
    }
    encoded = X.map(stops_by_label)
    return encoded.values.reshape(-1, 1)

def get_arrival_info(X: pd.Series):
    """Extract the arrival hour and minute from an arrival-time column.

    Values look like ``'19:50 22 May'`` or ``'15:50'``; only the leading
    ``HH:MM`` part (the text before the first space) is used.

    Returns a DataFrame with integer columns ``arrival_hour`` and
    ``arrival_minute``.
    """
    # Drop any trailing date text, then split the clock text at the colon.
    clock_text = X.str.split(' ').str.get(0)
    clock_parts = clock_text.str.split(':')

    hours = clock_parts.str.get(0).astype(int)
    minutes = clock_parts.str.get(1).astype(int)

    return pd.DataFrame({'arrival_hour': hours, 'arrival_minute': minutes})

def get_departure_info(X: pd.Series):
    """Extract the departure hour and minute from ``HH:MM`` strings.

    Returns a DataFrame with integer columns ``departure_hour`` and
    ``departure_minute``.
    """
    clock_parts = X.str.split(':')

    hours = clock_parts.str.get(0).astype(int)
    minutes = clock_parts.str.get(1).astype(int)

    return pd.DataFrame({'departure_hour': hours, 'departure_minute': minutes})

def get_additional_info(X: pd.Series):
    """Lower-case a text column and return it as an (n, 1) array.

    Lower-casing makes values that differ only in letter case identical.
    """
    lowered = X.str.lower()
    return lowered.values.reshape(-1, 1)


def get_transformed_df(array: np.ndarray):
    """Wrap a 12-column feature array in a labelled, integer-typed DataFrame.

    The column labels are fixed and applied in order; every column is cast
    to ``int`` before the frame is returned.
    """
    feature_names = [
        'airline', 'source', 'destination', 'additional_info',
        'duration', 'total_stops',
        'journey_day', 'journey_month',
        'arrival_hour', 'arrival_minute',
        'departure_hour', 'departure_minute',
    ]
    labelled = pd.DataFrame(array, columns=feature_names)
    return labelled.astype(int)


# Text columns that are ordinal-encoded as they are.
categorical_columns = ['airline', 'source', 'destination']

# additional_info is lower-cased first and ordinal-encoded afterwards.
additional_info_pipeline = Pipeline(
    steps=[
        ('lower', FunctionTransformer(get_additional_info)),
        ('encode', OrdinalEncoder()),
    ]
)

# (step name, feature function, input column) for every step that is a plain
# function applied to a single column selected by name.
function_steps = [
    ('duration', convert_duration, 'duration'),
    ('total_stops', convert_total_stops, 'total_stops'),
    ('date_features', extract_day_and_month, 'date_of_journey'),
    ('arrival_info', get_arrival_info, 'arrival_time'),
    ('departure_info', get_departure_info, 'dep_time'),
]

# The order of these steps fixes the order of the output columns.
column_steps = [
    ('travel_info', OrdinalEncoder(), categorical_columns),
    ('additional_info', additional_info_pipeline, 'additional_info'),
] + [
    (step_name, FunctionTransformer(feature_func), column)
    for step_name, feature_func, column in function_steps
]

transformer = ColumnTransformer(transformers=column_steps, remainder='passthrough')

transformer

**Loading the preprocessor and estimator**

In [71]:
from src.utils import load_object

# Both artifacts sit in <parent of the working directory>/artifact/model.
model_dir = os.path.join(os.path.dirname(os.getcwd()), 'artifact', 'model')
path_to_preprocessor = os.path.join(model_dir, 'preprocessor.pkl')
path_to_estimator = os.path.join(model_dir, 'regressor.pkl')

# Preprocessor first, estimator second (same order as before).
preprocessor, model = load_object(path_to_preprocessor), load_object(path_to_estimator)


In [77]:
# Run both splits through the loaded preprocessor, then label the columns.
test_features = preprocessor.transform(X_test)
train_features = preprocessor.transform(X_train)
transformed_x_test = get_transformed_df(test_features)
transformed_x_train = get_transformed_df(train_features)

# Preview two rows of each split, separated by a divider line.
divider = '=' * 100
print(transformed_x_train.head(2), divider, transformed_x_test.head(2), sep='\n')

   airline  source  destination  additional_info  duration  total_stops  \
0        8       1            4                7       135            0   
1        0       0            5                8       165            0   

   journey_day  journey_month  arrival_hour  arrival_minute  departure_hour  \
0            5              1            10              35               8   
1            3              1             2              10              23   

   departure_minute  
0                20  
1                25  
   airline  source  destination  additional_info  duration  total_stops  \
0        3       3            0                7       280            1   
1       10       3            0                7       700            1   

   journey_day  journey_month  arrival_hour  arrival_minute  departure_hour  \
0           24              3            19              50              15   
1            4              1            18              50               7   

   dep

In [86]:
# Use positional access on BOTH objects so the features and the target always
# come from the same row; plain `y_test[45]` is a label lookup and only agrees
# with `.iloc[45]` while the index is the default 0..n-1 range.
X_test.iloc[45].values, y_test.iloc[45]

(array(['IndiGo', '21/03/2019', 'Banglore', 'New Delhi', '13:00', '15:50',
        '2h 50m', 'non-stop', 'No info'], dtype=object),
 np.int64(5694))

In [100]:
# Select the rows whose index label is 1982 from the features and the target.
row_label = 1982
X_test.loc[X_test.index == row_label], y_test.loc[y_test.index == row_label]

(          airline date_of_journey   source destination dep_time  arrival_time  \
 1982  Jet Airways      21/05/2019  Kolkata    Banglore    21:10  19:50 22 May   
 
      duration total_stops              additional_info  
 1982  22h 40m      1 stop  In-flight meal not included  ,
 1982    10844
 Name: price, dtype: int64)

In [105]:
# Predict on both transformed splits with the loaded estimator (test first).
y_pred_test, y_pred_train = (
    model.predict(features)
    for features in (transformed_x_test, transformed_x_train)
)

In [106]:
from sklearn.metrics import r2_score

# R2 of each split, scaled to a percentage.
train_score = 100 * r2_score(y_train, y_pred_train)
test_score = 100 * r2_score(y_test, y_pred_test)

print(f"The model RandomForest Has a R2 score of {train_score : .4f} for train data and {test_score: .4f} for test data...")

The model RandomForest Has a R2 score of  97.5953 for train data and  88.7901 for test data...


In [131]:
# Show the features and the target of the test row at position 971.
row_position = 971
X_test.iloc[row_position], y_test.iloc[row_position]

(airline                                SpiceJet
 date_of_journey                      15/04/2019
 source                                  Kolkata
 destination                            Banglore
 dep_time                                  22:20
 arrival_time                       00:40 16 Apr
 duration                                 2h 20m
 total_stops                            non-stop
 additional_info    No check-in baggage included
 Name: 971, dtype: object,
 np.int64(3841))

In [33]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import  GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Candidate regressors, keyed by the display name used in logs and reports.
# The random forest entry is currently disabled:
#   'Random Forest Regressor' : RandomForestRegressor()
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
}

# Regularisation strengths tried for both Lasso and Ridge.
alpha_grid = [.0001, .001, .01, 1, 10]
# Split criteria tried for the tree-based regressors.
criterion_grid = ['squared_error', 'absolute_error']

# Hyper-parameter grid per model name; an empty grid means "use the defaults".
# Each entry gets its own list copy so the grids stay independent.
params = {
    'Linear Regression': {},
    'Lasso': {'alpha': list(alpha_grid)},
    'Ridge': {'alpha': list(alpha_grid)},
    'Decision Tree Regressor': {'criterion': list(criterion_grid)},
    'Random Forest Regressor': {'criterion': list(criterion_grid)},
}


def evaluate_models(models: "dict[str, BaseEstimator]", params: dict[str, dict], X_train, X_test, y_train, y_test) :
    """Tune, fit and score every estimator in ``models``.

    For each estimator a 2-fold ``GridSearchCV`` (scored by negative mean
    squared error) selects the best parameter combination from ``params``.
    The estimator is then configured with that combination, fitted once on
    the full training data and scored with its own ``score`` method on both
    splits.

    Parameters
    ----------
    models : dict
        Display name -> estimator instance. The instances are modified in
        place: the best parameters are set on them and they are fitted.
    params : dict
        Display name -> parameter grid. A model with no entry is searched
        with an empty grid (i.e. its current settings).
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target.

    Returns
    -------
    dict
        ``{name: {'train_score': ..., 'test_score': ..., 'params': ...}}``
        where the scores are ``estimator.score(...) * 100`` rounded to six
        decimals and ``params`` is the best combination found.
    """
    reports = {}

    for estimator_name, estimator in models.items():
        est: BaseEstimator = estimator
        print(f"Model {estimator_name} is taken")

        # refit=False: the search only has to pick the parameters. The single
        # fit on the full training data happens below on `est` itself, so the
        # model is not trained twice.
        grid = GridSearchCV(est,
                            param_grid=params.get(estimator_name, {}),
                            scoring='neg_mean_squared_error',
                            refit=False,
                            verbose=3, cv=2)
        grid.fit(X_train, y_train)

        best_params = grid.best_params_
        est.set_params(**best_params)
        est.fit(X_train, y_train)

        train_score = est.score(X_train, y_train)
        test_score  = est.score(X_test, y_test)

        print(f'{estimator_name} is train score is {round(train_score*100, 2)}')
        print(f'{estimator_name} is test score is {round(test_score*100, 2)}')

        print('\n')
        print('='*100)

        reports[estimator_name] = {'train_score': round(train_score*100, 6),
                                   'test_score': round(test_score*100, 6),
                                   'params': best_params}

    return reports

In [34]:
target_column = 'price'

# Feature columns come from the training frame and are reused for the test
# frame, so both splits expose the same columns in the same order.
feature_columns = train_df.columns[train_df.columns != target_column]

X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

reports = evaluate_models(models, params, X_train, X_test, y_train, y_test)

Model Linear Regression is taken
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END ..........................., score=-11849916.598 total time=   0.1s
[CV 2/2] END ..........................., score=-13325189.195 total time=   0.1s
Linear Regression is train score is 41.65
Linear Regression is test score is 45.27


Model Lasso is taken
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 1/2] END ..............alpha=0.0001;, score=-11849916.511 total time=   0.0s
[CV 2/2] END ..............alpha=0.0001;, score=-13325189.194 total time=   0.0s
[CV 1/2] END ...............alpha=0.001;, score=-11849915.731 total time=   0.0s
[CV 2/2] END ...............alpha=0.001;, score=-13325189.189 total time=   0.0s
[CV 1/2] END ................alpha=0.01;, score=-11849907.930 total time=   0.0s
[CV 2/2] END ................alpha=0.01;, score=-13325189.132 total time=   0.0s
[CV 1/2] END ...................alpha=1;, score=-11849058.450 total time=   0.0s
[CV 2/2] END 

In [2]:
from dataclasses import dataclass
from typing import Literal
import pandas as pd

@dataclass
class CustomDataObject:
    """One flight record, convertible to a single-row DataFrame."""

    # Categorical fields: the Literal types list the accepted labels.
    airline: Literal['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
        'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
        'Vistara Premium economy', 'Jet Airways Business',
        'Multiple carriers Premium economy', 'Trujet']
    source: Literal['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai']
    destination: Literal['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad']
    additional_info: Literal['No info', 'In-flight meal not included',
                'No check-in baggage included', '1 Short layover', 'No Info',
                '1 Long layover', 'Change airports', 'Business class',
                'Red-eye flight', '2 Long layover']
    total_stops: Literal['non-stop', '2 stops', '1 stop', '3 stops', '4 stops']

    # Free-text fields, kept as raw strings.
    date_of_journey: str
    duration: str
    arrival_time: str
    dep_time: str

    def get_data_as_frame(self) -> pd.DataFrame:
        """Return this record as a DataFrame with exactly one row.

        Each field becomes a column of the same name, in the order given by
        ``column_order`` below.
        """
        column_order = (
            'airline',
            'source',
            'destination',
            'additional_info',
            'total_stops',
            'duration',
            'arrival_time',
            'dep_time',
            'date_of_journey',
        )
        single_row = {column: [getattr(self, column)] for column in column_order}
        return pd.DataFrame(single_row)





In [38]:
import os
from pathlib import Path
import pickle

# NOTE: pickle.load can execute arbitrary code while loading, so only open
# artifact files that come from a trusted source.
pickle_dir = Path(os.path.dirname(os.getcwd())) / 'artifact' / 'model'

# Fitted regressor.
path = pickle_dir / 'regressor.pkl'
with path.open('rb') as file:
    reg = pickle.load(file)

# Fitted preprocessor.
path = pickle_dir / 'preprocessor.pkl'
with path.open('rb') as file:
    preprocessor = pickle.load(file)

# print(preprocessor)
# print(reg)