In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [85]:
df = pd.read_csv("data/Flight_Price.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [86]:
# drop unnamed column as its not of any use 
df.drop("Unnamed: 0", axis=1, inplace=True)

In [87]:
# seperate numerical and categorical column
categorical_col = df.columns[df.dtypes == 'object']
print("categorical Columns: ",categorical_col)

numerical_col = df.columns[df.dtypes != 'object']
print("Numerical Columns: ",numerical_col)

categorical Columns:  Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')
Numerical Columns:  Index(['duration', 'days_left', 'price'], dtype='object')


In [88]:
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


### Using Label Encoding

In [89]:
le = LabelEncoder()

# copy dataframe
df_copy = df.copy()

# Apply label encoder to specific columns
df_copy[categorical_col] = df_copy[categorical_col].apply(le.fit_transform)

df_copy.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,1408,2,2,2,5,5,1,2.17,1,5953
1,4,1387,2,1,2,4,5,1,2.33,1,5953
2,0,1213,2,1,2,1,5,1,2.17,1,5956
3,5,1559,2,4,2,0,5,1,2.25,1,5955
4,5,1549,2,4,2,4,5,1,2.33,1,5955


In [90]:
df_copy.airline.unique()

array([4, 0, 5, 2, 3, 1])

In [91]:
categorical_col

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')

### Using One Hot Encoding

In [108]:
df.shape

(300153, 11)

In [109]:
df_sample = df.iloc[:10000,:]
df_sample.shape

(10000, 11)

In [111]:
categorical_col

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')

In [97]:
# train test split 

X = df_sample.iloc[:,:-1]
y = df_sample.iloc[:,-1]
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [98]:
X_train.shape

(7000, 10)

In [99]:
y_train.shape

(7000,)

In [100]:
# onehot = OneHotEncoder(handle_unknown='ignore')

# X_train_arr = onehot.fit_transform(X_train)
# X_test_arr = onehot.transform(X_test)

In [107]:
X_train_arr

<7000x570 sparse matrix of type '<class 'numpy.float64'>'
	with 70000 stored elements in Compressed Sparse Row format>

In [102]:
from sklearn.svm import SVR

model = SVR()

model.fit(X_train_arr, y_train)

y_pred = model.predict(X_test_arr)

test_model_score = r2_score(y_test, y_pred)

test_model_score    

-0.01711444912180804

In [103]:
# find the accuracy 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2_value = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2_value}')

Mean Absolute Error: 2570.21020075612
Mean Squared Error: 14401689.088822922
Root Mean Squared Error: 3794.9557426698566
R-squared: -0.01711444912180804




In [71]:
# 5953
# AirAsia,I5-744,Delhi,Morning,one,Afternoon,Mumbai,Economy,5.83,1,8869

def create_df(data):
    split_data = data.split(',')
    custom_data_input_dict = {
        'airline':[split_data[0]], 
        'flight':[split_data[1]], 
        'source_city':[split_data[2]], 
        'departure_time':[split_data[3]], 
        'stops':[split_data[4]], 
        'arrival_time':[split_data[5]], 
        'destination_city':[split_data[6]], 
        'class':[split_data[7]], 
        'duration':[split_data[8]], 
        'days_left':[split_data[9]]
    }

    df = pd.DataFrame(custom_data_input_dict)
    return df

In [77]:
# data = "AirAsia,I5-744,Delhi,Morning,one,Afternoon,Mumbai,Economy,5.83,1,8869"
# data = 'Vistara,UK-817,Delhi,Evening,one,Morning,Mumbai,Economy,16.17,1'
# data = 'Indigo,6E-2373,Delhi,Afternoon,one,Evening,Mumbai,Economy,6.0,1'
data = 'Air_India,AI-453,Delhi,Early_Morning,one,Afternoon,Mumbai,Economy,8.83,1'
# 11900
df = create_df(data)
df

scaled_df = onehot.transform(df)
model.predict(scaled_df)


In [104]:
models = {
                'LinearRegression': LinearRegression(),
                'Lasso': Lasso(),
                'Ridge': Ridge(),
                'ElasticNet':ElasticNet(),
                'SVR': SVR()
            }


In [105]:
report = {}

for i in range(len(models)):
    model = list(models.values())[i]

    model.fit(X_train_arr, y_train)

    y_pred = model.predict(X_test_arr)

    test_model_score = r2_score(y_test, y_pred)

    report[list(models.keys())[i]] = test_model_score

In [106]:
best_model_score = max(sorted(report.values()))

best_model_name = list(report.keys())[list(report.values()).index(best_model_score)]

best_model = models[best_model_name]

print(f"Best model found, Model Name: {best_model_name}, R2_Score: {best_model_score}")

Best model found, Model Name: Ridge, R2_Score: 0.7774527798222507


array([2864.48916289])

array([29801.10872196])

In [33]:
categorical_col

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')

In [22]:
col = 'airline'
# Apply label encoder to specific columns
df_copy[categorical_col] = onehot.fit_transform(df_copy[col])
df_copy[col] = onehot.fit_transform(df_copy[[col]]).toarray()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [24]:
df_copy[col].unique()

array([0., 1.])

Findings: Here One hot Encoding will not work 

In [25]:
X = df_copy.iloc[:, :-1]
y = df_copy.iloc[:,-1]

In [26]:
# split data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [27]:
X_train.shape

(210107, 10)

In [28]:
X_test.shape

(90046, 10)

In [29]:
y_train.shape

(210107,)

In [30]:
y_test.shape

(90046,)

In [31]:
models = {
                'LinearRegression': LinearRegression(),
                'Lasso': Lasso(),
                'Ridge': Ridge(),
                'ElasticNet':ElasticNet()
            }


In [32]:
report = {}

for i in range(len(models)):
    model = list(models.values())[i]

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    test_model_score = r2_score(y_test, y_pred)

    report[list(models.keys())[i]] = test_model_score

ValueError: could not convert string to float: 'AI-809'

In [24]:
best_model_score = max(sorted(report.values()))

best_model_name = list(report.keys())[list(report.values()).index(best_model_score)]

best_model = models[best_model_name]

print(f"Best model found, Model Name: {best_model_name}, R2_Score: {best_model_score}")

Best model found, Model Name: Lasso, R2_Score: 0.9045042337349931


In [25]:
report

{'LinearRegression': 0.9045041573995353,
 'Lasso': 0.9045042337349931,
 'Ridge': 0.9045041498838425,
 'ElasticNet': 0.5087415138554229}

In [8]:
report

NameError: name 'report' is not defined

Trainig Model issue

In [103]:
df = pd.read_csv("data/Flight_Price.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [136]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_col = ['airline', 'flight', 'source_city', 'departure_time', 'stops','arrival_time', 'destination_city', 'class']
    numericla_col = ['duration', 'days_left']
    # Numerical Pipeline
    num_pipeline = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
                ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('label_encoding', LabelEncoder())
        ]   
    )

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numericla_col),
        ('cat_pipeline', cat_pipeline, categorical_col)
    ])

    return preprocess

In [137]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

In [138]:
preprocess_obj = get_data_transformation()

In [139]:
target_col_name = 'price'
drop_col = [target_col_name, "Unnamed: 0"]

train_input_feature = train_df.drop(drop_col, axis=1)
target_train_feature = train_df[target_col_name]

test_input_feature = test_df.drop(drop_col, axis=1)
target_test_feature = test_df[target_col_name]

In [140]:
lb = LabelEncoder()
categorical_col = ['airline', 'flight', 'source_city', 'departure_time', 'stops','arrival_time', 'destination_city', 'class']
# lb.fit_transform(train_input_feature)
train_input_feature[categorical_col] = train_input_feature[categorical_col].apply(lb.fit_transform)

In [141]:
train_input_feature_arr = preprocess_obj.fit_transform(train_input_feature)
test_input_feature_arr = preprocess_obj.transform(test_input_feature)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [142]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']
    
    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label_encoding', LabelEncoder())
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Example usage:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

preprocess_obj = get_data_transformation()

target_col_name = 'price'
drop_cols = [target_col_name, "Unnamed: 0"]

train_input_features = train_df.drop(drop_cols, axis=1)
target_train_feature = train_df[target_col_name]

test_input_features = test_df.drop(drop_cols, axis=1)
target_test_feature = test_df[target_col_name]

train_input_feature_arr = preprocess_obj.fit_transform(train_input_features)


TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [148]:
df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

In [149]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [150]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']
    
    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot_encoding', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Example usage:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

preprocess_obj = get_data_transformation()

target_col_name = 'price'
drop_cols = [target_col_name, "Unnamed: 0"]

train_input_features = train_df.drop(drop_cols, axis=1)
target_train_feature = train_df[target_col_name]

test_input_features = test_df.drop(drop_cols, axis=1)
target_test_feature = test_df[target_col_name]

# Corrected fit_transform call
train_input_feature_arr = preprocess_obj.fit_transform(train_input_features)


In [154]:
train_input_feature_arr.shape

(210107, 1585)

In [152]:
target_arr = np.c_[train_input_feature_arr, np.array(target_train_feature)]
# test_arr = np.c_[test_input_feature_arr, np.array(target_test_feature)]

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 210107

In [155]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']

    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Sample DataFrame (replace this with your actual data)
df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

# Get the preprocessing pipeline
preprocess_obj = get_data_transformation()

# Separate input features and target variable for train set
X_train = train_df.drop('price', axis=1)
y_train = train_df['price']

# Separate input features and target variable for test set
X_test = test_df.drop('price', axis=1)
y_test = test_df['price']

# Fit-transform the preprocessing pipeline on the training data
X_train_transformed = preprocess_obj.fit_transform(X_train)

# Transform the test data using the fitted preprocessing pipeline
X_test_transformed = preprocess_obj.transform(X_test)

# Concatenate the transformed features with the target variable for training data
target_arr_train = np.c_[X_train_transformed, y_train]

# Concatenate the transformed features with the target variable for test data
target_arr_test = np.c_[X_test_transformed, y_test]


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 210107

In [None]:
# def initialize_data_transformation(self, train_path, test_path):
#     train_df = pd.read_csv(train_path)
#     test_df = pd.read_csv(test_path)

#     preprocess_obj = self.get_data_transformation()

#     target_col_name = 'price'
#     drop_col = [target_col_name, "Unnamed: 0"]

#     train_input_feature = train_df.drop(drop_col, axis=1)
#     target_train_feature = train_df[target_col_name]

#     test_input_feature = test_df.drop(drop_col, axis=1)
#     target_test_feature = test_df[target_col_name]

#     train_input_feature_arr = preprocess_obj.fit_transform(train_input_feature)
#     test_input_feature_arr = preprocess_obj.transform(test_input_feature)

#     logging.info("Applying Preprocess on Train and Test dataset")

#     save_object(
#         file_path=self.data_transformation_config.preprocessor_obj_file_path,
#         obj = preprocess_obj
#     )

#     logging.info("Preprocessor Pickel file stored")

#     # Use numpy.c_ to concatenate them horizontally
#     logging.info("Concate the transform data and target data")
#     target_arr = np.c_[train_input_feature_arr, np.array(target_train_feature)]
#     test_arr = np.c_[test_input_feature_arr, np.array(target_test_feature)]

#     logging.info("Retunr created train test arrays")
#     return (
#         target_arr,
#         test_arr
#     )

#     # return(
#     #     train_input_feature_arr, 
#     #     np.array(target_train_feature),
#     #     test_input_feature_arr,
#     #     np.array(target_test_feature)
#     # )

In [158]:
class CustomData:
    def _init_(self, airline, flight, source_city, departure_time, stops, arrival_time, destination_city, flight_class, duration, days_left):
        self.airline = airline
        self.flight=flight
        self.source_city=source_city
        self.departure_time=departure_time
        self.stops=stops
        self.arrival_time=arrival_time
        self.destination_city=destination_city
        self.flight_class=flight_class
        self.duration=duration
        self.days_left= days_left

    def get_data_as_dataframe(self):
        custom_data_input_dict = {
            'airline':[self.airline], 
            'flight':[self.flight], 
            'source_city':[self.source_city], 
            'departure_time':[self.departure_time], 
            'stops':[self.stops], 
            'arrival_time':[self.arrival_time], 
            'destination_city':[self.destination_city], 
            'class':[self.flight_class], 
            'duration':[self.duration], 
            'days_left':[self.days_left]
        }

        df = pd.DataFrame(custom_data_input_dict)

        return df
    
customclass = CustomData('SpiceJet','SG-8709','Delhi','Evening','zero','Night','Mumbai','Economy',2.17,1,5953)


TypeError: CustomData() takes no arguments

In [160]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [161]:
df['airline'].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [162]:
df.flight.unique()

array(['SG-8709', 'SG-8157', 'I5-764', ..., '6E-7127', '6E-7259',
       'AI-433'], dtype=object)

In [163]:
df.departure_time.unique()

array(['Evening', 'Early_Morning', 'Morning', 'Afternoon', 'Night',
       'Late_Night'], dtype=object)

In [165]:
df['class'].unique()

array(['Economy', 'Business'], dtype=object)

In [166]:
df.stops.unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [167]:
df.days_left.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
      dtype=int64)

In [2]:
from datetime import datetime

In [3]:
travel_date = '2024-05-12'
travel_date = datetime.strptime(travel_date, '%Y-%m-%d')
current_date = datetime.now()
difference = travel_date - current_date
days_left = difference.days
print(days_left)

79


In [7]:
a = 123.5
a = round(a, 0)
a

124.0