In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv("data/Flight_Price.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [3]:
# drop unnamed column as its not of any use 
df.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
# seperate numerical and categorical column
categorical_col = df.columns[df.dtypes == 'object']
print("categorical Columns: ",categorical_col)

numerical_col = df.columns[df.dtypes != 'object']
print("Numerical Columns: ",numerical_col)

categorical Columns:  Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')
Numerical Columns:  Index(['duration', 'days_left', 'price'], dtype='object')


In [5]:
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


### Using Label Encoding

In [79]:
le = LabelEncoder()

# copy dataframe
df_copy = df.copy()

# Apply label encoder to specific columns
df_copy[categorical_col] = df_copy[categorical_col].apply(le.fit_transform)

df_copy.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,1408,2,2,2,5,5,1,2.17,1,5953
1,4,1387,2,1,2,4,5,1,2.33,1,5953
2,0,1213,2,1,2,1,5,1,2.17,1,5956
3,5,1559,2,4,2,0,5,1,2.25,1,5955
4,5,1549,2,4,2,4,5,1,2.33,1,5955


In [81]:
df_copy.airline.unique()

array([4, 0, 5, 2, 3, 1])

In [45]:
categorical_col

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class'],
      dtype='object')

### Using One Hot Encoding

In [77]:
onehot = OneHotEncoder()

# copy dataframe
df_copy = df.copy()

In [78]:
df_copy['airline'].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [65]:
type(df_copy['airline'])

pandas.core.series.Series

In [66]:
type(df_copy[['airline']])

pandas.core.frame.DataFrame

In [67]:
type(df_copy)

pandas.core.frame.DataFrame

In [73]:
col = 'airline'
# Apply label encoder to specific columns
# df_copy[col] = onehot.fit_transform(df_copy[col])
df_copy[col] = onehot.fit_transform(df_copy[[col]]).toarray()

In [76]:
df.airline.unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [75]:
df_copy[col].unique()

array([0., 1.])

Findings: Here One hot Encoding will not work 

In [14]:
X = df_copy.iloc[:, :-1]
y = df_copy.iloc[:,-1]

In [15]:
# split data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [16]:
X_train.shape

(210107, 10)

In [17]:
X_test.shape

(90046, 10)

In [18]:
y_train.shape

(210107,)

In [19]:
y_test.shape

(90046,)

In [22]:
models = {
                'LinearRegression': LinearRegression(),
                'Lasso': Lasso(),
                'Ridge': Ridge(),
                'ElasticNet':ElasticNet()
            }


In [23]:
report = {}

for i in range(len(models)):
    model = list(models.values())[i]

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    test_model_score = r2_score(y_test, y_pred)

    report[list(models.keys())[i]] = test_model_score

In [24]:
best_model_score = max(sorted(report.values()))

best_model_name = list(report.keys())[list(report.values()).index(best_model_score)]

best_model = models[best_model_name]

print(f"Best model found, Model Name: {best_model_name}, R2_Score: {best_model_score}")

Best model found, Model Name: Lasso, R2_Score: 0.9045042337349931


In [25]:
report

{'LinearRegression': 0.9045041573995353,
 'Lasso': 0.9045042337349931,
 'Ridge': 0.9045041498838425,
 'ElasticNet': 0.5087415138554229}

Trainig Model issue

In [103]:
df = pd.read_csv("data/Flight_Price.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [136]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_col = ['airline', 'flight', 'source_city', 'departure_time', 'stops','arrival_time', 'destination_city', 'class']
    numericla_col = ['duration', 'days_left']
    # Numerical Pipeline
    num_pipeline = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
                ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('label_encoding', LabelEncoder())
        ]   
    )

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numericla_col),
        ('cat_pipeline', cat_pipeline, categorical_col)
    ])

    return preprocess

In [137]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

In [138]:
preprocess_obj = get_data_transformation()

In [139]:
target_col_name = 'price'
drop_col = [target_col_name, "Unnamed: 0"]

train_input_feature = train_df.drop(drop_col, axis=1)
target_train_feature = train_df[target_col_name]

test_input_feature = test_df.drop(drop_col, axis=1)
target_test_feature = test_df[target_col_name]

In [140]:
lb = LabelEncoder()
categorical_col = ['airline', 'flight', 'source_city', 'departure_time', 'stops','arrival_time', 'destination_city', 'class']
# lb.fit_transform(train_input_feature)
train_input_feature[categorical_col] = train_input_feature[categorical_col].apply(lb.fit_transform)

In [141]:
train_input_feature_arr = preprocess_obj.fit_transform(train_input_feature)
# test_input_feature_arr = preprocess_obj.transform(test_input_feature)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [142]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']
    
    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label_encoding', LabelEncoder())
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Example usage:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

preprocess_obj = get_data_transformation()

target_col_name = 'price'
drop_cols = [target_col_name, "Unnamed: 0"]

train_input_features = train_df.drop(drop_cols, axis=1)
target_train_feature = train_df[target_col_name]

test_input_features = test_df.drop(drop_cols, axis=1)
target_test_feature = test_df[target_col_name]

train_input_feature_arr = preprocess_obj.fit_transform(train_input_features)


TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [148]:
df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

In [149]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [150]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']
    
    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot_encoding', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Example usage:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

preprocess_obj = get_data_transformation()

target_col_name = 'price'
drop_cols = [target_col_name, "Unnamed: 0"]

train_input_features = train_df.drop(drop_cols, axis=1)
target_train_feature = train_df[target_col_name]

test_input_features = test_df.drop(drop_cols, axis=1)
target_test_feature = test_df[target_col_name]

# Corrected fit_transform call
train_input_feature_arr = preprocess_obj.fit_transform(train_input_features)


In [154]:
train_input_feature_arr.shape

(210107, 1585)

In [152]:
target_arr = np.c_[train_input_feature_arr, np.array(target_train_feature)]
# test_arr = np.c_[test_input_feature_arr, np.array(target_test_feature)]

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 210107

In [155]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def get_data_transformation():
    categorical_cols = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
    numerical_cols = ['duration', 'days_left']

    # Numerical Pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical Pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocess = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ])

    return preprocess

# Sample DataFrame (replace this with your actual data)
df = pd.read_csv('https://raw.githubusercontent.com/Kashyap-08/Flight_Price_Prediction/master/notebooks/data/Flight_Price.csv')

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=23)

# Get the preprocessing pipeline
preprocess_obj = get_data_transformation()

# Separate input features and target variable for train set
X_train = train_df.drop('price', axis=1)
y_train = train_df['price']

# Separate input features and target variable for test set
X_test = test_df.drop('price', axis=1)
y_test = test_df['price']

# Fit-transform the preprocessing pipeline on the training data
X_train_transformed = preprocess_obj.fit_transform(X_train)

# Transform the test data using the fitted preprocessing pipeline
X_test_transformed = preprocess_obj.transform(X_test)

# Concatenate the transformed features with the target variable for training data
target_arr_train = np.c_[X_train_transformed, y_train]

# Concatenate the transformed features with the target variable for test data
target_arr_test = np.c_[X_test_transformed, y_test]


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 210107

In [None]:
# def initialize_data_transformation(self, train_path, test_path):
#     train_df = pd.read_csv(train_path)
#     test_df = pd.read_csv(test_path)

#     preprocess_obj = self.get_data_transformation()

#     target_col_name = 'price'
#     drop_col = [target_col_name, "Unnamed: 0"]

#     train_input_feature = train_df.drop(drop_col, axis=1)
#     target_train_feature = train_df[target_col_name]

#     test_input_feature = test_df.drop(drop_col, axis=1)
#     target_test_feature = test_df[target_col_name]

#     train_input_feature_arr = preprocess_obj.fit_transform(train_input_feature)
#     test_input_feature_arr = preprocess_obj.transform(test_input_feature)

#     logging.info("Applying Preprocess on Train and Test dataset")

#     save_object(
#         file_path=self.data_transformation_config.preprocessor_obj_file_path,
#         obj = preprocess_obj
#     )

#     logging.info("Preprocessor Pickel file stored")

#     # Use numpy.c_ to concatenate them horizontally
#     logging.info("Concate the transform data and target data")
#     target_arr = np.c_[train_input_feature_arr, np.array(target_train_feature)]
#     test_arr = np.c_[test_input_feature_arr, np.array(target_test_feature)]

#     logging.info("Retunr created train test arrays")
#     return (
#         target_arr,
#         test_arr
#     )

#     # return(
#     #     train_input_feature_arr, 
#     #     np.array(target_train_feature),
#     #     test_input_feature_arr,
#     #     np.array(target_test_feature)
#     # )