In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os

In [3]:
dataframe = pd.read_csv("./datasets/flight_price_prediction.csv")
print('Data dimensions before cleaning data : ', dataframe.shape)
dataframe = dataframe.dropna()
dataframe = dataframe.drop(['Unnamed: 0','flight',],axis=1)
print('Data dimensions after cleaning data : ',dataframe.shape)

Data dimensions before cleaning data :  (300153, 12)
Data dimensions after cleaning data :  (300153, 10)


In [4]:
def normalize(dataframe):
    dataframe_normalized = dataframe.copy()
    for (column, type) in zip(dataframe.columns, dataframe.dtypes):
        if type == 'object':
            dataframe_normalized[column] = pd.Categorical(pd.factorize(dataframe[column])[0])
        else:
            if column != 'price':
                dataframe_normalized[column] = (dataframe[column] - min(dataframe[column]))/max(dataframe[column])
    return dataframe_normalized


In [5]:
dataframe_normalized = normalize(dataframe)

In [6]:

# adding bias to each row of the data in dataframe before spliting data into train and test data.
dataframe_normalized.insert(0,'bias',1)
dataframe_normalized

Unnamed: 0,bias,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,0,0,0,0,0,0,0,0.026891,0.000000,5953
1,1,0,0,1,0,1,0,0,0.030102,0.000000,5953
2,1,1,0,1,0,2,0,0,0.026891,0.000000,5956
3,1,2,0,2,0,3,0,0,0.028497,0.000000,5955
4,1,2,0,2,0,1,0,0,0.030102,0.000000,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,1,2,5,2,1,4,3,1,0.185631,0.979592,69265
300149,1,2,5,3,1,0,3,1,0.192454,0.979592,77105
300150,1,2,5,1,1,0,3,1,0.260887,0.979592,79099
300151,1,2,5,1,1,4,3,1,0.184026,0.979592,81585


In [7]:
'''
    sample : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html
    This method is used to return a random sample of items from an axis of object.

    This method shuffles the dataframe and then split dataframe into input data(with bias) and target
'''
def input_target_split(df,target):
    df = df.sample(frac=1)
    Y = df[target]
    X = df.drop([target], axis=1)
    return X, Y
X,Y = input_target_split(dataframe_normalized, 'price')

In [8]:
'''
    This method is used to split the both input and target into 80% train and 20% test data
'''
def train_test_split(X,Y, percent):
    size = len(X)
    train_size = int(percent * size)
    return X[0:train_size], Y[0:train_size], X[train_size:], Y[train_size:]


In [9]:
X_train, Y_train, X_test, Y_test = train_test_split(X,Y,0.8)

  return X[0:train_size], Y[0:train_size], X[train_size:], Y[train_size:]


In [10]:
'''
    np.dot -> https://numpy.org/doc/stable/reference/generated/numpy.dot.html
    np.linalg.inv -> https://numpy.org/doc/stable/reference/generated/numpy.linalg.inv.html
    np.identity -> https://numpy.org/doc/stable/reference/generated/numpy.identity.html
'''
class LinearRegression:
    lamda = 0.0
    __weights = np.zeros(0)
    predicted = np.zeros(0)

    def __init__(self,learning_rate,lamda1=0,lamda2=0,iterations=1000):
        self.lamda1 = lamda1
        self.lamda2 = lamda2
        self.num_iterations = iterations
        self.learning_rate = learning_rate
    def init(self):
       self.__weights = np.zeros(0)

    def setLamda(self,lamda):
        self.lamda = lamda

    def get_weights(self):
        return self.__weights

    def set_weights(self, weights):
        self.__weights = weights
    
    def gradient_descent(self,X, Y):
        h = self.predict(X)

        self.__weights = self.__weights - (self.learning_rate/len(Y)) * (2*np.dot(X.T, (h - Y)) + 2*self.lamda2*self.__weights + self.lamda1*np.sign(self.__weights))
        return self.__weights

    # This method is used to train the model based on the train data ( train input and corresponding target values)
    def fit(self,X_train,Y_train):
        loss_list = []
        columns_count = X_train.shape[1]
        self.__weights = np.zeros(columns_count)
        for i in range(self.num_iterations):
            self.__weights = self.gradient_descent(X_train, Y_train)
            if i == self.num_iterations/4 or i == self.num_iterations/2 or i == 3*self.num_iterations/4:
                print('learning rate : {0}, iteration : {1}'.format(self.learning_rate, i))
        return self.__weights
    
    # This method is used to predict the test data and returns the predicted values of test input as list
    def predict(self,X):
        x_test = X.values
        y_pred = [np.dot(self.__weights, x_row) for x_row in x_test]
        self.predicted = y_pred
        return self.predicted

    # This method is used to find the sum of squared errors based on the target values of test data and corresponding predicted data
    def sumSquaredErrors(self,X,Y):
        x_test = X.values

        error = 0.0
        y_pred = self.predicted
        for (target, pred) in zip(Y, y_pred):
            error = error + (target - pred)**2
        return float(error)/float(len(x_test))
        

In [11]:
def run_model_show_stats(model,X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    sserror = model.sumSquaredErrors(X_test, Y_test)
    print('Error : ', sserror)
    
    return y_pred, sserror


In [152]:
df_list = []
weights = np.zeros(0)
total_iter = 0
for iterations in [100,500,1000,2000,3000,5000]:
    model = LinearRegression(0.001,0.1,0.1,iterations)
    model.set_weights(weights)
    y_pred,error = run_model_show_stats(model, X_train, Y_train, X_test, Y_test)
    total_iter = total_iter + iterations
    df_list.append({
        'iterations' : total_iter,
        'error' : error
    })
    weights = model.get_weights()
df_models = pd.DataFrame.from_records(df_list)
df_models

learning rate : 0.001, iteration : 25
learning rate : 0.001, iteration : 50
learning rate : 0.001, iteration : 75
Error :  505324658.9546646
learning rate : 0.001, iteration : 125
learning rate : 0.001, iteration : 250
learning rate : 0.001, iteration : 375
Error :  361117660.0002689
learning rate : 0.001, iteration : 250
learning rate : 0.001, iteration : 500
learning rate : 0.001, iteration : 750
Error :  245291876.69570526
learning rate : 0.001, iteration : 500
learning rate : 0.001, iteration : 1000
learning rate : 0.001, iteration : 1500
Error :  128905005.74263535
learning rate : 0.001, iteration : 750
learning rate : 0.001, iteration : 1500
learning rate : 0.001, iteration : 2250
Error :  82991637.96608138
learning rate : 0.001, iteration : 1250
learning rate : 0.001, iteration : 2500
learning rate : 0.001, iteration : 3750
Error :  56448923.355530076


Unnamed: 0,iterations,error
0,100,505324700.0
1,600,361117700.0
2,1600,245291900.0
3,3600,128905000.0
4,6600,82991640.0
5,11600,56448920.0
