In [87]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from linear_regression import LinearRegressionScratch


In [88]:
# Load the ride-pricing dataset from the working directory and preview
# the first five rows as the cell's rich output.
df = pd.read_csv('dynamic_pricing.csv')
df.head(n=5)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [89]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage. Used here to confirm there are no missing values and to
# see which columns are object-typed (categorical) versus numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.3+ KB


In [90]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, including the target Historical_Cost_of_Ride.
df.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


In [91]:
# Names of every object-dtype column, in frame order; these are the
# categorical features that get one-hot encoded later.
cat_col = [name for name, kind in df.dtypes.items() if kind == 'O']

# NOTE(review): lab_col is not referenced by any cell visible here --
# presumably reserved for a label-encoding experiment; confirm or remove.
lab_col = ['Vehicle_Type']

print(cat_col)

['Location_Category', 'Customer_Loyalty_Status', 'Time_of_Booking', 'Vehicle_Type']


In [92]:
# Numeric feature columns: everything that is not object-typed, excluding
# the regression target so it never leaks into the feature matrix.
numerical_col = [
    name
    for name, kind in df.dtypes.items()
    if not (kind == 'O' or name in ['Historical_Cost_of_Ride'])
]
print(numerical_col)

['Number_of_Riders', 'Number_of_Drivers', 'Number_of_Past_Rides', 'Average_Ratings', 'Expected_Ride_Duration']


In [93]:
# Shared preprocessing step for both models.
# Despite its name, this transformer handles BOTH feature groups:
#   'cat' -> one-hot encodes the columns listed in cat_col
#   'num' -> standardises the columns listed in numerical_col
categorical_transformer = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), cat_col),
        ('num', StandardScaler(), numerical_col),
    ]
)


In [94]:
# NOTE(review): this cell recomputes exactly the same list as the earlier
# numerical_col cell (non-object columns minus the target); it is redundant
# and could be dropped once nothing depends on its printed output.
numerical_col = [
    col
    for col in df.columns
    if col != 'Historical_Cost_of_Ride' and df[col].dtype != 'O'
]
print(numerical_col)

['Number_of_Riders', 'Number_of_Drivers', 'Number_of_Past_Rides', 'Average_Ratings', 'Expected_Ride_Duration']


In [95]:
# Pipeline under test: preprocessing followed by the from-scratch
# gradient-descent linear regression.
# The preprocessor is cloned rather than passed directly: with no `memory`
# configured, Pipeline.fit fits its steps in place, so handing the very same
# ColumnTransformer object to two pipelines would make them share (and
# silently overwrite) a single fitted state.
lr_model = Pipeline(steps=[
    ('preprocessor', clone(categorical_transformer)),
    ('model', LinearRegressionScratch()),
])

In [96]:
# Reference pipeline: the same preprocessing recipe followed by scikit-learn's
# LinearRegression, used as the baseline the scratch model is compared against.
# The preprocessor is cloned rather than passed directly: with no `memory`
# configured, Pipeline.fit fits its steps in place, so handing the very same
# ColumnTransformer object to two pipelines would make them share (and
# silently overwrite) a single fitted state.
sk_model = Pipeline(steps=[
    ('preprocessor', clone(categorical_transformer)),
    ('model', LinearRegression()),
])

In [97]:
# Split the frame into the feature matrix (every column except the target)
# and the regression target, the historical ride cost.
x = df.drop(columns=['Historical_Cost_of_Ride'])
y = df['Historical_Cost_of_Ride']

In [98]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs so both models are scored on the same rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [99]:
# Sanity check of the split sizes: one line for the training pair, one for
# the test pair, each showing the feature shape then the target shape.
print(
    f'{x_train.shape} {y_train.shape}',
    f'{x_test.shape} {y_test.shape}',
    sep='\n',
)

(800, 9) (800,)
(200, 9) (200,)


In [100]:
# Train both pipelines on the same training split.
# The return value of fit is kept under a second name; per scikit-learn's
# Pipeline.fit contract this is expected to be the pipeline object itself.
lrmodel = lr_model.fit(x_train, y_train)
skmodel = sk_model.fit(x_train, y_train)

In [101]:
# Predict ride cost on the held-out rows with each fitted pipeline
# (scratch model first, scikit-learn baseline second).
y_pred_lr, y_pred_sk = (
    lrmodel.predict(x_test),
    skmodel.predict(x_test),
)


In [102]:
# Compare the two models by mean squared error on the held-out test split.
# mean_squared_error is already imported in the notebook's import cell, so the
# duplicate mid-notebook import is dropped here.
error_lr = mean_squared_error(y_test, y_pred_lr)
error_sk = mean_squared_error(y_test, y_pred_sk)

# Label each line with the model it belongs to; previously both lines carried
# the identical "Mean Squared Error:" prefix and could not be told apart.
print(f'Mean Squared Error (LinearRegressionScratch): {error_lr}')
print(f'Mean Squared Error (sklearn LinearRegression): {error_sk}')

Mean Squared Error: 4548.802784082683
Mean Squared Error: 4566.354559655161


In [103]:
# Show the class docstring and method signatures of the scratch model
# (notably fit's learning_rate=0.01 and epochs=1500 defaults).
# NOTE(review): interactive introspection like this is usually removed from a
# finished notebook -- consider replacing it with a short markdown description.
help(LinearRegressionScratch)

Help on class LinearRegressionScratch in module linear_regression:

class LinearRegressionScratch(builtins.object)
 |  A simple linear regression model that uses gradient descent for optimization.
 |  weights -> The weights of the model.
 |  bias    -> The bias of the model.
 |  sample  -> The number of rows in the dataset.
 |  feature ->The number of columns in the dataset.
 |  dw      -> The gradient of the loss function with respect to the weights.
 |  db      -> The gradient of the loss function with respect to the bias.
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit(self, x, y, learning_rate=0.01, epochs=1500)
 |  
 |  predict(self, x)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if 