In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
dataFrame = pd.read_csv('taxi_trip_pricing.csv')

In [4]:
for column in dataFrame.columns:
  print(column)

Trip_Distance_km
Time_of_Day
Day_of_Week
Passenger_Count
Traffic_Conditions
Weather
Base_Fare
Per_Km_Rate
Per_Minute_Rate
Trip_Duration_Minutes
Trip_Price


In [5]:
dataFrame.drop(['Time_of_Day', 'Day_of_Week','Weather','Base_Fare','Per_Km_Rate','Per_Minute_Rate'] , axis = 1 , inplace = True)

In [6]:
for column in dataFrame.columns:
  print(column)

Trip_Distance_km
Passenger_Count
Traffic_Conditions
Trip_Duration_Minutes
Trip_Price


In [7]:
dataFrame.head(10)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Traffic_Conditions,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,Low,53.82,36.2624
1,47.59,1.0,High,40.57,
2,36.87,1.0,High,37.27,52.9032
3,30.33,4.0,Low,116.81,36.4698
4,,3.0,High,22.64,15.618
5,8.64,2.0,Medium,89.33,60.2028
6,3.85,4.0,High,5.05,11.2645
7,43.44,3.0,,,101.1216
8,30.45,3.0,High,110.33,
9,35.7,2.0,Low,,75.5657


In [8]:
Traffic_Conditions_mapper = {"Low": 0 , "Medium" : 1 , "High" : 2}
dataFrame['Traffic_Conditions'] = dataFrame['Traffic_Conditions'].replace(Traffic_Conditions_mapper)
dataFrame.head(10)

  dataFrame['Traffic_Conditions'] = dataFrame['Traffic_Conditions'].replace(Traffic_Conditions_mapper)


Unnamed: 0,Trip_Distance_km,Passenger_Count,Traffic_Conditions,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,0.0,53.82,36.2624
1,47.59,1.0,2.0,40.57,
2,36.87,1.0,2.0,37.27,52.9032
3,30.33,4.0,0.0,116.81,36.4698
4,,3.0,2.0,22.64,15.618
5,8.64,2.0,1.0,89.33,60.2028
6,3.85,4.0,2.0,5.05,11.2645
7,43.44,3.0,,,101.1216
8,30.45,3.0,2.0,110.33,
9,35.7,2.0,0.0,,75.5657


In [9]:
dataFrame.isnull().sum()

Trip_Distance_km         50
Passenger_Count          50
Traffic_Conditions       50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

In [10]:
dataFrame['Trip_Distance_km'] = dataFrame['Trip_Distance_km'].fillna(dataFrame['Trip_Distance_km'].mean())
dataFrame['Trip_Duration_Minutes'] = dataFrame['Trip_Duration_Minutes'].fillna(dataFrame['Trip_Duration_Minutes'].mean())
dataFrame['Trip_Price'] = dataFrame['Trip_Price'].fillna(dataFrame['Trip_Price'].mean())
dataFrame['Traffic_Conditions'] = dataFrame['Traffic_Conditions'].apply(
    lambda x: np.random.choice([0, 1, 2]) if np.isnan(x) else x
)
dataFrame['Passenger_Count'] = dataFrame['Passenger_Count'].apply(
    lambda x: np.random.choice([1, 2, 3, 4]) if np.isnan(x) else x
)


In [11]:
dataFrame.isnull().sum() 

Trip_Distance_km         0
Passenger_Count          0
Traffic_Conditions       0
Trip_Duration_Minutes    0
Trip_Price               0
dtype: int64

In [12]:
dataFrame.head(20)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Traffic_Conditions,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,0.0,53.82,36.2624
1,47.59,1.0,2.0,40.57,56.874773
2,36.87,1.0,2.0,37.27,52.9032
3,30.33,4.0,0.0,116.81,36.4698
4,27.070547,3.0,2.0,22.64,15.618
5,8.64,2.0,1.0,89.33,60.2028
6,3.85,4.0,2.0,5.05,11.2645
7,43.44,3.0,1.0,62.118116,101.1216
8,30.45,3.0,2.0,110.33,56.874773
9,35.7,2.0,0.0,62.118116,75.5657


In [13]:
numerical_columns = dataFrame.select_dtypes(include=["int64", "float64"]).columns
for column in numerical_columns:
    correlation = dataFrame[column].corr(dataFrame['Trip_Price'])
    print(f"Correlation between {column:<25} and Trip_Price: {correlation:<20}")

Correlation between Trip_Distance_km          and Trip_Price: 0.8296848519933567  
Correlation between Passenger_Count           and Trip_Price: -0.011938176985961182
Correlation between Traffic_Conditions        and Trip_Price: 0.06336581654463981 
Correlation between Trip_Duration_Minutes     and Trip_Price: 0.21514407367822885 
Correlation between Trip_Price                and Trip_Price: 1.0                 


In [14]:
dataFrame['Traffic_Conditions'].value_counts()


Traffic_Conditions
0.0    415
1.0    386
2.0    199
Name: count, dtype: int64

In [15]:
dataFrame['Passenger_Count'].value_counts()

Passenger_Count
3.0    262
1.0    256
2.0    249
4.0    233
Name: count, dtype: int64

In [16]:
dataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Passenger_Count        1000 non-null   float64
 2   Traffic_Conditions     1000 non-null   float64
 3   Trip_Duration_Minutes  1000 non-null   float64
 4   Trip_Price             1000 non-null   float64
dtypes: float64(5)
memory usage: 39.2 KB


In [17]:
features = dataFrame.iloc[:,:-1]
dependentVariable = dataFrame.iloc[:,-1]

In [18]:
standardScalar = StandardScaler()
features = standardScalar.fit_transform(features)

In [19]:
featuresTrain, featuresTest, dependentVariableTrain, dependentVariableTest = train_test_split(features, dependentVariable , test_size=0.2, random_state=100)

In [20]:
featuresTrain

array([[ 1.13812442,  0.47662127,  1.614397  , -1.08952878],
       [ 1.03137427,  1.37931306,  0.28676789,  0.2848282 ],
       [ 0.49556068,  0.47662127,  1.614397  ,  1.40889577],
       ...,
       [ 5.13004537, -1.32876232, -1.04086122, -1.04770747],
       [-0.06603796, -0.42607053,  0.28676789,  0.15297931],
       [ 0.15674497,  0.47662127,  0.28676789, -0.8229578 ]])

In [21]:
featuresTest

array([[ 8.11169840e-01, -4.26070528e-01,  1.61439700e+00,
        -1.45251226e+00],
       [-9.32931735e-01, -4.26070528e-01,  1.61439700e+00,
         7.09107040e-01],
       [-7.19431424e-01,  4.76621268e-01,  2.86767888e-01,
         1.28279333e+00],
       [ 1.83213883e-16, -1.32876232e+00,  1.61439700e+00,
        -1.24532117e+00],
       [-4.58486599e-01, -4.26070528e-01,  1.61439700e+00,
         2.81538611e-02],
       [-7.68423041e-01,  1.37931306e+00, -1.04086122e+00,
        -1.74845394e+00],
       [-6.43623342e-01, -4.26070528e-01,  2.86767888e-01,
        -5.73945387e-01],
       [ 7.32013736e-02, -4.26070528e-01,  2.86767888e-01,
        -6.66846172e-01],
       [-1.07526528e+00,  1.37931306e+00,  2.86767888e-01,
         4.31362424e-01],
       [-6.56515873e-01, -1.32876232e+00, -1.04086122e+00,
         2.26838413e-16],
       [-2.82017285e-05, -1.32876232e+00,  2.86767888e-01,
        -8.37643149e-01],
       [ 7.94667401e-01,  1.37931306e+00,  1.61439700e+00,
      

In [22]:
dependentVariableTrain

675     69.197200
358    100.266200
159     76.619200
533     55.759800
678     50.731200
          ...    
855     43.862500
871     56.874773
835    224.914663
792     30.579700
520     54.198300
Name: Trip_Price, Length: 800, dtype: float64

In [23]:
dependentVariableTest

249     57.551200
353     38.139200
537     64.982400
424     19.257200
564     46.316000
          ...    
684     60.026400
644     60.315000
110    274.535087
28      64.797100
804     26.269200
Name: Trip_Price, Length: 200, dtype: float64

In [24]:
model = LinearRegression()
model.fit(featuresTrain, dependentVariableTrain)
predections = model.predict(featuresTest)

In [25]:
rSquaredScore = r2_score(dependentVariableTest, predections)
print("The accuracy is {}%".format(round(rSquaredScore, 2) * 100))



The accuracy is 69.0%
