# Regression

By Ishan Sharma

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the taxi trip pricing dataset. The path is relative, so it is resolved
# against the directory the notebook kernel is running in.
taxi_trip_df = pd.read_csv(filepath_or_buffer="../data/taxi_trip_pricing.csv")

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Before proceeding, I need to drop the rows where the target column (`Trip_Price`) is NaN: a row with no target value cannot be used to train or evaluate the model.

In [4]:
# Keep only the rows that have a target value. The result is rebound to the
# same name instead of using inplace=True, so the frame object produced by
# read_csv is not mutated behind the reader's back and the cell stays a plain
# "input -> output" step.
taxi_trip_df = taxi_trip_df.dropna(subset=["Trip_Price"])

In [5]:
# One-hot encode the categorical predictors as 0/1 integer indicator columns.
# NOTE(review): get_dummies is called with its default dummy_na=False, so a row
# whose category is missing gets 0 in every indicator of that variable.
categorical_columns = ["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"]
X = pd.get_dummies(taxi_trip_df[categorical_columns], dtype=int)

In [6]:
# Add the base fare as a predictor, filling missing values with the column median.
# NOTE(review): the median is taken over every row, including rows that the
# later train_test_split assigns to the test set.
base_fare = taxi_trip_df["Base_Fare"]
X["Base_Fare"] = base_fare.fillna(base_fare.median())

In [7]:
# Add the per-kilometre rate as a predictor, filling missing values with the
# column median.
per_km_rate = taxi_trip_df["Per_Km_Rate"]
X["Per_Km_Rate"] = per_km_rate.fillna(per_km_rate.median())

In [8]:
# Add the per-minute rate as a predictor, filling missing values with the
# column median.
per_minute_rate = taxi_trip_df["Per_Minute_Rate"]
X["Per_Minute_Rate"] = per_minute_rate.fillna(per_minute_rate.median())

In [9]:
# Add the trip duration as a predictor, filling missing values with the
# column median.
trip_duration = taxi_trip_df["Trip_Duration_Minutes"]
X["Trip_Duration_Minutes"] = trip_duration.fillna(trip_duration.median())

In [10]:
# Add the trip distance as a predictor, filling missing values with the
# column median.
trip_distance = taxi_trip_df["Trip_Distance_km"]
X["Trip_Distance_km"] = trip_distance.fillna(trip_distance.median())

In [11]:
# Regression target: the trip price of every remaining row.
y = taxi_trip_df.loc[:, "Trip_Price"]

In [12]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1337,
)

# Fit the linear model on the training rows only (fit returns the estimator).
LR = LinearRegression().fit(X_train, y_train)

# Score the fitted model on both partitions and report the two numbers together.
test_score = LR.score(X_test, y_test)
train_score = LR.score(X_train, y_train)
print(f'Train score {train_score}, test score {test_score}')

Train score 0.8757081513259299, test score 0.843433621202043


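This is a pretty good model: it explains about 88% of the variance in `Trip_Price` on the training set ($R^2 \approx 0.876$) and about 84% on the held-out test set ($R^2 \approx 0.843$), so it generalizes to unseen trips with only a small drop in performance.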

In [19]:
# Pair every feature name with its fitted coefficient. Building the frame from
# a dict gives the two columns distinct, meaningful labels; concatenating two
# unnamed frames left both columns labelled 0, which made the table ambiguous
# to read and to index. The intercept (LR.intercept_) is not part of coef_ and
# is therefore not listed here.
pd.DataFrame({"feature": X.columns, "coefficient": LR.coef_})

Unnamed: 0,0,0.1
0,Time_of_Day_Afternoon,-1.886811
1,Time_of_Day_Evening,-2.977867
2,Time_of_Day_Morning,-2.236602
3,Time_of_Day_Night,-3.265308
4,Day_of_Week_Weekday,-1.787519
5,Day_of_Week_Weekend,-1.861816
6,Traffic_Conditions_High,-2.341083
7,Traffic_Conditions_Low,-2.255336
8,Traffic_Conditions_Medium,-2.442432
9,Weather_Clear,1.265286


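The model is shown below, where $\hat{y}$ is the predicted trip price, each $x$ is the value of the corresponding feature (0 or 1 for the one-hot encoded categories), and $\beta_0$ is the fitted intercept (`LR.intercept_`), which is not listed in the coefficient table above:

$\hat{y} = \beta_0 - 1.886811 x_{TimeOfDayAfternoon} - 2.977867 x_{TimeOfDayEvening} - 2.236602 x_{TimeOfDayMorning} - 3.265308 x_{TimeOfDayNight} - 1.787519 x_{DayOfWeekWeekday} - 1.861816 x_{DayOfWeekWeekend} - 2.341083 x_{TrafficConditionsHigh} - 2.255336 x_{TrafficConditionsLow} - 2.442432 x_{TrafficConditionsMedium} + 1.265286 x_{WeatherClear} + 1.204347 x_{WeatherRain} + 5.895151 x_{WeatherSnow} + 0.936819 x_{BaseFare} + 24.192633 x_{PerKmRate} + 57.727301 x_{PerMinuteRate} + 0.302297 x_{TripDurationMinutes} + 1.677901 x_{TripDistancekm}$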