<a href="https://colab.research.google.com/github/Gouthammajjari/Intro_to_ml/blob/main/project/Model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [2]:

#Use the URL for the raw CSV data
url = 'https://raw.githubusercontent.com/Gouthammajjari/Intro_to_ml/main/project/data_preprocessed.csv'
df = pd.read_csv(url)

# Display the first 5 rows of the DataFrame
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,day,hour,weekday,month,year
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,7,19,3,5,2015
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,17,20,4,7,2009
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,24,21,0,8,2009
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,26,8,4,6,2009
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,28,17,3,8,2014


In [3]:

X = df.drop('fare_amount',axis=1)
y = df['fare_amount']


In [4]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Linear Regression(without standardizing the inputs)

In [5]:

model = LinearRegression()
model.fit(X_train, y_train)

In [6]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290913, 0.6186443704275573)

In [7]:
y_pred = model.predict(X_test)

In [8]:

mse = mean_squared_error(y_test, y_pred)
mse

38.82264089139763

In [9]:

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(2.4913451372102484, 0.6186443704275573)

In [10]:
rmse = np.sqrt(mse)
rmse

6.230781723941036

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Using MinMax Scaler to normalize the inputs


In [12]:
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

In [13]:
X = df.drop('fare_amount',axis=1)
y = df[['fare_amount']]

In [14]:

X = pd.DataFrame(scaler_x.fit_transform(X), columns=X.columns, index=X.index).values
y = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns, index=y.index).values


In [15]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:

model = LinearRegression()
model.fit(X_train, y_train)

In [17]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290917, 0.6186443704275568)

In [18]:
y_pred = model.predict(X_test)

In [19]:
mse = mean_squared_error(y_test, y_pred)
mse

0.00015591984351807423

In [20]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(0.004992775681296209, 0.6186443704275568)

In [21]:

rmse = np.sqrt(mse)
rmse


0.012486786757131486

# Using Standard Scaler to normalize the inputs

In [22]:
scaler_x = StandardScaler()
scaler_y = StandardScaler()


In [23]:
X = df.drop('fare_amount',axis=1)
y = df[['fare_amount']]

In [24]:
X = pd.DataFrame(scaler_x.fit_transform(X), columns=X.columns, index=X.index).values
y = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns, index=y.index).values

In [25]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:

model = LinearRegression()
model.fit(X_train, y_train)

In [27]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290915, 0.6186443704275573)

In [28]:
y_pred = model.predict(X_test)

In [29]:

mse = mean_squared_error(y_test, y_pred)
mse


0.3976170686868417

In [30]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(0.25212961969527936, 0.6186443704275573)

In [31]:
rmse = np.sqrt(mse)
rmse

0.6305688453189244

# Regularization model(Ridge Regression)

In [32]:
from sklearn.linear_model import Ridge
rid = Ridge()
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6882086693932788, 0.6183417185078421)

In [33]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2


(0.3979326260296939, 0.25184038856211594, 0.6183417185078421)

In [34]:

rmse = np.sqrt(mse)
rmse

0.630819012102278

Using Regularization term (lambda = 5)

In [35]:

from sklearn.linear_model import Ridge
rid = Ridge(alpha = 5.0)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6874688613976456, 0.6176425872553735)

In [36]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.39866156903637145, 0.2519509772511201, 0.6176425872553735)

In [37]:
rmse = np.sqrt(mse)
rmse

0.6313965228256895

Using Regularization term (lambda = 10)

In [38]:
from sklearn.linear_model import Ridge
rid = Ridge(alpha = 10.0)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6872108146688205, 0.6173999162977113)

In [39]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2


(0.3989145877605193, 0.252032396286686, 0.6173999162977113)

In [40]:
rmse = np.sqrt(mse)
rmse

0.6315968554074025

Using Regularization term (lambda = 0.1)

In [41]:
from sklearn.linear_model import Ridge
rid = Ridge(alpha = 0.1)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6885157631567227, 0.6186353351321203)

In [42]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.397626489257584, 0.2520520785429989, 0.6186353351321203)

In [43]:
rmse = np.sqrt(mse)
rmse

0.6305763151733372