In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer

In [26]:
# Load the dataset
data = pd.read_csv('uber.csv')

In [27]:
# Preprocess the dataset
# Convert pickup_datetime to datetime format
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

In [28]:
# Extract year, month, day, hour from pickup_datetime
data['year'] = data['pickup_datetime'].dt.year
data['month'] = data['pickup_datetime'].dt.month
data['day'] = data['pickup_datetime'].dt.day
data['hour'] = data['pickup_datetime'].dt.hour

In [29]:
# Drop pickup_datetime
data.drop('pickup_datetime', axis=1, inplace=True)

In [30]:
# Identify outliers
# Check for outliers in fare_amount
z_scores = np.abs(data['fare_amount'] - data['fare_amount'].mean()) / data['fare_amount'].std()
outliers = data[z_scores > 3]

In [31]:
# Remove outliers
data = data.drop(outliers.index, axis=0)

In [32]:
# Check for correlation
correlation_matrix = data.corr()

  correlation_matrix = data.corr()


In [33]:
# Implement linear regression
X = data[['year', 'month', 'day', 'hour', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
y = data['fare_amount']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [45]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [46]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [47]:
linear_reg = LinearRegression()
linear_reg.fit(X_train_imputed, y_train)

In [48]:
# Evaluate linear regression model
y_pred_linear = linear_reg.predict(X_test_scaled)

r2_score_linear = r2_score(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))

print('Linear regression:')
print('R2 score:', r2_score_linear)
print('RMSE:', rmse_linear)

Linear regression:
R2 score: -20409.617974266992
RMSE: 925.594808812732


In [43]:
# Implement random forest regression
random_forest_reg = RandomForestRegressor()
random_forest_reg.fit(X_train_imputed, y_train)

In [49]:
# Evaluate random forest regression model
y_pred_random_forest = random_forest_reg.predict(X_test)

r2_score_random_forest = r2_score(y_test, y_pred_random_forest)
rmse_random_forest = np.sqrt(mean_squared_error(y_test, y_pred_random_forest))

print('Random forest regression:')
print('R2 score:', r2_score_random_forest)
print('RMSE:', rmse_random_forest)



Random forest regression:
R2 score: 0.9458318156391522
RMSE: 1.5078737215824565


In [56]:
# Compare models
print('Comparison:')
print('Model\t\t\t\t\tR2 score\t\t\tRMSE')
print('Linear regression\t\t', r2_score_linear, '\t\t', rmse_linear)
print('Random forest regression\t', r2_score_random_forest, '\t\t', rmse_random_forest)

Comparison:
Model					R2 score			RMSE
Linear regression		 -20409.617974266992 		 925.594808812732
Random forest regression	 0.9458318156391522 		 1.5078737215824565
