In [0]:
# Load the cleaned trip table from the catalog and take a first look at it.
GOLD_TRIPS_TABLE = "big_data_workshop.gold_data.clean_df"
df = spark.table(GOLD_TRIPS_TABLE)

# Column names and types first, then a five-row preview of the data.
df.printSchema()
df.show(5)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+---

In [0]:
from pyspark.sql.functions import hour, dayofweek

# Build the modelling frame: three raw columns plus two calendar features
# derived from the pickup timestamp.
pickup_ts = df["tpep_pickup_datetime"]

feature_df = (
    df.select(
        df["trip_distance"],
        df["passenger_count"],
        df["fare_amount"],
        hour(pickup_ts).alias("pickup_hour"),      # hour of day, 0-23
        dayofweek(pickup_ts).alias("pickup_day"),  # Spark: 1 = Sunday ... 7 = Saturday
    )
    .na.drop()  # discard rows with a null in any selected column
)

feature_df.show(5)

+-------------+---------------+-----------+-----------+----------+
|trip_distance|passenger_count|fare_amount|pickup_hour|pickup_day|
+-------------+---------------+-----------+-----------+----------+
|          1.5|              1|        7.0|          0|         3|
|          2.6|              1|       14.0|          0|         3|
|          1.3|              1|        6.5|          0|         3|
|          3.7|              1|       13.5|          0|         3|
|          2.1|              2|       10.0|          0|         3|
+-------------+---------------+-----------+-----------+----------+
only showing top 5 rows


In [0]:
# Upper bounds for the outlier filter, named so they are easy to find and tune.
MAX_TRIP_DISTANCE = 50  # same unit as the trip_distance column
MAX_FARE_AMOUNT = 200   # same unit as the fare_amount column

# Keep only trips strictly below both caps. Applying the same filter twice is
# a no-op, so re-running this cell leaves feature_df unchanged.
# NOTE(review): only upper bounds are enforced here; confirm the upstream
# table already excludes zero/negative distances and fares.
feature_df = feature_df.filter(
    (feature_df.trip_distance < MAX_TRIP_DISTANCE)
    & (feature_df.fare_amount < MAX_FARE_AMOUNT)
)

In [0]:
# Draw a seeded 5% random sample in Spark, then collect it as a pandas
# DataFrame for the scikit-learn steps that follow.
sampled_sdf = feature_df.sample(fraction=0.05, seed=42)
pandas_df = sampled_sdf.toPandas()

pandas_df.head()

Unnamed: 0,trip_distance,passenger_count,fare_amount,pickup_hour,pickup_day
0,2.1,2,10.0,0,3
1,5.2,1,25.5,0,3
2,2.08,2,12.5,0,3
3,2.68,1,16.5,0,3
4,1.07,1,7.5,0,3


In [0]:
from sklearn.model_selection import train_test_split

# Predictor columns and regression target for the fare model.
feature_cols = ["trip_distance", "passenger_count", "pickup_hour", "pickup_day"]
target_col = "fare_amount"

X = pandas_df[feature_cols]
y = pandas_df[target_col]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [0]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline. fit() returns the estimator itself, so
# construction and training can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Fare predictions for the held-out rows.
lr_predictions = lr_model.predict(X_test)

In [0]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the linear baseline on the held-out set, then report both metrics.
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

print("Linear Regression Results")
print("MAE:", lr_mae)
print("R2 Score:", lr_r2)

Linear Regression Results
MAE: 1.6655183027216036
R2 Score: 0.9019952635196555


In [0]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees. random_state fixes the randomness used when
# building the trees; n_jobs=-1 runs fit and predict in parallel on all
# available cores instead of the single-core default.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Fare predictions for the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Same two metrics as the linear baseline, so the models can be compared.
print("Random Forest Results")
print("MAE:", mean_absolute_error(y_test, rf_predictions))
print("R2 Score:", r2_score(y_test, rf_predictions))

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Label each of the forest's feature importances with its input column.
importance = pd.Series(rf_model.feature_importances_, index=X.columns)

# Horizontal bars, sorted ascending so the most important feature ends up on top.
fig, ax = plt.subplots()
importance.sort_values().plot.barh(ax=ax)
ax.set_title("Feature Importance")
plt.show()

In [0]:
# Held-out fares on the x-axis against the random forest's predictions on the
# y-axis.
fig, ax = plt.subplots()
ax.scatter(y_test, rf_predictions)
ax.set_xlabel("Actual Fare")
ax.set_ylabel("Predicted Fare")
ax.set_title("Actual vs Predicted (Random Forest)")
plt.show()

In [0]:
# Side-by-side comparison of both models on the same held-out set.
# The dict preserves insertion order, so rows appear as listed here.
predictions_by_model = {
    "Linear Regression": lr_predictions,
    "Random Forest": rf_predictions,
}

results = pd.DataFrame({
    "Model": list(predictions_by_model),
    "MAE": [mean_absolute_error(y_test, preds) for preds in predictions_by_model.values()],
    "R2": [r2_score(y_test, preds) for preds in predictions_by_model.values()],
})

results