In [None]:
# --- Imports ---
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
from dask.distributed import Client
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# --- Load and Clean Data ---
# Read the raw table and keep only the rows whose "Level" is "National".
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not write into a slice of the raw table (which is
# what triggers pandas' SettingWithCopyWarning).
df = pd.read_csv("Trips_by_Distance.csv")
df = df[df["Level"] == "National"].copy()
df["Date"] = pd.to_datetime(df["Date"])
# ISO week number of each date. The year is not part of this key, so if the
# file spans several years, rows that share a week number are grouped together.
df["Week"] = df["Date"].dt.isocalendar().week

In [None]:
# --- Q1: mean "Population Staying at Home" per week number, as a bar chart ---
weekly_avg = df.groupby("Week")["Population Staying at Home"].mean()

# The pandas plot call returns the Axes it drew on; label that Axes directly.
weekly_ax = weekly_avg.plot.bar(title="Weekly Avg Staying at Home", figsize=(12, 5))
weekly_ax.set_xlabel("Week")
weekly_ax.set_ylabel("Avg Population")
plt.tight_layout()
plt.show()

In [None]:
# --- Q2: dates on which a distance band exceeded 10 million trips ---
# One named threshold so the two filters below cannot drift apart.
TRIP_THRESHOLD = 1e7  # 10 million trips

high_10_25 = df[df["Number of Trips 10-25"] > TRIP_THRESHOLD]
high_50_100 = df[df["Number of Trips 50-100"] > TRIP_THRESHOLD]

plt.figure(figsize=(12, 5))
plt.scatter(high_10_25["Date"], high_10_25["Number of Trips 10-25"], label="10-25 miles")
plt.scatter(high_50_100["Date"], high_50_100["Number of Trips 50-100"], label="50-100 miles")
plt.legend()
plt.title("Dates with >10M Trips")
plt.xlabel("Date")
# The y axis was previously unlabeled; both series plot trip counts.
plt.ylabel("Number of Trips")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# --- Q3 ---
# Lazily load only the three columns the weekly-average benchmark needs.
dtypes = {
    "Population Staying at Home": "float64",
    "Level": "object",
    "Date": "object"
}
# blocksize=None makes dask read each file as ONE partition, so a single CSV
# yields a single task and extra workers have nothing to share. A bounded
# blocksize splits the file into several partitions that can be parsed in
# parallel. If the file is smaller than one block there is still only one
# partition.
# NOTE(review): byte-range splitting assumes no quoted field contains a
# newline - confirm for this file.
df_benchmark = dd.read_csv("Trips_by_Distance.csv", dtype=dtypes,
                           usecols=["Population Staying at Home", "Level", "Date"],
                           blocksize="16MB", assume_missing=True)
# Same preparation as the pandas frame: national rows only, parsed dates,
# ISO week number. Everything here stays lazy until .compute() is called.
df_benchmark = df_benchmark[df_benchmark["Level"] == "National"]
df_benchmark["Date"] = dd.to_datetime(df_benchmark["Date"])
df_benchmark["Week"] = df_benchmark["Date"].dt.isocalendar().week

def calc_weekly_avg(dataset):
    """Return the per-week mean of "Population Staying at Home", materialized.

    `dataset` must support ``groupby("Week")``, selection of the
    "Population Staying at Home" column on the grouped object, and a ``mean()``
    whose result exposes ``.compute()``.
    """
    by_week = dataset.groupby("Week")
    lazy_mean = by_week["Population Staying at Home"].mean()
    return lazy_mean.compute()

# Time the weekly-average computation on local clusters of different sizes.
worker_counts = [1, 4, 8]
for n in worker_counts:
    try:
        print(f"Running with {n} workers:")
        client = Client(n_workers=n, threads_per_worker=1)
        try:
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by wall-clock adjustments.
            start = time.perf_counter()
            calc_weekly_avg(df_benchmark)
            elapsed = time.perf_counter() - start
        finally:
            # Always tear the cluster down, even when the computation fails;
            # otherwise its workers stay alive into the next iteration.
            client.shutdown()
        print(f"{n} workers: {elapsed:.2f} seconds")
    except Exception as e:
        # Best-effort benchmark: report the failure and try the next size.
        print(f"Error with {n} workers: {e}")

In [None]:
# --- Q4 ---
# Model "Number of Trips 5-10" as a function of "Number of Trips 10-25",
# comparing a straight-line fit with a degree-2 polynomial fit.
df_model = df[["Number of Trips 10-25", "Number of Trips 5-10"]].dropna()
X = df_model[["Number of Trips 10-25"]]
y = df_model["Number of Trips 5-10"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear Regression
lin_model = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

# Polynomial Regression
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
poly_model = LinearRegression().fit(X_poly_train, y_train)
y_pred_poly = poly_model.predict(poly.transform(X_test))

# train_test_split shuffles the rows, so X_test is in random order. plt.plot
# joins points in the order given, which would draw the fitted curves as a
# tangle of chords. Sort by the feature value before drawing the lines.
x_test_values = X_test["Number of Trips 10-25"].to_numpy()
plot_order = x_test_values.argsort()
x_sorted = x_test_values[plot_order]

plt.figure(figsize=(10, 5))
plt.scatter(X_test, y_test, label="Actual", color="gray")
plt.plot(x_sorted, y_pred_lin[plot_order], label="Linear", color="blue")
plt.plot(x_sorted, y_pred_poly[plot_order], label="Polynomial", color="red", linestyle="--")
plt.xlabel("Trips 10–25")
plt.ylabel("Trips 5–10")
plt.title("Regression Predictions")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# --- Q5: mean trip count for every distance band, as a bar chart ---
# Column names are "Number of Trips " followed by the band label.
distance_cols = [
    f"Number of Trips {band}"
    for band in (
        "<1", "1-3", "3-5", "5-10", "10-25", "25-50",
        "50-100", "100-250", "250-500", ">=500",
    )
]
avg_by_distance = df[distance_cols].mean()
avg_by_distance.plot.bar(figsize=(12, 5), title="Avg Trip Count by Distance", color="teal")
plt.ylabel("Trip Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# --- Q6: stacked bar chart of the weekly mean of two short-distance bands ---
short_trip_cols = ["Number of Trips 1-3", "Number of Trips 3-5"]
weekly_short_trips = df.groupby("Week")[short_trip_cols].mean()
weekly_short_trips.plot.bar(
    stacked=True, figsize=(12, 5), title="Simplified Weekly Travel Summary")
plt.xlabel("Week")
plt.ylabel("Avg Trip Count")
plt.tight_layout()
plt.show()