In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from multiprocessing import Pool
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Read the two source CSVs; both paths are relative to the working directory.
trips_df = pd.read_csv(filepath_or_buffer="Data/datasets/trips_full_data.csv")
distance_df = pd.read_csv(filepath_or_buffer="Data/datasets/Trips_by_Distance.csv")


In [None]:
# Print each frame's column index, then render the first five rows of both.
print("Trips Dataset Columns:", trips_df.columns)
print("Distance Dataset Columns:", distance_df.columns)
display(trips_df.head(5), distance_df.head(5))


In [None]:
# Row-wise: people staying home = population minus those not staying at home.
trips_df['Population_Staying_Home'] = trips_df['Population'].sub(
    trips_df['Population_Not_Staying_at_Home']
)
# Sum the derived column per Date, then show the five largest daily totals.
daily_home_stay = trips_df.groupby('Date')[['Population_Staying_Home']].sum()
daily_home_stay.sort_values('Population_Staying_Home', ascending=False).head(5)


In [None]:
# Trim surrounding whitespace from the header names before selecting columns.
distance_df.columns = distance_df.columns.str.strip()

# Draw on the Axes seaborn returns and style it through that handle.
ax = sns.barplot(x='Distance Bin', y='Number of Participants', data=distance_df)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Number of Participants by Trip Distance")
plt.tight_layout()
plt.show()


In [None]:
# Rows of each trip bin whose People_Count exceeds ten million.
trips_10_25 = trips_df.loc[
    trips_df['Trip_Bin'].eq('10-25') & trips_df['People_Count'].gt(10_000_000)
]
trips_50_100 = trips_df.loc[
    trips_df['Trip_Bin'].eq('50-100') & trips_df['People_Count'].gt(10_000_000)
]

# Distinct dates per bin, and the dates present in both.
dates_10_25 = {*trips_10_25['Date']}
dates_50_100 = {*trips_50_100['Date']}
common_dates = dates_10_25 & dates_50_100

print("Dates >10M (10–25 trips):", sorted(dates_10_25))
print("Dates >10M (50–100 trips):", sorted(dates_50_100))
print("Common Dates:", sorted(common_dates))


In [None]:
# Benchmark input: one million floats in [0, 100). A seeded generator is used so
# every Restart & Run All times the exact same data.
large_data = np.random.default_rng(42).random(1_000_000) * 100

def heavy_function(x):
    """Return ``x`` raised to the power 0.5 (its square root)."""
    return pow(x, 0.5)

# Sequential: call heavy_function once per element in a plain Python loop.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()
sequential_result = [heavy_function(x) for x in large_data]
sequential_time = time.perf_counter() - start_time
print(f"Sequential: {sequential_time:.2f} sec")

# Parallel (8 cores max)
def run_parallel(data, num_processes):
    """Map ``heavy_function`` over ``data`` with a pool of worker processes.

    Parameters
    ----------
    data : iterable
        Items handed one at a time to ``heavy_function`` via ``Pool.map``.
    num_processes : int
        Forwarded to ``Pool(processes=...)``.

    Returns
    -------
    tuple
        ``(result, duration)``: the list produced by ``Pool.map`` and the
        elapsed seconds spent inside the ``map`` call. The pool is created
        before the timer starts and closed after it stops, so pool start-up
        and shutdown are not included in ``duration``.
    """
    # NOTE(review): where multiprocessing uses the "spawn" start method, workers
    # may be unable to import a function defined in a notebook cell -- verify on
    # the target platform.
    with Pool(processes=num_processes) as pool:
        # perf_counter() is monotonic and meant for interval timing, unlike the
        # wall-clock time.time().
        start = time.perf_counter()
        result = pool.map(heavy_function, data)
        duration = time.perf_counter() - start
    return result, duration

# Bind the mapped values to a real name instead of `_`: in IPython `_` holds the
# last cell output, and assigning to it shadows that convenience variable.
parallel_result, parallel_time_8 = run_parallel(large_data, 8)
print(f"Parallel (8 cores): {parallel_time_8:.2f} sec")

# NumPy (baseline): one vectorized call over the whole array.
# perf_counter() is the monotonic, high-resolution interval clock; this call is
# short enough that wall-clock time.time() resolution and jumps matter.
start_np = time.perf_counter()
np_result = np.sqrt(large_data)
np_time = time.perf_counter() - start_np
print(f"NumPy vectorized: {np_time:.2f} sec")


In [None]:
# One bar per strategy, using the three timings measured in the benchmark cell.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(
    ['Sequential', 'Parallel (8)', 'NumPy'],
    [sequential_time, parallel_time_8, np_time],
    color=['red', 'green', 'blue'],
)
ax.set_ylabel('Processing Time (seconds)')
ax.set_title('Sequential vs. Parallel vs. NumPy')
fig.tight_layout()
plt.show()


In [None]:
def average_bin(bin_str):
    """Convert a distance-bin label into one representative number.

    A two-sided label such as ``"10-25"`` yields the midpoint of its two
    bounds (17.5). A label with a single bound -- ``"500+"``, ``"<1"``,
    ``">=500"`` -- yields that bound. Typographic dashes (en dash, em dash,
    minus sign) are treated like a plain hyphen.

    Parameters
    ----------
    bin_str : str
        The bin label to parse.

    Returns
    -------
    float
        Midpoint of the two bounds, or the single bound.

    Raises
    ------
    ValueError
        If a bound cannot be converted with ``float``.
    """
    cleaned = bin_str
    # Normalize typographic dashes so "10–25" splits exactly like "10-25".
    for dash in ("\u2013", "\u2014", "\u2212"):
        cleaned = cleaned.replace(dash, "-")
    # Drop open-ended markers; what is left is the bare bound(s).
    for marker in ("+", "<", ">", "=", "\u2264", "\u2265"):
        cleaned = cleaned.replace(marker, "")

    parts = cleaned.split('-')
    if len(parts) == 2:
        return (float(parts[0]) + float(parts[1])) / 2
    # Any other number of pieces: fall back to the first bound only.
    return float(parts[0])

# Numeric feature: one representative distance per bin label.
distance_df['AvgDistance'] = distance_df['Distance Bin'].apply(average_bin)
# NOTE: this renames the column in place, so any cell that reads
# 'Number of Participants' has to run before this one.
distance_df.rename(columns={'Number of Participants': 'Participants'}, inplace=True)

# Single-feature design matrix (2-D) and target vector (1-D).
X = distance_df[['AvgDistance']]
y = distance_df['Participants']

# Hold out 20% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("R² Score:", r2_score(y_test, y_pred))
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, so take the root of the MSE explicitly; this works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:
# Observed points in blue, the fitted model's predictions over X in red.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(X, y, color='blue', label='Actual')
ax.plot(X, model.predict(X), color='red', linewidth=2, label='Model')
ax.set(
    xlabel="Average Trip Distance",
    ylabel="Participants",
    title="Model: Participants vs. Trip Distance",
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()
