CS 109 Extension Project

In [1]:
# Smoke test: confirms the kernel is running and executing cells.
greeting = "Hello World!"
print(greeting)

Hello World!


In [1]:
# Goal: download one subwaydata.nyc CSV archive per calendar day of `year`.
#
# The cell is safe to re-run: archives that are already on disk are skipped,
# and a download only appears under its final name once it has completed.
import calendar
import os
import time

import requests

# Calendar year to fetch; one archive is requested for every day in it.
year = 2023
# Seconds to wait on the server before abandoning a request. Without a
# timeout a stalled connection would block the notebook indefinitely.
request_timeout_seconds = 60
# Pause after a connection error before moving on to the next file.
error_backoff_seconds = 5
# Size of the pieces streamed to disk, so a whole archive is never held in RAM.
chunk_size_bytes = 1024 * 1024

# make a directory for the data
download_directory = "data"
# Create the directory if it doesn't exist
os.makedirs(download_directory, exist_ok=True)

# Number of days in each month of `year` (leap years handled by `calendar`).
days = {month: calendar.monthrange(year, month)[1] for month in range(1, 13)}

for i in range(1, 13):
    day = days[i]

    for j in range(1, day + 1):
        file_name = f"subwaydatanyc_{year}-{i:02}-{j:02}_csv.tar.xz"
        full_path = os.path.join(download_directory, file_name)

        # Already fetched on a previous run: do not hit the server again.
        if os.path.exists(full_path):
            continue

        url = f"https://subwaydata.nyc/data/{file_name}"
        # Download under a temporary name so an interrupted transfer can never
        # be mistaken for a complete archive by later cells.
        partial_path = full_path + ".part"

        try:
            with requests.get(url, stream=True, timeout=request_timeout_seconds) as response:
                if response.status_code != 200:
                    print(f"Error {response.status_code} accessing: {file_name}")
                    continue

                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=chunk_size_bytes):
                        f.write(chunk)

            # Publish the finished file under its final name in one step.
            os.replace(partial_path, full_path)

        except requests.exceptions.RequestException as e:
            print(f"Connection Error for {file_name}: {e}")
            # Discard whatever was written before the connection failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            time.sleep(error_backoff_seconds)  # Wait after a connection error



In [6]:
# Extract a single file from a downloaded .tar.xz archive
import tarfile
import os


def extract_member(archive_file, target_file_in_archive, extract_dir):
    """Extract one named member from an xz-compressed tar archive.

    Args:
        archive_file: Path to the ``.tar.xz`` archive.
        target_file_in_archive: Exact member name to extract.
        extract_dir: Directory to extract into; created if missing.

    Returns:
        The path of the extracted file, or ``None`` when the archive has no
        member with that name.

    Raises:
        FileNotFoundError: If ``archive_file`` does not exist.
        tarfile.ReadError: If the archive cannot be read or decompressed.
    """
    os.makedirs(extract_dir, exist_ok=True)

    # 'r:xz' opens the file for reading and handles the XZ decompression
    with tarfile.open(archive_file, 'r:xz') as tar:
        try:
            member = tar.getmember(target_file_in_archive)
        except KeyError:
            # getmember raises KeyError when the name is not in the archive.
            return None

        # The archive comes from the network, so extract it with the 'data'
        # filter where the running Python provides it. The filter rejects
        # members that would escape `extract_dir`. Older interpreters without
        # extraction filters fall back to the plain call.
        if hasattr(tarfile, 'data_filter'):
            tar.extract(member, path=extract_dir, filter='data')
        else:
            tar.extract(member, path=extract_dir)

    return os.path.join(extract_dir, target_file_in_archive)


# 1. Define the necessary file paths
data_directory = "data"
archive_file = os.path.join(data_directory, 'subwaydatanyc_2023-01-01_csv.tar.xz')
extract_dir = 'extracted_csv_data'
target_file_in_archive = 'subwaydatanyc_2023-01-01_stop_times.csv'

# 2. Create the extraction directory
os.makedirs(extract_dir, exist_ok=True)
print(f"Extraction directory is set to: {extract_dir}")

# 3. Open the archive and extract only the target file
try:
    print(f"Starting extraction of ONLY {target_file_in_archive} from {archive_file}...")

    # 4. Full path to the extracted file, or None if the member is absent
    extracted_path = extract_member(archive_file, target_file_in_archive, extract_dir)

    if extracted_path is not None:
        print(f"\n✅ Success: '{target_file_in_archive}' extracted to: {extracted_path}")
    else:
        print(f"\n❌ Error: '{target_file_in_archive}' was not found inside the archive.")

except FileNotFoundError:
    # A missing archive is an expected condition (download not run or failed),
    # so report it specifically instead of as an "unexpected" error.
    print(f"\n❌ Error: Archive '{archive_file}' does not exist. Run the download cell first.")
except tarfile.ReadError:
    print(f"\n❌ Error: Could not read or decompress the archive '{archive_file}'. Check file integrity.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

Extraction directory is set to: extracted_csv_data
Starting extraction of ONLY subwaydatanyc_2023-01-01_stop_times.csv from data/subwaydatanyc_2023-01-01_csv.tar.xz...

✅ Success: 'subwaydatanyc_2023-01-01_stop_times.csv' extracted to: extracted_csv_data/subwaydatanyc_2023-01-01_stop_times.csv


In [29]:
# Analyze File to Get Trend
#
# For every stop of every trip, compare how long the train actually took to
# reach the next stop of the same trip with how long the schedule allowed.
# NOTE(review): assumes `marked_past` and `departure_time` are numeric
# timestamps in seconds -- confirm against the subwaydata.nyc column docs.
import pandas as pd

# A segment is kept only when its observed duration is above this fraction of
# the scheduled duration AND above the absolute floor below.
# NOTE(review): presumably this discards implausibly short readings -- confirm.
MIN_DURATION_RATIO = 0.5
MIN_ACTUAL_DURATION_SECONDS = 30

df = pd.read_csv("extracted_csv_data/subwaydatanyc_2023-01-01_stop_times.csv")

# Observed time at the next stop of the SAME trip (NaN for a trip's last stop).
df['next_actual_departure'] = df.groupby('trip_uid')['marked_past'].shift(-1)

df['actual_duration_seconds'] = (
    df['next_actual_departure'] - df['marked_past']
)

# Scheduled time at the next stop, also shifted within the trip. An ungrouped
# shift would pair the last stop of one trip with the first stop of whichever
# trip follows it in the file.
df['next_scheduled_departure'] = df.groupby('trip_uid')['departure_time'].shift(-1)

df['expected_duration_seconds'] = (
    df['next_scheduled_departure'] - df['departure_time']
)

# Comparisons against NaN are False, so rows without a next stop are dropped
# here as well.
is_plausible = (
    (df['actual_duration_seconds'] > df['expected_duration_seconds'] * MIN_DURATION_RATIO)
    & (df['actual_duration_seconds'] > MIN_ACTUAL_DURATION_SECONDS)
)
# .copy() gives an independent frame so the column added below does not write
# into a view of `df` (avoids pandas' SettingWithCopyWarning).
df_filtered = df[is_plausible].copy()

# Positive values mean the segment took longer than scheduled.
df_filtered['delay_added_seconds'] = (
    df_filtered['actual_duration_seconds'] - df_filtered['expected_duration_seconds']
)

# Clean Up (Remove last stop of every trip)
df_final = df_filtered.dropna(subset=['delay_added_seconds'])

# Select the columns by name rather than by position so the display does not
# silently change if the CSV gains or loses a column.
print(df_final[['expected_duration_seconds', 'delay_added_seconds']])
print(df_final.shape[0])

        expected_duration_seconds  delay_added_seconds
0                           150.0                  0.0
1                           109.0                -19.0
2                            79.0                 11.0
3                            76.0                 -1.0
4                            81.0                  0.0
...                           ...                  ...
155523                      213.0                 -3.0
155524                      184.0                -73.0
155548                      160.0                -73.0
155572                      120.0                  0.0
155573                       66.0                 -4.0

[130929 rows x 2 columns]
130929


In [28]:
# Let's analyze this data!
#
# Compare the mean added delay of F-train segments with the mean over all
# retained segments. Relies on `df_final` built by the delay computation cell.

# Baseline: mean over every row of df_final.
# NOTE(review): this baseline includes stops whose stop_id starts with 'S',
# while the F average below excludes them -- confirm that is intended.
total_average = df_final['delay_added_seconds'].mean()

# Drop stops whose stop_id starts with 'S'. na=False treats a missing stop_id
# as "does not start with 'S'", so the mask stays boolean and `~` is safe.
# NOTE(review): presumably 'S' marks non-subway stops -- confirm.
df_subway_only = df_final[~df_final['stop_id'].str.startswith('S', na=False)].copy()

# regex=False makes this a literal substring test.
# NOTE(review): this keeps every trip_uid containing the letter 'F' anywhere,
# not only F-route trips -- verify against the trip_uid format, or join on the
# route id from the trips file instead.
df_f_train = df_subway_only[
    df_subway_only['trip_uid'].astype(str).str.contains('F', regex=False)
].copy()
F_average = df_f_train['delay_added_seconds'].mean()

print(F_average)
print(total_average)

-0.006378189094547274
2.0375928938585033
