CS 109 Extension Project

In [1]:
# Smoke-test cell: write a fixed greeting followed by a newline to stdout.
print("Hello World!", end="\n")

Hello World!


In [1]:
# Goal: Get Live Feed Data
# Downloads one archive per calendar day of 2023 from subwaydata.nyc into
# the local "data" directory. Days that fail are reported and skipped, so the
# cell always runs to completion.
import requests
import os
import time

# make a directory for the data
download_directory = "data"
# Create the directory if it doesn't exist
os.makedirs(download_directory, exist_ok=True)

# Number of days in each month that does NOT have 31 days (2023 is not a
# leap year, hence 28 for February). Other cells in this notebook read
# `days`, so the name must remain defined at notebook scope.
days = {2: 28, 4: 30, 6: 30, 9: 30, 11: 30}

# Seconds to wait for the server to connect / send data before giving up.
# Without a timeout, requests waits indefinitely and a single stalled
# connection would hang the whole loop.
request_timeout_seconds = 60

for i in range(1, 13):
    # Months missing from `days` have 31 days.
    day = days.get(i, 31)

    for j in range(1, day + 1):
        file_name = f"subwaydatanyc_2023-{i:02}-{j:02}_csv.tar.xz"
        full_path = os.path.join(download_directory, file_name)

        # Skip archives that are already on disk so re-running the cell does
        # not download the whole year again. Delete a file to force a refetch.
        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
            continue

        url = f"https://subwaydata.nyc/data/{file_name}"

        try:
            response = requests.get(url, timeout=request_timeout_seconds)

            if response.status_code == 200:
                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated archive under
                # the final file name.
                partial_path = full_path + ".part"
                with open(partial_path, "wb") as f:
                    f.write(response.content)
                os.replace(partial_path, full_path)
            else:
                print(f"Error {response.status_code} accessing: {file_name}")

        except requests.exceptions.RequestException as e:
            # Timeouts are RequestException subclasses, so they land here too.
            print(f"Connection Error for {file_name}: {e}")
            time.sleep(5) # Wait longer after a connection error



In [35]:
# Extract from .tar.xz file extension
# For every calendar day of 2023, pull only the "<date>_stop_times.csv"
# member out of that day's archive into `extract_dir`. Problems with a single
# day are printed and the loop moves on to the next day.
import calendar
import tarfile
import os

# 1. Define the necessary file paths
data_directory = "data"
extract_dir = 'extracted_csv_data'
os.makedirs(extract_dir, exist_ok=True)

# Use tarfile's "data" extraction filter when this Python provides it; the
# filter rejects unsafe members (e.g. paths that escape the destination).
# Older interpreters do not accept the `filter` argument, so it is only
# passed when `tarfile.data_filter` exists.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

for i in range(1, 13):
    # Day count for month i of 2023, computed here so this cell does not
    # depend on a variable defined in a different cell.
    day = calendar.monthrange(2023, i)[1]

    for j in range(1, day + 1):
        # Define files to grab
        archive_file = os.path.join(data_directory, f'subwaydatanyc_2023-{i:02}-{j:02}_csv.tar.xz')
        target_file_in_archive = f'subwaydatanyc_2023-{i:02}-{j:02}_stop_times.csv'

        # 2. Open the archive and extract only the target file
        try:
            print(f"Starting extraction of ONLY {target_file_in_archive} from {archive_file}...")

            # 'r:xz' opens the file for reading and handles the XZ decompression
            with tarfile.open(archive_file, 'r:xz') as tar:

                # Check if the desired file exists in the archive (good practice)
                if target_file_in_archive in tar.getnames():
                    # Extract ONLY the specified file to the target directory
                    tar.extract(target_file_in_archive, path=extract_dir, **extract_kwargs)

                    # 3. Construct the full path to the extracted file
                    extracted_path = os.path.join(extract_dir, target_file_in_archive)
                else:
                    # Previously this case was silent, which hid missing days.
                    print(f"\n❌ Error: '{target_file_in_archive}' was not found inside '{archive_file}'.")

        except FileNotFoundError:
            # The archive itself is absent (e.g. it was never downloaded).
            print(f"\n❌ Error: Archive '{archive_file}' does not exist. Was it downloaded?")
        except tarfile.ReadError:
            print(f"\n❌ Error: Could not read or decompress the archive '{archive_file}'. Check file integrity.")
        except Exception as e:
            print(f"\n❌ An unexpected error occurred: {e}")

Starting extraction of ONLY subwaydatanyc_2023-01-01_stop_times.csv from data/subwaydatanyc_2023-01-01_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-02_stop_times.csv from data/subwaydatanyc_2023-01-02_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-03_stop_times.csv from data/subwaydatanyc_2023-01-03_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-04_stop_times.csv from data/subwaydatanyc_2023-01-04_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-05_stop_times.csv from data/subwaydatanyc_2023-01-05_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-06_stop_times.csv from data/subwaydatanyc_2023-01-06_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-07_stop_times.csv from data/subwaydatanyc_2023-01-07_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-08_stop_times.csv from data/subwaydatanyc_2023-01-08_csv.tar.xz...
Starting extraction of ONLY subwaydatanyc_2023-01-09_stop_times.

In [32]:
# Analyze File to Get Trend
# For one day of stop-time records, compare the observed gap between
# consecutive stops of a trip with the scheduled gap, and keep the difference
# as `delay_added_seconds`.
# NOTE(review): assumes `marked_past` and `departure_time` are numeric
# timestamps in seconds, as the *_seconds column names suggest -- confirm.
import pandas as pd

df = pd.read_csv("extracted_csv_data/subwaydatanyc_2023-01-01_stop_times.csv")

# `marked_past` of the following row within the same trip (NaN for the last
# row of each trip).
df['next_actual_departure'] = df.groupby('trip_uid')['marked_past'].shift(-1)

df['actual_duration_seconds'] = (
    df['next_actual_departure'] - df['marked_past']
)

# Scheduled counterpart. Grouped by trip exactly like the actual value above,
# so a row is never paired with the first stop of a different trip.
df['next_scheduled_departure'] = df.groupby('trip_uid')['departure_time'].shift(-1)

df['expected_duration_seconds'] = (
    df['next_scheduled_departure'] - df['departure_time']
)

# Keep segments whose observed duration is more than half the scheduled
# duration and longer than 30 seconds (presumably to discard implausible
# readings -- TODO confirm the thresholds). Comparisons against NaN are
# False, so rows with a missing duration are dropped here as well.
df_filtered = df[df['actual_duration_seconds'] > (df['expected_duration_seconds'] * 0.5)]
# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
df_filtered = df_filtered[df_filtered['actual_duration_seconds'] > 30].copy()

df_filtered['delay_added_seconds'] = (
    df_filtered['actual_duration_seconds'] - df_filtered['expected_duration_seconds']
)

# Clean up: drop any row whose delay could not be computed. The filters above
# already exclude rows with a missing duration (such as the last stop of a
# trip), so this is a safeguard.
df_final = df_filtered.dropna(subset=['delay_added_seconds'])



        expected_duration_seconds  delay_added_seconds
0                           150.0                  0.0
1                           109.0                -19.0
2                            79.0                 11.0
3                            76.0                 -1.0
4                            81.0                  0.0
...                           ...                  ...
155523                      213.0                 -3.0
155524                      184.0                -73.0
155548                      160.0                -73.0
155572                      120.0                  0.0
155573                       66.0                 -4.0

[130929 rows x 2 columns]
130929
                 trip_uid stop_id  delay_added_seconds
18888     1672576650_2..N    123N               -226.0
41870  1672601580_2..S01R    132S               -224.0
36097  1672594920_3..N01R    120N               -215.0
18787  1672576590_4..S06R    235S               -200.0
39875  1672599240_3..N01R    12

In [34]:
# Let's analyze this data!
# Mean `delay_added_seconds` overall and for the F, 2 and 6 lines.
# Requires `df_final` from the single-day analysis cell.

total_average = df_final['delay_added_seconds'].mean()

# Exclude rows whose stop_id starts with 'S'. na=False makes a missing
# stop_id count as "does not start with S" instead of putting NaN in the
# boolean mask, which `~` cannot invert.
df_subway_only = df_final[~df_final['stop_id'].str.startswith('S', na=False)].copy()

# Select a line by its route code inside trip_uid: "_<route>..". The F filter
# previously matched a bare 'F' anywhere in trip_uid, which also selects any
# other route code containing the letter F; it now uses the same anchored
# pattern as the 2 and 6 filters below.
# NOTE(review): a distinct express route code (if one exists) is excluded by
# this pattern, just as it is for the 6 filter -- confirm that is intended.
df_f_train = df_subway_only[
    df_subway_only['trip_uid'].astype(str).str.contains(r'_F\.\.', regex=True)
].copy()

two_train_df = df_final[
    df_final['trip_uid'].str.contains(r'_2\.\.', regex=True)
].copy()

six_train_df = df_final[
    df_final['trip_uid'].str.contains(r'_6\.\.', regex=True)
].copy()

two_avg = two_train_df['delay_added_seconds'].mean()
six_avg = six_train_df['delay_added_seconds'].mean()
F_average = df_f_train['delay_added_seconds'].mean()

# Output order: F line, 2 line, 6 line, all rows.
print(F_average)
print(two_avg)
print(six_avg)
print(total_average)

-0.006378189094547274
5.210709504685409
1.9174973488865323
2.0375928938585033


In [40]:
# Generalized analysis: run the single-day trend analysis for every day of
# 2023 and average the daily means.
# NOTE(review): assumes `marked_past` and `departure_time` are numeric
# timestamps in seconds, as the *_seconds column names suggest -- confirm.
import calendar
import os

import pandas as pd

totals = []        # daily mean delay over all kept rows
f_data = []        # daily mean delay over F-line rows only
missing_days = []  # dates whose extracted CSV is not on disk

for i in range(1, 13):
    # Day count for month i of 2023, computed here so this cell does not
    # depend on a variable defined in a different cell.
    day = calendar.monthrange(2023, i)[1]

    for j in range(1, day + 1):
        csv_path = f"extracted_csv_data/subwaydatanyc_2023-{i:02}-{j:02}_stop_times.csv"

        # The download/extraction cells tolerate failed days, so a CSV may be
        # absent; record it and continue instead of aborting the whole year.
        if not os.path.exists(csv_path):
            missing_days.append(f"2023-{i:02}-{j:02}")
            continue

        df = pd.read_csv(csv_path)

        # `marked_past` of the following row within the same trip (NaN for
        # the last row of each trip).
        df['next_actual_departure'] = df.groupby('trip_uid')['marked_past'].shift(-1)

        df['actual_duration_seconds'] = (
            df['next_actual_departure'] - df['marked_past']
        )

        # Grouped by trip like the actual value above, so a row is never
        # paired with the first stop of a different trip.
        df['next_scheduled_departure'] = df.groupby('trip_uid')['departure_time'].shift(-1)

        df['expected_duration_seconds'] = (
            df['next_scheduled_departure'] - df['departure_time']
        )

        # Keep segments whose observed duration is more than half the
        # scheduled duration and longer than 30 seconds. Comparisons against
        # NaN are False, so rows with a missing duration are dropped too.
        df_filtered = df[df['actual_duration_seconds'] > (df['expected_duration_seconds'] * 0.5)]
        # .copy() avoids SettingWithCopyWarning on the assignment below.
        df_filtered = df_filtered[df_filtered['actual_duration_seconds'] > 30].copy()

        df_filtered['delay_added_seconds'] = (
            df_filtered['actual_duration_seconds'] - df_filtered['expected_duration_seconds']
        )

        # Clean up: drop any row whose delay could not be computed.
        df_final = df_filtered.dropna(subset=['delay_added_seconds'])

        # tail(-1) keeps every row of a trip except its first remaining one.
        df_internal_run = (
            df_final.groupby('trip_uid')
            .tail(-1)
        )

        # Exclude rows whose stop_id starts with 'S'.
        df_subway_only = df_internal_run[~df_internal_run['stop_id'].str.startswith('S')].copy()

        # Rows whose trip_uid contains the letter 'F'.
        df_f_train = df_subway_only[
            df_subway_only['trip_uid'].astype(str).str.contains('F')
        ].copy()

        total_average = df_subway_only['delay_added_seconds'].mean()
        totals.append(total_average)
        F_average = df_f_train['delay_added_seconds'].mean()
        f_data.append(F_average)

if missing_days:
    print(
        f"Skipped {len(missing_days)} day(s) with no extracted CSV "
        f"(first: {missing_days[0]}, last: {missing_days[-1]})"
    )

# A day with no matching rows has a NaN mean; leave such days out so one
# empty day does not turn the yearly figure into NaN. With no usable days
# the result is NaN rather than a ZeroDivisionError.
valid_totals = [value for value in totals if pd.notna(value)]
valid_f_data = [value for value in f_data if pd.notna(value)]

total_avg = sum(valid_totals) / len(valid_totals) if valid_totals else float("nan")
f_avg = sum(valid_f_data) / len(valid_f_data) if valid_f_data else float("nan")

print(total_avg)
print(f_avg)

2.943504375013281
0.4147564708752325
