In [None]:
#%pip install pandas

In [1]:
%pip install pandas charset-normalizer -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
#########################################################################################
#
# Month tally script for NTSB AviationData.csv file.
# 
# Michael Evan
# UMassDartmouth Graduate Computer Science Department
# CIS 598 Data Visualization 
# Spring 2026
#
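# Detects each file's encoding, reads every AviationData*.csv file, parses the
# event-date column ('event.date', or a close match such as 'Event.Date'), drops
# rows with invalid dates, prints month counts and percent shares (Jan→Dec), and
# saves them to monthly_counts_percent.csv.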
#
#########################################################################################

import pandas as pd
import glob
import calendar
from pathlib import Path
from charset_normalizer import from_path
from io import StringIO

def detect_encoding(path):
    """Best-effort guess of the text encoding of the file at ``path``.

    Asks charset_normalizer (``from_path(...).best()``) for its top match and
    returns that match's ``encoding`` name. Returns ``None`` when there is no
    match, when the match carries no encoding, or when detection raises for
    any reason -- callers are expected to fall back to their own defaults.
    """
    detected = None
    try:
        top_match = from_path(path).best()
        if top_match and top_match.encoding:
            detected = top_match.encoding
    except Exception:
        # Detection is advisory only; any failure simply means "no guess".
        detected = None
    return detected

def try_read(path):
    """Read a CSV file into a DataFrame, trying several encodings in turn.

    The encoding guessed by ``detect_encoding`` is tried first, then "utf-8",
    "cp1252" and "latin-1" (an empty guess and duplicates are skipped). Every
    attempt uses the python parser engine with ``on_bad_lines='warn'``; a
    failed attempt is reported with ``print`` and the next encoding is tried.

    If all attempts fail, the file is decoded as UTF-8 as a last resort:
    undecodable 0x92 bytes (the Windows-1252 right single quote) become "'",
    any other undecodable byte becomes U+FFFD, and the text is parsed once
    more.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    RuntimeError
        If the last-resort attempt fails too; the underlying exception is
        chained as ``__cause__``.
    """
    enc = detect_encoding(path)
    fallbacks = [enc, "utf-8", "cp1252", "latin-1"]
    tried = set()
    for e in fallbacks:
        if not e or e in tried:
            continue
        tried.add(e)
        try:
            # Use python engine for more tolerant parsing of messy CSVs.
            # low_memory is intentionally not passed: it is a C-parser option
            # that the python engine does not support (depending on the pandas
            # version it is ignored or rejected with a ValueError).
            df = pd.read_csv(path, encoding=e, engine="python", on_bad_lines='warn')
            print(f"Read {path} with encoding={e}")
            return df
        except Exception as ex:
            print(f"Failed reading {path} with encoding={e}: {ex}")
    # Last resort: decode as UTF-8 and repair only the bytes that are NOT valid
    # UTF-8. surrogateescape turns each undecodable byte 0xNN into U+DCNN, so a
    # stray 0x92 can be told apart from a 0x92 that is a legitimate
    # continuation byte of a multi-byte character (e.g. "Ò" is C3 92), which a
    # blind bytes.replace(b'\x92', ...) would corrupt.
    try:
        raw = Path(path).read_bytes()
        text = raw.decode("utf-8", errors="surrogateescape")
        repair = {0xDC00 + byte: "\ufffd" for byte in range(0x80, 0x100)}
        repair[0xDC92] = "'"  # common Windows smart-quote 0x92 -> apostrophe
        text = text.translate(repair)
        df = pd.read_csv(StringIO(text), engine="python", on_bad_lines='warn')
        print(f"Read {path} after byte-replace with utf-8 (fallback).")
        return df
    except Exception as ex:
        raise RuntimeError(f"Unable to read {path}: {ex}") from ex

# Gather every matching NTSB export in the working directory, in name order
files = sorted(glob.glob("AviationData*.csv"))
if len(files) == 0:
    raise FileNotFoundError("No files matching 'AviationData*.csv' found in current directory.")

# Load each file, then stack them into one frame with a fresh 0..n-1 index
dfs = [try_read(csv_path) for csv_path in files]
df = pd.concat(dfs, ignore_index=True)

# Pick the event date column: the exact name 'event.date' wins; otherwise take
# the first column whose name contains both 'event' and 'date' or is one of a
# few generic date names.
date_col = 'event.date' if 'event.date' in df.columns else None
if date_col is None:
    generic_names = ('eventdate','event_date','date','occurrence_date')
    candidates = []
    for col in df.columns:
        lowered = col.lower()
        if ('event' in lowered and 'date' in lowered) or lowered.strip() in generic_names:
            candidates.append(col)
    if not candidates:
        raise KeyError("Column 'event.date' not found and no obvious alternative found. Columns: " + ", ".join(df.columns[:50]))
    date_col = candidates[0]
    print(f"Using detected date column: '{date_col}'")

# Convert to datetimes (unparseable values become NaT) and keep only dated rows
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
has_date = df[date_col].notna()
df_valid = df.loc[has_date].copy()
total_rows = len(df)
valid_rows = len(df_valid)
dropped = total_rows - valid_rows

# Tally events per calendar month; months with no events are reported as 0
df_valid['month'] = df_valid[date_col].dt.month
per_month = df_valid['month'].value_counts()
counts = per_month.reindex(range(1,13), fill_value=0).sort_index()
total = int(counts.sum())

# One (number, name, count, percent) record per month, January through December
month_totals = [int(counts.loc[m]) for m in range(1, 13)]
months = [
    (m, calendar.month_name[m], cnt, round(100.0 * cnt / total if total else 0.0, 2))
    for m, cnt in enumerate(month_totals, start=1)
]

out = pd.DataFrame(months, columns=['month_num','month_name','count','percent'])
out.to_csv("monthly_counts_percent.csv", index=False)

# Report: overall row accounting first, then the month-by-month table
summary_lines = (
    f"\nFiles processed: {files}",
    f"Total rows (all files): {total_rows}",
    f"Rows with valid {date_col}: {valid_rows}",
    f"Rows dropped (invalid/missing {date_col}): {dropped}\n",
)
for line in summary_lines:
    print(line)

print("Counts and percent share by month (January → December):")
for record in months:
    name, cnt, pct = record[1:]
    print(f"{name:9s}: {cnt:6d}  — {pct:5.2f}%")

print("\nSaved results to monthly_counts_percent.csv")

Read AviationData.csv with encoding=cp1250
Using detected date column: 'Event.Date'

Files processed: ['AviationData.csv']
Total rows (all files): 88889
Rows with valid Event.Date: 88889
Rows dropped (invalid/missing Event.Date): 0

Counts and percent share by month (January → December):
January  :   4985  —  5.61%
February :   5285  —  5.95%
March    :   6686  —  7.52%
April    :   7248  —  8.15%
May      :   8514  —  9.58%
June     :   9561  — 10.76%
July     :  10698  — 12.04%
August   :   9986  — 11.23%
September:   8346  —  9.39%
October  :   6982  —  7.85%
November :   5538  —  6.23%
December :   5060  —  5.69%

Saved results to monthly_counts_percent.csv
