In [1]:
import pandas as pd

# Load NTSB and ASRS data
def load_ntsb_data(file_path):
    """
    Loads NTSB crash report data from a CSV file into a pandas DataFrame.
    """
    try:
        ntsb_df = pd.read_csv(file_path, encoding="utf-8-sig", low_memory=False, on_bad_lines='skip', quoting=1)
        print(f"NTSB Data loaded successfully with {len(ntsb_df)} rows.")
        return ntsb_df
    except Exception as e:
        print(f"Error loading NTSB data: {e}")
        return None

def load_asrs_data(file_path):
    """
    Loads ASRS crash report data from a CSV file into a pandas DataFrame.
    """
    try:
        asrs_df = pd.read_csv(file_path, encoding="utf-8-sig", low_memory=False, on_bad_lines='skip', quoting=1)
        print(f"ASRS Data loaded successfully with {len(asrs_df)} rows.")
        return asrs_df
    except Exception as e:
        print(f"Error loading ASRS data: {e}")
        return None

# File paths to your datasets
ntsb_file_path = '../data/raw/ntsb_crash_reports.csv'  # Update with your actual path
asrs_file_path = '../data/raw/asrs_crash_reports.csv'  # Update with your actual path

# Load datasets
ntsb_data = load_ntsb_data(ntsb_file_path)
asrs_data = load_asrs_data(asrs_file_path)

# Check basic structure of the NTSB data (columns, missing values)
if ntsb_data is not None:
    print("\nNTSB Data Overview:")
    print(ntsb_data.info())  # Check for data types, missing values, etc.
    print(ntsb_data.head())  # Preview the first few rows

# Check basic structure of the ASRS data (columns, missing values)
if asrs_data is not None:
    print("\nASRS Data Overview:")
    print(asrs_data.info())  # Check for data types, missing values, etc.
    print(asrs_data.head())  # Preview the first few rows


NTSB Data loaded successfully with 176620 rows.
ASRS Data loaded successfully with 2498 rows.

NTSB Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176620 entries, 0 to 176619
Data columns (total 38 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   NtsbNo               176620 non-null  object 
 1   EventType            176590 non-null  object 
 2   Mkey                 176620 non-null  int64  
 3   EventDate            176613 non-null  object 
 4   City                 176559 non-null  object 
 5   State                167321 non-null  object 
 6   Country              176112 non-null  object 
 7   ReportNo             359 non-null     object 
 8   N                    176517 non-null  object 
 9   HasSafetyRec         176620 non-null  bool   
 10  ReportType           169542 non-null  object 
 11  OriginalPublishDate  156377 non-null  object 
 12  HighestInjuryLevel   71903 non-null   object 
 13  Fata

In [7]:
# Import necessary libraries
import pandas as pd

# Load NTSB and ASRS datasets
ntsb_data = pd.read_csv('../data/raw/ntsb_crash_reports.csv', encoding='ISO-8859-1')
asrs_data = pd.read_csv('../data/raw/asrs_crash_reports.csv', encoding='ISO-8859-1')

# Preview NTSB Data
print("NTSB Data loaded successfully with", ntsb_data.shape[0], "rows.")
print("\nNTSB Data Overview:")
print(ntsb_data.info())
print("\n", ntsb_data.head())

# Preview ASRS Data
print("\nASRS Data loaded successfully with", asrs_data.shape[0], "rows.")
print("\nASRS Data Overview:")
print(asrs_data.info())
print("\n", asrs_data.head())

# 1. Clean NTSB Data
# Drop unnecessary columns (e.g., 'Unnamed: 37' which has no data)
ntsb_data_cleaned = ntsb_data.drop(columns=['Unnamed: 37'])

# Fill missing values or drop rows/columns with high null values
ntsb_data_cleaned = ntsb_data_cleaned.dropna(subset=['EventDate', 'Make', 'Model', 'ProbableCause'])

# Handle missing weather data by dropping or imputing (impute with 'Unknown')
ntsb_data_cleaned['WeatherCondition'] = ntsb_data_cleaned['WeatherCondition'].fillna('Unknown')

# 2. Clean ASRS Data
# Drop columns that don't provide value for your analysis
asrs_data_cleaned = asrs_data.drop(columns=['Unnamed: 125', 'Report 1', 'Report 1.1', 'Report 2', 'Report 2.1'])

# Drop rows with missing time and place values
asrs_data_cleaned = asrs_data_cleaned.dropna(subset=['Time', 'Place'])

# Check for duplicate rows in ASRS data and remove them
asrs_data_cleaned = asrs_data_cleaned.drop_duplicates()

# 3. Explore NTSB Data: Frequency of accidents by state and injury level
accidents_by_state = ntsb_data_cleaned['State'].value_counts().head(10)
accidents_by_injury = ntsb_data_cleaned[['FatalInjuryCount', 'SeriousInjuryCount', 'MinorInjuryCount']].sum()

# 4. Explore ASRS Data: Top reported issues
top_issues = asrs_data_cleaned['Assessments'].value_counts().head(10)

# Display cleaned data insights
print("\nTop 10 States by Frequency of Accidents (NTSB Data):")
print(accidents_by_state)

print("\nTotal Injury Counts (NTSB Data):")
print(accidents_by_injury)

print("\nTop 10 Reported Issues (ASRS Data):")
print(top_issues)

# Optional: Save cleaned data to new CSV files
ntsb_data_cleaned.to_csv('../data/processed/ntsb_crash_reports_cleaned.csv', index=False)
asrs_data_cleaned.to_csv('../data/processed/asrs_reports_cleaned.csv', index=False)

print("\nCleaned NTSB and ASRS data saved.")

  ntsb_data = pd.read_csv('../data/raw/ntsb_crash_reports.csv', encoding='ISO-8859-1')


NTSB Data loaded successfully with 176620 rows.

NTSB Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176620 entries, 0 to 176619
Data columns (total 38 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ï»¿"NtsbNo"          176620 non-null  object 
 1   EventType            176590 non-null  object 
 2   Mkey                 176620 non-null  int64  
 3   EventDate            176613 non-null  object 
 4   City                 176559 non-null  object 
 5   State                167321 non-null  object 
 6   Country              176112 non-null  object 
 7   ReportNo             359 non-null     object 
 8   N                    176517 non-null  object 
 9   HasSafetyRec         176620 non-null  bool   
 10  ReportType           169542 non-null  object 
 11  OriginalPublishDate  156377 non-null  object 
 12  HighestInjuryLevel   71903 non-null   object 
 13  FatalInjuryCount     176620 non-null  int64  
 14 