# MINI PROJECT — APPLIED DATA ANALYSIS & MODELING (PART A)

Airline On-Time Performance & Passenger Experience

This Colab notebook loads the Kaggle "Airline Delay and Cancellation Data 2019" (flights.csv), performs EDA, runs hypothesis tests between airlines, fits simple and multiple regression models, and saves cleaned data + plots into a `results/` folder.

Instructions:
- If you want automatic download via Kaggle, create `~/.kaggle/kaggle.json` with your credentials (see next cell).
- Run cells sequentially. Plots save to `results/plots/` and cleaned CSV to `results/cleaned_data/clean_flights.csv`.

## Optional: configure Kaggle credentials (only required for automatic download)
Run these in Colab if you want the notebook to download the dataset automatically. Replace with your credentials.

In [None]:
# Optional Kaggle download. The shell commands below are intentionally left
# commented out; uncomment them (keeping the leading "!") only if flights.csv
# has not already been uploaded to the session.
# Security: kaggle.json holds a personal API key. Fill in the placeholders only
# in a private session and never commit or share a notebook with real values.
# !mkdir -p ~/.kaggle
# !echo '{"username":"YOUR_KAGGLE_USERNAME","key":"YOUR_KAGGLE_API_KEY"}' > ~/.kaggle/kaggle.json
# chmod 600 makes the credentials file readable/writable by the current user only.
# !chmod 600 ~/.kaggle/kaggle.json
# NOTE(review): the dataset slug below does not obviously match the dataset
# title quoted in the notebook introduction — confirm which dataset is intended.
# !kaggle datasets download -d usdot/flight-delays
# -o overwrites files left over from a previous extraction without prompting.
# !unzip -o flight-delays.zip

# Reminder shown whenever this cell is run with the commands still commented out.
print('If you want to use Kaggle automatic download, uncomment and run the commands above and add your credentials.')

## 1) Imports and settings

In [None]:
import os
import warnings
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide settings: silence library warnings and set plotting defaults.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# Output layout: everything this notebook writes lives under results/.
RESULTS_DIR = 'results'
CLEANED_DIR, PLOTS_DIR, SUMMARY_DIR = (
    os.path.join(RESULTS_DIR, sub_dir)
    for sub_dir in ('cleaned_data', 'plots', 'summary_reports')
)

# Create the folders up front so later cells can write without checking.
for out_dir in (RESULTS_DIR, CLEANED_DIR, PLOTS_DIR, SUMMARY_DIR):
    os.makedirs(out_dir, exist_ok=True)


## 2) Load dataset (flights.csv)
If you uploaded `flights.csv` to the Colab session, it is loaded from there. Otherwise, run the optional Kaggle download cell above first.

In [None]:
# Path of the raw flights file, relative to the notebook's working directory.
CSV_FILE = 'flights.csv'

# Fail early with an actionable message instead of a bare pandas read error.
csv_present = os.path.exists(CSV_FILE)
if csv_present:
    # low_memory=False reads the file in one pass rather than in chunks.
    df = pd.read_csv(CSV_FILE, low_memory=False)
    print('Loaded', CSV_FILE, 'shape =', df.shape)
else:
    raise FileNotFoundError("Please upload 'flights.csv' to the Colab session or enable Kaggle download.")


## 3) Initial inspection

In [None]:
# Preview the first rows using the notebook's rich table rendering.
display(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print('\nInfo:')
df.info()

# The 20 columns with the most missing values, worst first.
print('\nMissing values (top 20):')
print(df.isnull().sum().sort_values(ascending=False).head(20))


## 4) Select relevant columns and clean
We map common column names to canonical names and build a working DataFrame.

In [None]:
# Candidate source-column names for each canonical field, in priority order.
# Exact-case matches are preferred; a case-insensitive fallback then catches
# headers such as YEAR / MONTH / DAY.
# NOTE(review): the extra upper-case names (AIRLINE, ARRIVAL_DELAY, ...) are
# presumed to be the headers used by the Kaggle usdot/flight-delays flights.csv
# — confirm against the actual file.
col_map_candidates = {
    'Year': ['Year', 'year'],
    'Month': ['Month', 'month'],
    'Day': ['DayofMonth', 'Day', 'day', 'DAY_OF_MONTH'],
    'Carrier': ['UniqueCarrier', 'Carrier', 'OP_UNIQUE_CARRIER', 'Reporting_Airline', 'OP_CARRIER', 'AIRLINE'],
    'FlightNum': ['FlightNum', 'FlightNumber', 'Flight', 'OP_CARRIER_FL_NUM', 'FLIGHT_NUMBER'],
    'Origin': ['Origin', 'ORIGIN', 'origin', 'ORIGIN_AIRPORT'],
    'Dest': ['Dest', 'DEST', 'dest', 'DESTINATION_AIRPORT'],
    'DepDelay': ['DepDelay', 'DEP_DELAY', 'DepDelayMinutes', 'DEPARTURE_DELAY'],
    'ArrDelay': ['ArrDelay', 'ARR_DELAY', 'ArrDelayMinutes', 'ARRIVAL_DELAY'],
    'Distance': ['Distance', 'DISTANCE'],
    'Cancelled': ['Cancelled', 'CANCELLED'],
}

# Lower-cased header -> actual header (first occurrence wins) for the fallback.
lower_lookup = {}
for actual in df.columns:
    lower_lookup.setdefault(str(actual).lower(), actual)

found_cols = {}
for canonical, candidates in col_map_candidates.items():
    # Pass 1: exact match, honouring the candidate priority order.
    match = next((c for c in candidates if c in df.columns), None)
    # Pass 2: same priority order, ignoring case.
    if match is None:
        match = next(
            (lower_lookup[c.lower()] for c in candidates if c.lower() in lower_lookup),
            None,
        )
    if match is not None:
        found_cols[canonical] = match

print('Found columns mapping:', found_cols)

# Build the working frame with canonical column names only.
use_cols = list(found_cols.values())
inv_map = {v: k for k, v in found_cols.items()}
working = df[use_cols].rename(columns=inv_map)

# Coerce numeric fields; unparseable entries become NaN rather than raising.
for col in ('ArrDelay', 'DepDelay', 'Distance', 'Month', 'Day', 'Year'):
    if col in working.columns:
        working[col] = pd.to_numeric(working[col], errors='coerce')

# Missing or unparseable cancellation flags are treated as "not cancelled".
if 'Cancelled' in working.columns:
    working['Cancelled'] = pd.to_numeric(working['Cancelled'], errors='coerce').fillna(0).astype(int)
else:
    working['Cancelled'] = 0

# Delay = arrival delay, falling back to departure delay where arrival is
# missing. Either column may be absent, so only the ones present are used.
delay_sources = [c for c in ('ArrDelay', 'DepDelay') if c in working.columns]
if not delay_sources:
    raise ValueError(
        'No arrival or departure delay column found in the dataset. '
        f'Available columns: {list(df.columns)}'
    )
delay = working[delay_sources[0]]
for fallback in delay_sources[1:]:
    delay = delay.fillna(working[fallback])
working['Delay'] = delay

# Keep only rows that carry delay information; copy so later cells own the data.
working = working[working['Delay'].notna()].copy()
print('Rows after keeping those with delay info:', working.shape[0])


## 5) Prepare dataset for analysis
- Focus on non-cancelled flights
- Remove extreme outliers (absolute delay of 6 hours or more)
- Drop duplicates

In [None]:
# Keep flights that were not cancelled and whose absolute delay is under
# 360 minutes, then remove exact duplicate rows.
operated = working['Cancelled'] == 0
delay_df = working[operated].copy()

within_range = delay_df['Delay'].abs() < 360
delay_df = delay_df[within_range].copy()

rows_before = len(delay_df)
delay_df = delay_df.drop_duplicates()
print('Dropped duplicates:', rows_before - len(delay_df))
print('Final rows for analysis:', delay_df.shape)


## 6) Descriptive statistics and carrier summary

In [None]:
# Overall delay distribution, including the tails (1st/5th/95th/99th percentiles).
delay_percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
display(delay_df['Delay'].describe(percentiles=delay_percentiles))

# Per-carrier summary, busiest carriers first; only the top 10 are shown.
if 'Carrier' in delay_df.columns:
    delay_by_carrier = delay_df.groupby('Carrier')['Delay']
    carrier_stats = delay_by_carrier.agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    carrier_stats = carrier_stats.sort_values('count', ascending=False)
    display(carrier_stats.head(10))


## 7) Visualizations (histogram, boxplot, heatmap)
Plots are saved under `results/plots/`.

In [None]:
def save_fig(fig, fname):
    """Write ``fig`` to ``PLOTS_DIR/fname`` and close it.

    The figure is saved at 150 dpi with a tight bounding box and then closed
    through pyplot.

    Returns:
        The path the figure was written to.
    """
    out_path = os.path.join(PLOTS_DIR, fname)
    save_options = {'dpi': 150, 'bbox_inches': 'tight'}
    fig.savefig(out_path, **save_options)
    plt.close(fig)
    return out_path

# --- Histogram of delays (x-axis clipped to -60..300 for readability) ---
fig, ax = plt.subplots()
sns.histplot(delay_df['Delay'], bins=100, kde=True, ax=ax)
ax.set_title('Delay distribution (minutes)')
ax.set_xlabel('Delay (minutes)')
ax.set_xlim(-60, 300)
hist_path = save_fig(fig, 'delay_distribution.png')
print('Saved:', hist_path)

# --- Boxplot of delays for the 8 carriers with the most flights ---
if 'Carrier' in delay_df.columns:
    carrier_counts = delay_df['Carrier'].value_counts()
    top_carriers = carrier_counts.nlargest(8).index.tolist()
    subset = delay_df[delay_df['Carrier'].isin(top_carriers)]
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='Carrier', y='Delay', data=subset, order=top_carriers, ax=ax)
    ax.set_ylim(-60, 300)
    box_path = save_fig(fig, 'airline_comparison_boxplot.png')
    print('Saved:', box_path)

# --- Correlation heatmap over whichever numeric columns are available ---
numeric_cols = [name for name in ('Delay', 'Distance', 'Month', 'Day') if name in delay_df.columns]
corr_matrix = delay_df[numeric_cols].corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
heatmap_path = save_fig(fig, 'correlation_heatmap.png')
print('Saved:', heatmap_path)


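## 8) Hypothesis testing: two-sample t-test (two-tailed and one-tailed)
Compare mean delays between the top two carriers (by flight count) using Welch's unequal-variance t-test on up to 5,000 sampled flights per carrier. The one-tailed p-value tests whether the first carrier's mean delay is lower than the second's.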

In [None]:
# Welch two-sample t-test on the two carriers with the most flights.
# tt_test_summary stays empty when the test cannot be run.
tt_test_summary = {}
enough_carriers = 'Carrier' in delay_df.columns and delay_df['Carrier'].nunique() >= 2
if not enough_carriers:
    print('Not enough carriers to run t-test')
else:
    c1, c2 = delay_df['Carrier'].value_counts().nlargest(2).index.tolist()

    # Cap each group at max_n rows (fixed seed) to keep the test fast.
    max_n = 5000
    groups = []
    for carrier in (c1, c2):
        delays = delay_df.loc[delay_df['Carrier'] == carrier, 'Delay'].dropna()
        if len(delays) > max_n:
            delays = delays.sample(max_n, random_state=1)
        groups.append(delays)
    d1, d2 = groups

    t_stat, p_two = stats.ttest_ind(d1, d2, equal_var=False, nan_policy='omit')

    # One-tailed p-value for the alternative "mean(c1) < mean(c2)": half the
    # two-tailed value when t is negative, its complement otherwise.
    if t_stat < 0:
        p_one = p_two / 2
    else:
        p_one = 1 - p_two / 2

    tt_test_summary = {
        'carrier_a': c1,
        'carrier_b': c2,
        't_stat': float(t_stat),
        'p_two': float(p_two),
        'p_one': float(p_one),
    }
    print('T-test summary:', tt_test_summary)


## 9) Regression models
- Simple: Delay ~ Distance
- Multiple: Delay ~ Distance + Month + Carrier (one-hot top carriers)

In [None]:
# Results of both models, keyed by 'simple' and 'multiple'.
regression_info = {}

# --- Simple regression: Delay ~ Distance ---
if 'Distance' in delay_df.columns:
    # Distance was coerced with errors='coerce' and may hold NaN, which
    # LinearRegression rejects; drop incomplete rows before fitting.
    simple_df = delay_df[['Distance', 'Delay']].dropna()
    X = simple_df[['Distance']].values
    y = simple_df['Delay'].values

    # Fit on a reproducible random subset of at most 20,000 rows.
    idx = np.random.RandomState(0).choice(np.arange(len(X)), size=min(20000, len(X)), replace=False)
    Xs, ys = X[idx], y[idx]
    X_tr, X_te, y_tr, y_te = train_test_split(Xs, ys, test_size=0.2, random_state=42)
    lr = LinearRegression().fit(X_tr, y_tr)
    y_pred = lr.predict(X_te)
    regression_info['simple'] = {
        'coef': float(lr.coef_[0]),
        'intercept': float(lr.intercept_),
        'r2': float(r2_score(y_te, y_pred)),
        'mse': float(mean_squared_error(y_te, y_pred)),
    }

    # Held-out points with the fitted line overlaid.
    fig = plt.figure()
    plt.scatter(X_te[:, 0], y_te, alpha=0.2, s=10)
    x_line = np.linspace(X_te.min(), X_te.max(), 100)
    plt.plot(x_line, lr.intercept_ + lr.coef_[0] * x_line, color='red')
    plt.title('Delay vs. distance (test set) with fitted line')
    plt.xlabel('Distance')
    plt.ylabel('Delay (minutes)')
    save_fig(fig, 'regression_fit_distance.png')

# --- Multiple regression: Delay ~ Distance + Month + Carrier ---
if 'Carrier' in delay_df.columns:
    predictors = [c for c in ('Distance', 'Month') if c in delay_df.columns]
    # Drop rows with NaN in any numeric predictor or the target before sampling.
    multi_df = delay_df.dropna(subset=predictors + ['Delay'])
    sample = multi_df.sample(n=min(20000, len(multi_df)), random_state=2).copy()

    # One-hot encode the top carriers; all remaining carriers share 'OTHER'.
    top_n = 8
    top_carriers = sample['Carrier'].value_counts().nlargest(top_n).index.tolist()
    sample['Carrier_top'] = sample['Carrier'].where(sample['Carrier'].isin(top_carriers), 'OTHER')
    # drop_first removes one category so it acts as the reference level.
    dummies = pd.get_dummies(sample['Carrier_top'], prefix='Carrier', drop_first=True)
    X_multi = pd.concat([sample[predictors], dummies], axis=1).astype(float)
    y_multi = sample['Delay'].values

    if X_multi.shape[1] == 0:
        # No numeric predictors and a single carrier level: nothing to fit.
        print('Skipping multiple regression: no usable predictor columns')
    else:
        X_tr, X_te, y_tr, y_te = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)
        lr_multi = LinearRegression().fit(X_tr, y_tr)
        y_pred_m = lr_multi.predict(X_te)
        regression_info['multiple'] = {
            'r2': float(r2_score(y_te, y_pred_m)),
            'mse': float(mean_squared_error(y_te, y_pred_m)),
        }

# Report whichever models ran, even when there is no Carrier column.
print('Regression results:', regression_info)


## 10) Save cleaned dataset, plots, and summary

In [None]:
# Persist the cleaned analysis dataset.
delay_df.to_csv(os.path.join(CLEANED_DIR, 'clean_flights.csv'), index=False)

# Compute the delay statistics here: no earlier cell assigns them to a
# variable, so referencing one would raise NameError on a fresh run.
desc_delay = delay_df['Delay'].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

# Timezone-aware "now" (datetime.utcnow() is deprecated); tzinfo is stripped
# so the timestamp keeps its previous format, with the explicit " UTC" suffix.
generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

summary_lines = [
    'Airline On-Time Performance - Part A',
    f'Generated: {generated_at} UTC',
    f'Original shape: {df.shape}',
    f'Rows after cleaning: {delay_df.shape}',
    '',
    'Delay descriptive statistics:',
    desc_delay.to_string(),
    '',
    'T-test summary:',
    str(tt_test_summary),
    '',
    'Regression summary:',
    str(regression_info)
]
with open(os.path.join(SUMMARY_DIR, 'airline_summary.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(summary_lines))

print('Saved cleaned CSV and summary. Check the results/ folder for plots and outputs.')


### End of notebook

If you want this notebook saved directly into your GitHub repo, download it (File > Download .ipynb), add it to your local clone, and commit and push it with `git add`, `git commit`, and `git push`.