In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import missingno as msno
import warnings
warnings.filterwarnings('ignore')

# Plot appearance. The matplotlib style sheet is applied first and the seaborn
# palette second, so the palette call is the last one to set the colour cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# pandas display limits: max_columns=None removes the column cap so wide frames
# are shown without column truncation; row output is capped at 100.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Report the library versions this run used, for reproducibility.
print("✅ All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")


✅ All libraries imported successfully!
Pandas version: 2.1.4
NumPy version: 1.26.2


In [6]:
# Load a row-capped sample of the flights CSV into `df` and report its size.
# The path and the row cap are named constants so they can be tuned in one place.
DATA_PATH = 'data/flights_sample_3m.csv'  # relative to the notebook's working directory
N_ROWS = 500_000  # maximum number of data rows to read; None reads the whole file

print("📂 Loading dataset... (this may take a few minutes)")

df = pd.read_csv(DATA_PATH, nrows=N_ROWS)

print("\n✅ Dataset loaded successfully!")
print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
# deep=True makes pandas inspect object (string) columns for their real
# footprint instead of counting only the pointer size; bytes are converted to MB.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

📂 Loading dataset... (this may take a few minutes)

✅ Dataset loaded successfully!
Shape: 500,000 rows × 32 columns
Memory usage: 362.44 MB


In [4]:
# First look at the loaded frame: a preview, the column schema, and summary
# statistics for the numeric columns.
print("=" * 80)
print("INITIAL DATA EXPLORATION")
print("=" * 80)

# Preview the first few rows as a rich table.
print("\n📊 First 5 rows:")
display(df.head())

# Column names, non-null counts and dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the report.
print("\n📋 Column Information:")
df.info()

# describe() summarises numeric columns only (count, mean, std, quartiles, min/max).
print("\n📈 Statistical Summary:")
display(df.describe())

INITIAL DATA EXPLORATION

📊 First 5 rows:


Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",1155,1151.0,-4.0,19.0,1210.0,1443.0,4.0,1501,1447.0,-14.0,0.0,,0.0,186.0,176.0,153.0,1065.0,,,,,
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",2120,2114.0,-6.0,9.0,2123.0,2232.0,38.0,2315,2310.0,-5.0,0.0,,0.0,235.0,236.0,189.0,1399.0,,,,,
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",954,1000.0,6.0,20.0,1020.0,1247.0,5.0,1252,1252.0,0.0,0.0,,0.0,118.0,112.0,87.0,680.0,,,,,
3,2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",1609,1608.0,-1.0,27.0,1635.0,1844.0,9.0,1829,1853.0,24.0,0.0,,0.0,260.0,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0
4,2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",1840,1838.0,-2.0,15.0,1853.0,2026.0,14.0,2041,2040.0,-1.0,0.0,,0.0,181.0,182.0,153.0,985.0,,,,,



📋 Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   FL_DATE                  500000 non-null  object 
 1   AIRLINE                  500000 non-null  object 
 2   AIRLINE_DOT              500000 non-null  object 
 3   AIRLINE_CODE             500000 non-null  object 
 4   DOT_CODE                 500000 non-null  int64  
 5   FL_NUMBER                500000 non-null  int64  
 6   ORIGIN                   500000 non-null  object 
 7   ORIGIN_CITY              500000 non-null  object 
 8   DEST                     500000 non-null  object 
 9   DEST_CITY                500000 non-null  object 
 10  CRS_DEP_TIME             500000 non-null  int64  
 11  DEP_TIME                 487056 non-null  float64
 12  DEP_DELAY                487051 non-null  float64
 13  TAXI_OUT                 486862 non-

None


📈 Statistical Summary:


Unnamed: 0,DOT_CODE,FL_NUMBER,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
count,500000.0,500000.0,500000.0,487056.0,487051.0,486862.0,486862.0,486680.0,486680.0,500000.0,486680.0,485652.0,500000.0,500000.0,499996.0,485652.0,485652.0,500000.0,89232.0,89232.0,89232.0,89232.0,89232.0
mean,19977.089464,2509.14259,1327.153168,1330.197382,10.150808,16.642778,1352.717604,1463.274147,7.677363,1491.259396,1467.30772,4.279801,0.02639,0.002306,142.207562,136.565405,112.258938,808.76389,24.588858,3.869946,13.214351,0.150641,25.553613
std,377.056453,1744.904379,485.721885,499.16961,49.113391,9.209484,500.75211,526.848409,6.246828,511.09177,531.417399,51.052173,0.160292,0.047965,71.43043,71.560149,69.644297,586.757193,70.931907,32.405801,33.483004,3.564969,55.991135
min,19393.0,1.0,1.0,1.0,-68.0,1.0,1.0,1.0,1.0,1.0,1.0,-84.0,0.0,0.0,20.0,16.0,8.0,29.0,0.0,0.0,0.0,0.0,0.0
25%,19790.0,1051.0,915.0,916.0,-6.0,11.0,931.0,1050.0,4.0,1108.0,1053.0,-16.0,0.0,0.0,90.0,84.0,61.0,377.0,0.0,0.0,0.0,0.0,0.0
50%,19930.0,2150.0,1320.0,1323.0,-2.0,14.0,1336.0,1502.0,6.0,1517.0,1505.0,-7.0,0.0,0.0,125.0,120.0,95.0,651.0,4.0,0.0,0.0,0.0,0.0
75%,20368.0,3790.0,1730.0,1739.0,6.0,19.0,1752.0,1909.0,9.0,1919.0,1914.0,7.0,0.0,0.0,172.0,167.0,142.0,1045.0,23.0,0.0,17.0,0.0,30.0
max,20452.0,8815.0,2359.0,2400.0,2565.0,184.0,2400.0,2400.0,240.0,2400.0,2400.0,2556.0,1.0,1.0,685.0,722.0,661.0,5095.0,2556.0,1398.0,1468.0,291.0,2010.0


In [5]:
# Sanity check: show the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))



(500000, 32)


In [7]:
# Materialise the column labels once as a plain Python list, kept in `columns`
# for later reference, then print that same list.
columns = df.columns.tolist()

print("\n📝 All Column Names:")
print(columns)


📝 All Column Names:
['FL_DATE', 'AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE', 'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY', 'DEST', 'DEST_CITY', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']
