In [2]:
# Librerías
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [3]:
df = pd.read_csv("data/Airplane_Crashes_and_Fatalities_Since_1908.csv")
print("Dimensiones iniciales:", df.shape)
df.info()

Dimensiones iniciales: (5268, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268 entries, 0 to 5267
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         5268 non-null   int64  
 1   Date          5268 non-null   object 
 2   Time          3049 non-null   object 
 3   Location      5248 non-null   object 
 4   Operator      5250 non-null   object 
 5   Flight #      1069 non-null   object 
 6   Route         3561 non-null   object 
 7   Type          5241 non-null   object 
 8   Registration  4933 non-null   object 
 9   cn/In         4040 non-null   object 
 10  Aboard        5246 non-null   float64
 11  Fatalities    5256 non-null   float64
 12  Ground        5246 non-null   float64
 13  Summary       4878 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 576.3+ KB


# # 2. Exploratory Data Analysis (EDA)

In [None]:
# Estadísticas básicas
print("\n=== Estadísticas descriptivas ===")
print(df.describe())

print("\n=== Estadísticas categóricas ===")
print(df.describe(include='object'))

print("\n=== Valores únicos por columna ===")
print(df.nunique())


=== Estadísticas descriptivas ===
            index       Aboard   Fatalities       Ground
count  5268.00000  5246.000000  5256.000000  5246.000000
mean   2633.50000    27.554518    20.068303     1.608845
std    1520.88494    43.076711    33.199952    53.987827
min       0.00000     0.000000     0.000000     0.000000
25%    1316.75000     5.000000     3.000000     0.000000
50%    2633.50000    13.000000     9.000000     0.000000
75%    3950.25000    30.000000    23.000000     0.000000
max    5267.00000   644.000000   583.000000  2750.000000

=== Estadísticas categóricas ===
              Date   Time           Location  Operator Flight #     Route  \
count         5268   3049               5248      5250     1069      3561   
unique        4753   1005               4303      2476      724      3243   
top     09/11/2001  15:00  Sao Paulo, Brazil  Aeroflot        -  Training   
freq             4     32                 15       179       67        81   

                Type Registratio

## Completitud

In [5]:
print("\n=== Valores nulos ===")
print(df.isnull().sum())

total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
print(f"\nPorcentaje global de valores faltantes: {round((total_missing / total_cells) * 100, 2)}%")

# Filas con más de 50% valores nulos
rows_many_missing = df[df.isnull().sum(axis=1) > (df.shape[1] / 2)]
print(f"Filas con más del 50% nulos: {rows_many_missing.shape[0]}")

# %% [markdown]
# ## 2.2 Fechas y tiempos


=== Valores nulos ===
index              0
Date               0
Time            2219
Location          20
Operator          18
Flight #        4199
Route           1707
Type              27
Registration     335
cn/In           1228
Aboard            22
Fatalities        12
Ground            22
Summary          390
dtype: int64

Porcentaje global de valores faltantes: 13.83%
Filas con más del 50% nulos: 6


In [6]:
# Convertir a datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

fecha_min, fecha_max = pd.to_datetime('1908-01-01'), pd.to_datetime('2025-09-17')
invalid_dates = df[(df['Date'] < fecha_min) | (df['Date'] > fecha_max) | df['Date'].isna()]

print(f"\nFechas inválidas: {len(invalid_dates)}")
print("Fecha mínima:", df['Date'].min())
print("Fecha máxima:", df['Date'].max())

# Validar formato de hora HH:MM
invalid_time = df['Time'].dropna()[~df['Time'].dropna().str.match(r'^\d{2}:\d{2}$')]
print(f"Tiempos con formato inválido: {len(invalid_time)}")

# %% [markdown]
# ## 2.3 Valores atípicos


Fechas inválidas: 0
Fecha mínima: 1908-09-17 00:00:00
Fecha máxima: 2009-06-08 00:00:00
Tiempos con formato inválido: 19


In [7]:
numeric_cols = ['Aboard', 'Fatalities', 'Ground']

for col in numeric_cols:
    neg_count = (df[col] < 0).sum()
    print(f"{col}: negativos={neg_count}, máximo={df[col].max()}")

# Fatalities no puede ser mayor que Aboard
invalid_fatalities = df[df['Fatalities'] > df['Aboard']]
print(f"Filas con Fatalities > Aboard: {len(invalid_fatalities)}")

# %% [markdown]
# ## 2.4 Duplicados

Aboard: negativos=0, máximo=644.0
Fatalities: negativos=0, máximo=583.0
Ground: negativos=0, máximo=2750.0
Filas con Fatalities > Aboard: 0


In [8]:
exact_dups = df.duplicated().sum()
print(f"Duplicados exactos: {exact_dups}")

key_cols = ['Date', 'Location', 'Operator', 'Type']
dup_key = df.duplicated(subset=key_cols, keep=False).sum()
print(f"Duplicados por clave {key_cols}: {dup_key}")

# %% [markdown]
# # 3. Limpieza de Datos

# %% [markdown]
# ## 3.1 Eliminar columnas innecesarias

Duplicados exactos: 0
Duplicados por clave ['Date', 'Location', 'Operator', 'Type']: 0


In [9]:
df.drop(columns=['Flight #'], inplace=True, errors='ignore')
print("Columna 'Flight #' eliminada.")

# %% [markdown]
# ## 3.2 Limpiar columna Time

Columna 'Flight #' eliminada.


In [10]:
def clean_time(time_str):
    if pd.isna(time_str) or time_str.strip() == '':
        return 'Not Specified'
    time_str = time_str.strip()
    if re.match(r'^\d{1}:\d{2}$', time_str):  # 7:30 -> 07:30
        return '0' + time_str
    return time_str if re.match(r'^\d{2}:\d{2}$', time_str) else 'Invalid'

df['Time_clean'] = df['Time'].apply(clean_time)

# Reemplazar "Not Specified" por la moda
time_mode = df[df['Time_clean'] != 'Not Specified']['Time_clean'].mode()[0]
df['Time_clean'] = df['Time_clean'].replace('Not Specified', time_mode)

# %% [markdown]
# ## 3.3 Normalizar texto categórico

In [11]:
def normalize_text(text):
    return str(text).strip().lower() if pd.notna(text) and text.strip() != '' else 'not specified'

category_cols = ['Location', 'Operator', 'Route', 'Type', 'Registration', 'cn/In', 'Summary']
df[category_cols] = df[category_cols].applymap(normalize_text)

# %% [markdown]
# ## 3.4 Estandarizar Operator y Location

  df[category_cols] = df[category_cols].applymap(normalize_text)


In [12]:
def clean_operator(op):
    mappings = {
        r'aeroflot': 'Aeroflot',
        r'united airlines': 'United Airlines',
        r'military - u.s. army|military - u.s. air force': 'Military USA',
        r'air france': 'Air France',
        r'american airlines': 'American Airlines',
        r'pan am': 'Pan Am'
    }
    for pattern, replacement in mappings.items():
        op = re.sub(pattern, replacement, op, flags=re.IGNORECASE)
    return op.title()

df['Operator'] = df['Operator'].apply(clean_operator)

# Limpiar Location
def clean_location(loc):
    loc = re.sub(r'\(.*?\)|\[.*?\]', '', loc)
    loc = re.sub(r'\s+', ' ', loc).strip()
    return loc.title()

df['Location'] = df['Location'].apply(clean_location)

# %% [markdown]
# ## 3.5 Manejar valores nulos numéricos

In [13]:
df['Ground'] = df['Ground'].fillna(0)

# Medianas por tipo
aboard_median = df.groupby('Type')['Aboard'].transform('median')
df['Aboard'] = df['Aboard'].fillna(aboard_median).fillna(df['Aboard'].median())

fatalities_median = df.groupby('Type')['Fatalities'].transform('median')
df['Fatalities'] = df['Fatalities'].fillna(fatalities_median).fillna(df['Fatalities'].median())

# %% [markdown]
# ## 3.6 Ajustar tipos finales

In [14]:
df['Aboard'] = df['Aboard'].astype(int)
df['Fatalities'] = df['Fatalities'].astype(int)
df['Ground'] = df['Ground'].astype(int)

# %% [markdown]
# ## 3.7 Columnas derivadas

In [15]:
df['Year'] = df['Date'].dt.year
df['Decade'] = (df['Year'] // 10) * 10

# %% [markdown]
# # 4. Dataset final limpio

In [16]:
df_final = df[['Index', 'Date', 'Time_clean', 'Location', 'Operator', 'Route',
               'Type', 'Registration', 'cn/In', 'Aboard', 'Fatalities', 'Ground',
               'Summary', 'Year', 'Decade']].copy()

df_final.info()
print("\nValores nulos finales:\n", df_final.isnull().sum())

KeyError: "['Index'] not in index"

In [None]:
df_final.head()