In [117]:
import pandas as pd
import numpy as np
from datetime import datetime
import re 

# Exploratory Data Analysis

In [118]:
df = pd.read_csv("data/Airplane_Crashes_and_Fatalities_Since_1908.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268 entries, 0 to 5267
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         5268 non-null   int64  
 1   Date          5268 non-null   object 
 2   Time          3049 non-null   object 
 3   Location      5248 non-null   object 
 4   Operator      5250 non-null   object 
 5   Flight #      1069 non-null   object 
 6   Route         3561 non-null   object 
 7   Type          5241 non-null   object 
 8   Registration  4933 non-null   object 
 9   cn/In         4040 non-null   object 
 10  Aboard        5246 non-null   float64
 11  Fatalities    5256 non-null   float64
 12  Ground        5246 non-null   float64
 13  Summary       4878 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 576.3+ KB


Unnamed: 0,index,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary
0,0,09/17/1908,17:18,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1.0,2.0,1.0,0.0,"During a demonstration flight, a U.S. Army fly..."
1,1,07/12/1912,06:30,"AtlantiCity, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...
2,2,08/06/1913,,"Victoria, British Columbia, Canada",Private,-,,Curtiss seaplane,,,1.0,1.0,0.0,The first fatal airplane accident in Canada oc...
3,3,09/09/1913,18:30,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,14.0,0.0,The airship flew into a thunderstorm and encou...
4,4,10/17/1913,10:30,"Near Johannisthal, Germany",Military - German Navy,,,Zeppelin L-2 (airship),,,30.0,30.0,0.0,Hydrogen gas which was being vented was sucked...


In [119]:
# Número de filas y columnas
print("Dimensiones:", df.shape)

Dimensiones: (5268, 14)


In [120]:
# Estadísticas descriptivas
print(df.describe())

            index       Aboard   Fatalities       Ground
count  5268.00000  5246.000000  5256.000000  5246.000000
mean   2633.50000    27.554518    20.068303     1.608845
std    1520.88494    43.076711    33.199952    53.987827
min       0.00000     0.000000     0.000000     0.000000
25%    1316.75000     5.000000     3.000000     0.000000
50%    2633.50000    13.000000     9.000000     0.000000
75%    3950.25000    30.000000    23.000000     0.000000
max    5267.00000   644.000000   583.000000  2750.000000


In [121]:
# Conteo de valores por columna
print(df.nunique())

index           5268
Date            4753
Time            1005
Location        4303
Operator        2476
Flight #         724
Route           3243
Type            2446
Registration    4905
cn/In           3707
Aboard           239
Fatalities       191
Ground            50
Summary         4673
dtype: int64


In [122]:
# Estadísticas también para categóricas
print(df.describe(include='object'))

              Date   Time           Location  Operator Flight #     Route  \
count         5268   3049               5248      5250     1069      3561   
unique        4753   1005               4303      2476      724      3243   
top     09/11/2001  15:00  Sao Paulo, Brazil  Aeroflot        -  Training   
freq             4     32                 15       179       67        81   

                Type Registration cn/In                  Summary  
count           5241         4933  4040                     4878  
unique          2446         4905  3707                     4673  
top     Douglas DC-3           49   178  Crashed during takeoff.  
freq             334            3     6                       15  


# Puntaje

In [123]:
total_rows = df.shape[0]
total_values = df.shape[0] * df.shape[1]
# Completitud
non_null_values = df.notnull().sum().sum()
completitud_score = (non_null_values / total_values) * 100
print(f"Completitud: {completitud_score:.2f}%")

# Validez
valid_fatalities = df[df['Fatalities'] <= df['Aboard']].shape[0]

valid_ground = df[df['Ground'] >= 0].shape[0]

valid_time = df['Time'].dropna().apply(lambda x: bool(re.match(r'^\d{1,2}:\d{2}$', str(x)))).sum()
total_time = df['Time'].notna().sum()

validez_score = ((valid_fatalities + valid_ground + valid_time) / (total_rows + total_rows + total_time)) * 100
print(f"Validez: {validez_score:.2f}%")

# Exactitud
valid_aboard = df[(df['Aboard'] >= 0) & (df['Aboard'] <= 1000)].shape[0]

valid_fatalities_count = df[df['Fatalities'] >= 0].shape[0]

exactitud_score = ((valid_aboard + valid_fatalities_count) / (2 * total_rows)) * 100
print(f"Exactitud: {exactitud_score:.2f}%")

# Consistencia
total_rows = len(df)

rule_1 = (df['Fatalities'] <= df['Aboard'])
rule_2 = (df['Ground'] >= 0)
rule_3 = ((df['Aboard'] == 0) | df['Fatalities'].notna())
def valid_time(x):
    return pd.isna(x) or bool(re.match(r'^\d{1,2}:\d{2}$', str(x)))

rule_4 = df['Time'].apply(valid_time)

total_rules = rule_1.sum() + rule_2.sum() + rule_3.sum() + rule_4.sum()

max_rules = total_rows * 4

consistencia_score = (total_rules / max_rules) * 100
print(f"Consistencia: {consistencia_score:.2f}%")

# Unicidad
unique_rows = df.drop_duplicates().shape[0]
unicidad_score = (unique_rows / total_rows) * 100
print(f"Unicidad: {unicidad_score:.2f}%")

# Oportunidad
date = df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
date = date.dt.year
decade_counts = df.groupby((date// 10) * 10).size()

last_decade = decade_counts.iloc[-1]
max_decade = decade_counts.max()

timeliness_score = (last_decade / max_decade) * 100
print(f"Oportunidad: {timeliness_score:.2f}%")
0
global_quality_score = (
    completitud_score +
    validez_score +
    exactitud_score +
    consistencia_score +
    unicidad_score +
    timeliness_score
) / 6

print("\n=== SCORE GLOBAL DE CALIDAD ===")
print(f"Calidad total de los datos: {global_quality_score:.2f}%")

Completitud: 86.17%
Validez: 99.58%
Exactitud: 99.68%
Consistencia: 99.67%
Unicidad: 100.00%
Oportunidad: 69.65%

=== SCORE GLOBAL DE CALIDAD ===
Calidad total de los datos: 92.46%


# Exploratory Quality Check

## Completitud

In [124]:
print("Total valores nulos por columna")
print(df.isnull().sum())

Total valores nulos por columna
index              0
Date               0
Time            2219
Location          20
Operator          18
Flight #        4199
Route           1707
Type              27
Registration     335
cn/In           1228
Aboard            22
Fatalities        12
Ground            22
Summary          390
dtype: int64


In [125]:
print("Porcentaje de valores nulos por columna")
print(df.isnull().mean() * 100)

Porcentaje de valores nulos por columna
index            0.000000
Date             0.000000
Time            42.122248
Location         0.379651
Operator         0.341686
Flight #        79.707669
Route           32.403189
Type             0.512528
Registration     6.359150
cn/In           23.310554
Aboard           0.417616
Fatalities       0.227790
Ground           0.417616
Summary          7.403189
dtype: float64


In [126]:
for col in df.columns:
    print(f"{col}: {df[col].nunique()} valores únicos")

index: 5268 valores únicos
Date: 4753 valores únicos
Time: 1005 valores únicos
Location: 4303 valores únicos
Operator: 2476 valores únicos
Flight #: 724 valores únicos
Route: 3243 valores únicos
Type: 2446 valores únicos
Registration: 4905 valores únicos
cn/In: 3707 valores únicos
Aboard: 239 valores únicos
Fatalities: 191 valores únicos
Ground: 50 valores únicos
Summary: 4673 valores únicos


In [127]:
total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
print(f"Porcentaje global de valores faltantes: {round((total_missing / total_cells) * 100, 2)}%")

Porcentaje global de valores faltantes: 13.83%


In [128]:
threshold = 50  # ejemplo: 50% de valores nulos
high_missing = df.isnull().mean() * 100
print("Columnas con más del 50% de valores faltantes:")
print(high_missing[high_missing > threshold])

Columnas con más del 50% de valores faltantes:
Flight #    79.707669
dtype: float64


In [129]:
rows_many_missing = df[df.isnull().sum(axis=1) > (df.shape[1] / 2)]
print(f"Filas con más de la mitad de los valores faltantes: {rows_many_missing.shape[0]}")
rows_many_missing

Filas con más de la mitad de los valores faltantes: 6


Unnamed: 0,index,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary
26,26,1919-10-20,,English Channel,Aircraft Transport and Travel,,,De Havilland DH-4,G-EAHG,,,,,
138,138,1928-03-03,,"Rio de Janeiro, Brazil",,,,,,,10.0,10.0,0.0,
333,333,1934-08-10,,"Ningbo, China",China National Aviation Corporation,,,Sikorsky S-38B,,,,,,
423,423,1936-12-26,,"Nanking, China",China National Aviation Corporation,,,Douglas DC-2,NC14269,,,,,
573,573,1942-02-14,,,China National Aviation Corporation,,,Douglas DC-2,45,,,,,
678,678,1944-11-09,,"Seljord, Norway",Military - U.S. Army Air Corps,,,,42-52196,,,,,


## Precisión

In [130]:
#  'Date' to datetime, forcing error to NaT
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

#count invalid dates
invalid_dates_count = df['Date'].isna().sum()

print(f"number of invalid dates: {invalid_dates_count}")

print(f"\nValid Dates Range:")
print(f"Minimun Date: {df['Date'].min()}")
print(f"Maximiun Date: {df['Date'].max()}")

number of invalid dates: 0

Valid Dates Range:
Minimun Date: 1908-09-17 00:00:00
Maximiun Date: 2009-06-08 00:00:00


In [131]:
# Verificar fechas menores a 1908 o mayores a 2025
fecha_minima = pd.to_datetime('1908-01-01')
fechas_menores_1908 = df[df['Date'] < fecha_minima]
print(f"Número de fechas menores a 1908: {len(fechas_menores_1908)}")
fecha_maxima = pd.to_datetime('2025-09-17')
fechas_mayores_2025 = df[df['Date'] > fecha_maxima] 
print(f"Número de fechas mayores a 2025: {len(fechas_mayores_2025)}")


Número de fechas menores a 1908: 0
Número de fechas mayores a 2025: 0


In [132]:
#Time

print(f"Valores faltantes: {df['Time'].isna().sum()} ({df['Time'].isna().mean()*100:.1f}%)")

# Verify format HH:MM with regular expressions
weird_format = df['Time'].dropna()[~df['Time'].dropna().str.match(r'^\d{2}:\d{2}$')]
print(f"Format not HH:MM: {len(weird_format)}")


Valores faltantes: 2219 (42.1%)
Format not HH:MM: 19


In [133]:
numeric_cols = ['Aboard', 'Fatalities', 'Ground']
for col in numeric_cols:
    invalid_negatives = df[df[col] < 0]
    if not invalid_negatives.empty:
        print(f"\n{col} tiene valores negativos: {len(invalid_negatives)}")
    
    print(f"{col} - Máximo valor encontrado: {df[col].max()}")

Aboard - Máximo valor encontrado: 644.0
Fatalities - Máximo valor encontrado: 583.0
Ground - Máximo valor encontrado: 2750.0


In [134]:
invalid_fatalities = df[df['Fatalities'] > df['Aboard']]
print(f"Registros donde Fatalities > Aboard: {len(invalid_fatalities)}")

Registros donde Fatalities > Aboard: 0


In [135]:
ground_invalid = df[df['Ground'] < 0]
print(f"Registros con Ground negativo: {len(ground_invalid)}")

Registros con Ground negativo: 0


In [136]:
high_ground = df[df['Ground'] > df['Aboard'] * 5]  # Umbral ejemplo
print(f"Registros con Ground extremadamente alto: {len(high_ground)}")

Registros con Ground extremadamente alto: 34


## Consistencia

In [137]:
# 1. Fatalities no puede ser mayor a Aboard
inconsist_fatalities = df[df['Fatalities'] > df['Aboard']]

# 2. Aboard = 0 pero Fatalities > 0
inconsist_aboard_zero = df[(df['Aboard'] == 0) & (df['Fatalities'] > 0)]

# 3. Ground no puede ser negativo
inconsist_ground = df[df['Ground'] < 0]

# 4. Mostrar cantidades
print("Registros con Fatalities > Aboard:", inconsist_fatalities.shape[0])
print("Registros con Aboard = 0 y Fatalities > 0:", inconsist_aboard_zero.shape[0])
print("Registros con Ground negativo:", inconsist_ground.shape[0])

Registros con Fatalities > Aboard: 0
Registros con Aboard = 0 y Fatalities > 0: 0
Registros con Ground negativo: 0


In [138]:
category_formats = ['Location', 'Operator', 'Type', 'Route', 'Registration', 'cn/In']

# 2. Function to find variations
def find_variations(column_name, threshold=10):
    print(f"\n=== VARIATIONS IN {column_name.upper()} ===")
    count = df[column_name].value_counts()
    
    # Show values with few occurrences (potential variations)
    variations = count[count <= threshold]
    print(f"Values with less than {threshold} occurrences:")
    print(variations)
    
    return variations

# Review variations in all columns
for column in category_formats:
    find_variations(column, threshold=5)

print("=== ROUTE COLUMN ANALYSIS ===")
print("Unique values in Route (top 15):")
print(df['Route'].value_counts().head(15))

print("=== TYPE COLUMN ANALYSIS ===") 
print("Unique values in Type (top 15):")
print(df['Type'].value_counts().head(15))


=== VARIATIONS IN LOCATION ===
Values with less than 5 occurrences:
Location
Cheyenne, Wyoming                    5
Near Mexico City, Mexico             5
Medellin, Colombia                   5
Quito, Ecuador                       5
Seattle, Washington                  5
                                    ..
Near Charana, Bolivia                1
Monte Matto, Italy                   1
Misaki Mountain, Japan               1
Angelholm, Sweden                    1
State of Arunachal Pradesh, India    1
Name: count, Length: 4264, dtype: int64

=== VARIATIONS IN OPERATOR ===
Values with less than 5 occurrences:
Operator
Military - Indonesian Air Force        5
Aeronaves de Mexico                    5
Syndicato Condor                       5
LANSA                                  5
Military - Nicaraguan Air Force        5
                                      ..
Military - Argentine Navy              1
Richland Flying Service - Air Taxii    1
Harbor Airlines - Air Taxi             1
Aerovi

In [139]:
df_temp = df.copy()

for col in category_formats:
    df_temp[col] = df_temp[col].astype(str).str.lower().str.strip()

weird_locations = df_temp[df_temp['Location'].notna() & ~df_temp['Location'].str.match(r"^[a-zà-ÿ\s\-,]+$")]
print(f"Ubicaciones con caracteres inválidos: {len(weird_locations)}")

Ubicaciones con caracteres inválidos: 172


In [140]:
weird_operators = df_temp[df_temp['Operator'].notna() & ~df_temp['Operator'].str.match(r"^[a-z0-9\s\-\&\.,]+$")]
print(f"Operadores con caracteres inválidos: {len(weird_operators)}")

Operadores con caracteres inválidos: 345


In [141]:
for col in ['Registration', 'cn/In']:
    duplicated_ids = df_temp[df_temp.duplicated(subset=[col], keep=False)]
    print(f"{col} con posibles duplicados: {len(duplicated_ids)}")

Registration con posibles duplicados: 390
cn/In con posibles duplicados: 1816


In [142]:
invalid_registration = df_temp[df_temp['Registration'].notna() & ~df_temp['Registration'].str.match(r"^[a-z]{0,2}-?[0-9]{3,6}$")]
print(f"Registros con formato inválido en 'Registration': {len(invalid_registration)}")

Registros con formato inválido en 'Registration': 4237


## Unicidad

In [143]:
# --- Revisar duplicados exactos en todo el dataset ---
n_exact_dups = df.duplicated().sum()
print(f"Duplicados EXACTOS en todo el registro: {n_exact_dups}")

# --- Revisar duplicados por clave lógica (Date + Location + Operator + Type) ---
key_cols = ['Date', 'Location', 'Operator', 'Type']
is_dup_key = df.duplicated(subset=key_cols, keep=False)

n_dup_key = is_dup_key.sum()
print(f"Duplicados por clave {key_cols}: {n_dup_key}")


Duplicados EXACTOS en todo el registro: 0
Duplicados por clave ['Date', 'Location', 'Operator', 'Type']: 0


## Validez

In [144]:
print("=== VALIDITY CHECKS ===")
invalid_dates_count = df['Date'].isna().sum()
print(f"Fechas inválidas convertidas a NaT: {invalid_dates_count}")
# --- 1. Date validity ---
# Fechas fuera del rango permitido
fecha_minima = pd.to_datetime('1908-01-01')
fecha_maxima = pd.to_datetime('2025-09-17')
invalid_dates = df[(df['Date'] < fecha_minima) | (df['Date'] > fecha_maxima) | (df['Date'].isna())]
print(f"Fechas inválidas (NaT o fuera de rango): {len(invalid_dates)}")

# --- 2. Time validity ---
# Validar formato HH:MM (24 horas)
invalid_time_format = df['Time'].dropna()[~df['Time'].dropna().str.match(r'^[0-2]\d:[0-5]\d$')]
print(f"Tiempos con formato inválido: {len(invalid_time_format)}")


=== VALIDITY CHECKS ===
Fechas inválidas convertidas a NaT: 0
Fechas inválidas (NaT o fuera de rango): 0
Tiempos con formato inválido: 19


## Timeliness

In [145]:
print("\n=== TIMELINESS CHECKS ===")

# 1. Fechas mínima y máxima en el dataset
min_date = df['Date'].min()
max_date = df['Date'].max()
print(f"Rango de fechas en el dataset: {min_date} a {max_date}")

# 2. Revisar cuántos registros están en el último año
import datetime
today = pd.to_datetime('today')

one_year_ago = today - pd.DateOffset(years=1)
recent_records = df[df['Date'] >= one_year_ago]

print(f"Registros en el último año ({one_year_ago.date()} a {today.date()}): {len(recent_records)}")

# 3. Conteo de registros por año para detectar posibles años sin datos
yearly_counts = df['Date'].dt.year.value_counts().sort_index()

print("\nRegistros por año:")
print(yearly_counts)

# Detectar años con pocos registros (posible problema de carga)
threshold = 5
low_activity_years = yearly_counts[yearly_counts < threshold]

print("\nAños con menos de 5 registros:")
print(low_activity_years)


=== TIMELINESS CHECKS ===
Rango de fechas en el dataset: 1908-09-17 00:00:00 a 2009-06-08 00:00:00
Registros en el último año (2024-09-17 a 2025-09-17): 0

Registros por año:
Date
1908     1
1912     1
1913     3
1915     2
1916     5
        ..
2005    51
2006    49
2007    54
2008    62
2009    24
Name: count, Length: 98, dtype: int64

Años con menos de 5 registros:
Date
1908    1
1912    1
1913    3
1915    2
1918    4
Name: count, dtype: int64


# Cleaning

In [146]:
df.drop(columns=['Flight #'], inplace=True)
print("Columna 'Flight #' eliminada.")

Columna 'Flight #' eliminada.


In [147]:
def clean_time(time_str):
    if pd.isna(time_str) or time_str.strip() == '':
        return 'Not Specified'
    
    # Validar formato HH:MM
    if re.match(r'^\d{2}:\d{2}$', time_str.strip()):
        return time_str.strip()
    
    # Si tiene formato como 7:30 -> 07:30
    if re.match(r'^\d{1}:\d{2}$', time_str.strip()):
        return '0' + time_str.strip()
    
    return 'Not Specified'

df['Time'] = df['Time'].apply(clean_time)

In [148]:
def normalize_text(text):
    if pd.isna(text) or str(text).strip() == '':
        return 'Not Specified'
    return str(text).strip().lower()

category_columns = ['Location', 'Operator', 'Route', 'Type', 'Registration', 'cn/In', 'Summary']
for col in category_columns:
    df[col] = df[col].apply(normalize_text)

# delete the spaces at the beginning and end
df['Type'] = df['Type'].str.strip()
df['Location'] = df['Location'].str.strip().str.title()
df['Operator'] = df['Operator'].str.strip().str.title()
df['Type'] = df['Type'].str.strip().str.lower()

In [149]:
def clean_operator(operator):
    if operator == 'Not Specified':
        return operator

    # Mapeo básico de nombres frecuentes
    operator_mapping = {
        r'aeroflot': 'Aeroflot',
        r'united airlines': 'United Airlines',
        r'military - u.s. army': 'Military - USA',
        r'military - u.s. air force': 'Military - USAF',
        r'air france': 'Air France',
        r'american airlines': 'American Airlines',
        r'pan am': 'Pan Am'
    }
    
    for pattern, replacement in operator_mapping.items():
        operator = re.sub(pattern, replacement, operator, flags=re.IGNORECASE)
    
    return operator.title()

df['Operator'] = df['Operator'].apply(clean_operator)

In [150]:
# Normalizamos nombres de columnas quitando espacios extra
df.columns = [c.strip() for c in df.columns]
# index,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary
df = df.rename(columns={'index':'Index','Flight #':'FlightNumber'})

In [151]:
def clean_location(location):
    """
    Automated cleaning for worldwide locations
    Maintains original format but removes common inconsistencies
    """
    if pd.isna(location):
        return location
    
    location = str(location)
    
    # Remove common prefixes and unnecessary words
    remove_patterns = [
        r'^near\s+', r'^approx\s+', r'^around\s+', r'^close to\s+',
        r'\s+\(.*?\)',  # Remove anything in parentheses
        r'\s+\[.*?\]',  # Remove anything in brackets
        r'\s+off\s+',   # Remove 'off' 
        r'\s+area$', r'\s+region$',  # Remove area/region suffixes
    ]
    
    for pattern in remove_patterns:
        location = re.sub(pattern, '', location, flags=re.IGNORECASE)
    
    # Standardize country names (minimal essential mapping)
    country_mapping = {
        r'USA|U\.S\.A\.|United States|U\.S\.': 'USA',
        r'UK|U\.K\.|United Kingdom|England|Britain': 'UK',
        r'Russia|USSR|Soviet Union': 'Russia',
        r'\.': '',  # Remove dots from abbreviations
    }
    
    for pattern, replacement in country_mapping.items():
        location = re.sub(pattern, replacement, location, flags=re.IGNORECASE)
    
    # Clean up formatting
    location = location.strip()
    location = re.sub(r'\s+', ' ', location)  # Remove extra spaces
    location = location.title()  # Title case
    
    return location

# Apply automated cleaning
df['Location'] = df['Location'].apply(clean_location)

print("Location cleaning completed!")

Location cleaning completed!


In [152]:
def categorize_route(route):
    """
    Minimal route categorization - just the essentials
    """
    if pd.isna(route) or (route_str := str(route).strip()) == '':
        return 'Not Specified'
    
    # Only handle the most common cases
    if route_str.lower() == 'not specified':
        return 'Not Specified'
    elif route_str.lower() == 'test':
        return 'Test Flight'
    elif route_str.lower() == 'test flight':
        return 'Test Flight'
    elif route_str.lower() in ['training', 'sightseeing', 'demonstration']:
        return route_str.title()
    
    # For routes, just do basic cleaning
    if '-' in route_str or ' to ' in route_str.lower():
        return re.sub(r'\s+to\s+', ' - ', route_str, flags=re.IGNORECASE).title()
    
    return route_str.title()


df['Route'] = df['Route'].apply(categorize_route)

In [153]:
def standardize_manufacturers(aircraft_type):
    """
    Standardize manufacturer names to consistent capitalization
    """
    if pd.isna(aircraft_type) or (type_str := str(aircraft_type).strip()) == '':
        return 'Not Specified'
    
    if type_str.lower() == 'not specified':
        return 'Not Specified'
    
    # Estandarizar nombres de fabricantes (similar al 'to' de rutas)
    manufacturer_standardization = {
        r'\bde havilland\b': 'De Havilland',
        r'\bdouglas\b': 'Douglas',
        r'\bboeing\b': 'Boeing', 
        r'\bairbus\b': 'Airbus',
        r'\bantonov\b': 'Antonov',
        r'\byakovlev\b': 'Yakovlev',
        r'\bjunkers\b': 'Junkers',
        r'\bcurtiss\b': 'Curtiss',
        r'\bbreguet\b': 'Breguet'
    }
    
    # Aplicar estandarización de fabricantes
    for pattern, replacement in manufacturer_standardization.items():
        type_str = re.sub(pattern, replacement, type_str, flags=re.IGNORECASE)
    
    return type_str

# Apply manufacturer standardization
df['Type'] = df['Type'].apply(standardize_manufacturers)

# Verify results
print("Manufacturer standardization completed!")
print("\nTop 15 standardized types:")
print(df['Type'].value_counts().head(15))

Manufacturer standardization completed!

Top 15 standardized types:
Type
Douglas dc-3                                334
De Havilland canada dhc-6 twin otter 300     81
Douglas c-47a                                74
Douglas c-47                                 62
Douglas dc-4                                 40
Antonov an-26                                38
Yakovlev yak-40                              38
Junkers ju-52/3m                             37
Douglas c-47b                                29
De Havilland dh-4                            28
Douglas dc-6b                                27
Not Specified                                27
Antonov an-12                                24
Breguet 14                                   23
Curtiss c-46a                                21
Name: count, dtype: int64


In [154]:
from datetime import datetime

# Convertir a datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Fechas futuras
today = datetime.today()
futuras = df[df['Date'] > today]
print("\nRegistros con fecha en el futuro:", len(futuras))


print("\nCantidad de registros por década:")
print(((df['Date'].dt.year // 10) * 10).value_counts().sort_index())


Registros con fecha en el futuro: 0

Cantidad de registros por década:
Date
1900      1
1910     27
1920    178
1930    323
1940    510
1950    596
1960    721
1970    837
1980    717
1990    775
2000    583
Name: count, dtype: int64


In [155]:
# Calcular la mediana de Aboard por tipo de avión
aboard_median_by_type = df.groupby('Type')['Aboard'].median()

# Llenar usando la mediana de su grupo
df['Aboard'] = df.apply(
    lambda row: aboard_median_by_type[row['Type']] if pd.isna(row['Aboard']) else row['Aboard'],
    axis=1
)
aboard_median_general = df['Aboard'].median()

df['Aboard'] = df['Aboard'].fillna(aboard_median_general)

In [156]:
df.loc[df['Fatalities'] > df['Aboard'], 'Fatalities'] = df['Aboard']
df.loc[(df['Aboard'] == 0) & (df['Fatalities'] > 0), 'Fatalities'] = 0
df.loc[df['Ground'] < 0, 'Ground'] = 0

In [157]:
df['Registration'] = df['Registration'].fillna('Not Specified').str.upper().str.strip()
df['cn/In'] = df['cn/In'].fillna('Not Specified').str.upper().str.strip()

In [158]:
# Crear un flag (0 = dato presente, 1 = dato faltante)
df['Flag_Missing_Ground'] = df['Ground'].isna().astype(int)
missing_ground_percentage = df['Flag_Missing_Ground'].mean() * 100
print(f"Porcentaje de registros con Ground faltante: {missing_ground_percentage:.2f}%")

Porcentaje de registros con Ground faltante: 0.42%


In [159]:
# Crear un flag (0 = dato presente, 1 = dato faltante)
df['Flag_Missing_Fatalities'] = df['Fatalities'].isna().astype(int)
missing_Fatalities_percentage = df['Flag_Missing_Fatalities'].mean() * 100
print(f"Porcentaje de registros con Fatalities faltante: {missing_Fatalities_percentage:.2f}%")

Porcentaje de registros con Fatalities faltante: 0.23%


In [160]:
df['Aboard'] = df['Aboard'].astype(int)
df['Fatalities'] = df['Fatalities'].fillna(0).astype(int)  

## Final Dataframe

In [161]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268 entries, 0 to 5267
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Index                    5268 non-null   int64         
 1   Date                     5268 non-null   datetime64[ns]
 2   Time                     5268 non-null   object        
 3   Location                 5268 non-null   object        
 4   Operator                 5268 non-null   object        
 5   Route                    5268 non-null   object        
 6   Type                     5268 non-null   object        
 7   Registration             5268 non-null   object        
 8   cn/In                    5268 non-null   object        
 9   Aboard                   5268 non-null   int64         
 10  Fatalities               5268 non-null   int64         
 11  Ground                   5246 non-null   float64       
 12  Summary                  5268 non-

In [162]:
print("\nResumen final de valores nulos:")
print(df.isnull().mean() * 100)



Resumen final de valores nulos:
Index                      0.000000
Date                       0.000000
Time                       0.000000
Location                   0.000000
Operator                   0.000000
Route                      0.000000
Type                       0.000000
Registration               0.000000
cn/In                      0.000000
Aboard                     0.000000
Fatalities                 0.000000
Ground                     0.417616
Summary                    0.000000
Flag_Missing_Ground        0.000000
Flag_Missing_Fatalities    0.000000
dtype: float64


In [163]:
df.head()

Unnamed: 0,Index,Date,Time,Location,Operator,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary,Flag_Missing_Ground,Flag_Missing_Fatalities
0,0,1908-09-17,17:18,"Fort Myer, Virginia",Military - Usa,Demonstration,wright flyer iii,NOT SPECIFIED,1,2,1,0.0,"during a demonstration flight, a u.s. army fly...",0,0
1,1,1912-07-12,06:30,"Atlanticity, New Jersey",Military - U.S. Navy,Test Flight,dirigible,NOT SPECIFIED,NOT SPECIFIED,5,5,0.0,first u.s. dirigible akron exploded just offsh...,0,0
2,2,1913-08-06,Not Specified,"Victoria, British Columbia, Canada",Private,Not Specified,Curtiss seaplane,NOT SPECIFIED,NOT SPECIFIED,1,1,0.0,the first fatal airplane accident in canada oc...,0,0
3,3,1913-09-09,18:30,Over The North Sea,Military - German Navy,Not Specified,zeppelin l-1 (airship),NOT SPECIFIED,NOT SPECIFIED,20,14,0.0,the airship flew into a thunderstorm and encou...,0,0
4,4,1913-10-17,10:30,"Johannisthal, Germany",Military - German Navy,Not Specified,zeppelin l-2 (airship),NOT SPECIFIED,NOT SPECIFIED,30,30,0.0,hydrogen gas which was being vented was sucked...,0,0


# Puntaje Final

In [164]:
df.to_csv("dataframe_final.csv", index=False, encoding='utf-8-sig')

In [165]:
df_clean = pd.read_csv("dataframe_final.csv", encoding='utf-8-sig')

In [166]:
import re
from datetime import datetime

# --- Variables iniciales ---
total_rows = df_clean.shape[0]
total_values = df_clean.shape[0] * df_clean.shape[1]

# --- Completitud ---
non_null_values = df_clean.notnull().sum().sum()
completitud_score = (non_null_values / total_values) * 100
print(f"Completitud: {completitud_score:.2f}%")

# --- Validez ---
valid_fatalities = df_clean[df_clean['Fatalities'] <= df_clean['Aboard']].shape[0]
valid_ground = df_clean[df_clean['Ground'] >= 0].shape[0]

# Aceptar 'Not Specified' como válido en Time
valid_time = df_clean['Time'].dropna().apply(
    lambda x: bool(re.match(r'^\d{1,2}:\d{2}$', str(x))) or x == 'Not Specified'
).sum()

total_time = df_clean['Time'].notna().sum()

validez_score = ((valid_fatalities + valid_ground + valid_time) / (total_rows + total_rows + total_time)) * 100
print(f"Validez: {validez_score:.2f}%")

# --- Exactitud ---
valid_aboard = df_clean[(df_clean['Aboard'] >= 0) & (df_clean['Aboard'] <= 1000)].shape[0]
valid_fatalities_count = df_clean[df_clean['Fatalities'] >= 0].shape[0]

exactitud_score = ((valid_aboard + valid_fatalities_count) / (2 * total_rows)) * 100
print(f"Exactitud: {exactitud_score:.2f}%")

# --- Consistencia ---
rule_1 = (df_clean['Fatalities'] <= df_clean['Aboard'])
rule_2 = (df_clean['Ground'] >= 0)
rule_3 = ((df_clean['Aboard'] == 0) & (df_clean['Fatalities'] == 0)) | ((df_clean['Aboard'] > 0) & df_clean['Fatalities'].notna())

def valid_time_func(x):
    return pd.isna(x) or bool(re.match(r'^\d{1,2}:\d{2}$', str(x))) or x == 'Not Specified'

rule_4 = df_clean['Time'].apply(valid_time_func)

total_rules = rule_1.sum() + rule_2.sum() + rule_3.sum() + rule_4.sum()
max_rules = total_rows * 4

consistencia_score = (total_rules / max_rules) * 100
print(f"Consistencia: {consistencia_score:.2f}%")

# --- Unicidad ---
unique_rows = df_clean.drop_duplicates().shape[0]
unicidad_score = (unique_rows / total_rows) * 100
print(f"Unicidad: {unicidad_score:.2f}%")

# --- Oportunidad ---
df_clean['Date'] = pd.to_datetime(df_clean['Date'], errors='coerce')
df_clean['Year'] = df_clean['Date'].dt.year

decade_counts = df_clean.groupby((df_clean['Year'] // 10) * 10).size()

last_decade = decade_counts.iloc[-1]
max_decade = decade_counts.max()

timeliness_score = (last_decade / max_decade) * 100
print(f"Oportunidad basada en última década: {timeliness_score:.2f}%")

# --- Score global ---
global_quality_score = (
    completitud_score +
    validez_score +
    exactitud_score +
    consistencia_score +
    unicidad_score +
    timeliness_score
) / 6

print("\n=== SCORE GLOBAL DE CALIDAD ===")
print(f"Calidad total de los datos: {global_quality_score:.2f}%")

Completitud: 99.97%
Validez: 99.86%
Exactitud: 100.00%
Consistencia: 99.90%
Unicidad: 100.00%
Oportunidad basada en última década: 69.65%

=== SCORE GLOBAL DE CALIDAD ===
Calidad total de los datos: 94.90%
