#### extración del primer dataset

In [60]:
# Importamos todas las librerías 
# [Import libraries]
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date, timedelta, datetime

In [63]:
# Leemos y hacemos una copia del dataset provisto por el Project Manager 
# [Read and make a copy of the dataset provide by our Project Manager]
flight_accident = pd.read_csv('AccidentesAviones.csv', delimiter=",")

## Análisis exploratorio de datos (EDA) - [Exploratory Data Analysis]

### Veamos cómo se compone el dataset - [Let´s see the data in our dataset]

In [64]:
#Tomamos algunos datos sin orden para ver como se compone. No elegimos .head() porque estan ordenados por fecha
# [We chose some samples to see the data. I didn't use .head() because is order by date, so could misslead the information]
flight_accident.sample(5)

Unnamed: 0.1,Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,flight_no,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
3857,3857,"October 26, 1993",1304,"Fuzhou, China",China Eastern Airlines,5398,Shenzen - Fuzhou,McDonnell Douglas MD-82,B-2103,49355,80,71,9,2,2,0,0,"While landing during a rainstorm, the aircraft..."
3356,3356,"October 11, 1985",2140,"Homer City, Pennsylvania",Mountain Air Cargo,?,State College - Pittsburgh,de Havilland Canada DHC-6 Twin Otter 200,N3257,192,1,0,1,1,0,1,0,The cargo plane struck high ground while en ro...
2725,2725,"January 06, 1974",1805,"Johnstown, Pennsylvania",Air East,?,"Pittsburgh, PA - Johnstown, PA",Beechcraft 99A,N125AE,U-125,17,15,2,12,11,1,0,Crashed short of the runway. Failed to mainta...
4972,4972,"May 18, 2018",1208,"Havana, Cuba",Cubana (leased from Global Air),972,Havana- Holguin,Boeing 737-201,XA-UHZ,21816/592,113,107,6,112,106,6,0,After taking off from runway 06 at Havana-José...
369,369,"April 06, 1935",1518,"Brilon, Germany",KLM Royal Dutch Airlines,?,Prague Leipzig Essen Amsterdam Rotterdam,Fokker F-12,PH-AFL,5242,7,2,5,7,2,5,0,Crashed after the pilot tried to maintain visu...


In [65]:
# Vamos a eliminar la columna 'Unnamed' ya que parece que se hizo una duplicidad del índice
# [The column ''Unnamed:0' seems to be an error. We take it out]
flight_accident.drop(['Unnamed: 0'], axis=1, inplace=True)

In [66]:
#[Check de dataset]
flight_accident

Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,flight_no,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
0,"September 17, 1908",1718,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2,1,1,1,1,0,0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1,0,1,1,0,0,0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",0630,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5,0,5,5,0,5,0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1,0,1,1,0,1,0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",1830,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20,?,?,14,?,?,0,The airship flew into a thunderstorm and encou...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5003,"March 28, 2021",1835,"Near Butte, Alaska",Soloy Helicopters,?,Sightseeing Charter,Eurocopter AS350B3 Ecureuil,N351SH,4598,6,5,1,5,4,1,0,The sightseeing helicopter crashed after missi...
5004,"May 21, 2021",1800,"Near Kaduna, Nigeria",Military - Nigerian Air Force,?,?,Beechcraft B300 King Air 350i,NAF203,FL-891,11,7,4,11,7,4,0,"While on final approach, in poor weather condi..."
5005,"June 10, 2021",0800,"Near Pyin Oo Lwin, Myanmar",Military - Myanmar Air Force,?,Naypyidaw - Anisakan,Beechcraft 1900D,4610,E-325,14,12,2,12,11,1,0,The plane was carrying military personnel and ...
5006,"July 04, 2021",11:30,"Patikul, Sulu, Philippines",Military - Philippine Air Force,?,Cagayan de Oro-Lumbia - Jolo,Lockheed C-130H Hercules,5125,5125,96,88,8,50,?,?,3,"While attempting to land at Jolo Airport, the ..."


### Veamos los valores nulos [Let's check the null/na values]

In [69]:
#Como los valores nulos tienen un signo de pregunta los vamos a remplazar con NaN para trabajarlo fácilmente
#[replace '?' with NaN to easy manipulate data]
flight_accident = flight_accident.replace('?', np.NaN)

In [78]:
# Veo % de nulos en todo el dataset
# Let's see % null/na on our dataset
na_ratio = ((flight_accident.isnull().sum() / len(flight_accident))*100).sort_values(ascending = False)
print(na_ratio)

flight_no                 73.522364
HORA declarada            30.031949
route                     15.215655
cn_ln                     13.318690
registration               5.431310
crew_fatalities            4.692492
passenger_fatalities       4.692492
PASAJEROS A BORDO          4.412939
crew_aboard                4.373003
summary                    1.178115
ground                     0.878594
all_aboard                 0.339457
ac_type                    0.259585
OperadOR                   0.199681
cantidad de fallecidos     0.159744
Ruta                       0.099840
fecha                      0.000000
dtype: float64


In [88]:
flight_accident[flight_accident['flight_no'].notna()]

Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,flight_no,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
141,"January 16, 1928",,"Solis, Uruguay",Aeropostale,F-AIKG,Buenos Aires - Montevideo,Latecoere 25,F-AIKG,633,2,0,2,2,0,2,0,The wing on the mail plane detached from the a...
217,"January 19, 1930",18:23,"Oceanside, California",Maddux Airlines,7,"Aqua Caliente, Mexico - Los Angeles",Ford 5-AT-C Tri Motor,NC9689,5-AT-046,16,14,2,16,14,2,0,"While en route to Los Angeles, the pilot, flyi..."
247,"March 31, 1931",1045,"Bazaar, Kansas",Trans Continental and Western Air,599,Kansas City - Wichita - Los Angeles,Fokker F10A Trimotor,NC-999,1063,8,6,2,8,6,2,0,"Shortly after taking off from Kansas City, one..."
348,"August 31, 1934",2245,"Amazonia, Missouri",Rapid Air Transport,6,Kansas City - Saint Joseph - Omaha,Stinson SM-6000B,NC11118,5004,5,4,1,5,4,1,0,The plane crashed about 11 miles from St. Jose...
371,"May 06, 1935",0330,"Atlanta, Missouri",Trans Continental and Western Air,6,Los Angeles - Albuquerque - Kanasas City - Wa...,Douglas DC-2-112,NC13785,1295,14,12,2,5,3,2,0,The plane crashed while en route from Albuquer...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,"February 05, 2020",1819,"Istanbul, Turkey",Pegasus Airlines,PC2193,Izmir - Istanbul,Boeing 737-86J,TC-IZK,37742,177,171,6,3,3,0,0,While landing duriing high winds and heavey ra...
4997,"May 22, 2020",1440,"Karachi, Pakistan",Pakistan International Airline,PK8303,Lahore - Karachi,Airbus A320-214,AP-BLD,2274,99,91,8,97,89,8,1,"While landing, the aircraft scraped the runway..."
4998,"August 07, 2020",1914,"Calicut, India",Air India Exppress,IX344,Dubai - Calicut,Boeing 737-8HG,VT-AXH,36323/2108,190,184,6,20,18,2,0,The flight IX344 suffered a runway excursion w...
5001,"January 09, 2021",1440,"Near Jakarta, Indonesia",Sriwijaya Air,SJ182,Jakarta - Pontianak,Boeing 737-524,PK-CLC,27323/2616,62,56,6,62,56,6,0,Sriwijaya Air flight 182 was climbing through ...


Este es el primer punto donde notamos inconcistencia, ya que algunos vuelos tienen números y otros letras y números. 
[Some problems here, there are some values just with numbers and other with letters+numers.]


Después de investigar un poco, resulta que si bien hay una generalidad en cuanto a cómo se compone el número de vuelo, no es una regla.
Por este motivo, no estamos en posición de sustituir los datos. 

[After some research, we notice that 'Flight_number' cannot be made in a unique way. Therefore, we can't add this information/ (at this stage)]

https://blueskypit.com/2020/03/09/deciphering-the-digits-in-your-flight-number/#:~:text=With%20a%20few%20exceptions%2C%20flights,character%20code%20identifying%20the%20airline.

Además, los números pueden cambiar. Especialemnte si hubo un accidente en ese num de vuelo. Podemos ver algunas explicaciones en estas fuentes:
[Flight numbers can be changed, specially if there were an accident involved. Some deeper explain below]

- https://blueskypit.com/2020/03/09/deciphering-the-digits-in-your-flight-number/#:~:text=With%20a%20few%20exceptions%2C%20flights,character%20code%20identifying%20the%20airline.

- https://aviation.stackexchange.com/questions/33766/why-do-airlines-retire-the-flight-number-after-a-crash

In [89]:
# Podemos chequear que en general los valores que se repiten son número que no parecen representar un num de vuelo real.
# [We can see there are only a few repetition and for what is look like a ramdon number. This column don't bring info]
flight_accident['flight_no'].value_counts()

-        36
1        11
101       9
4         7
901       7
         ..
1036      1
696       1
446       1
631       1
SJ182     1
Name: flight_no, Length: 892, dtype: int64

In [92]:
flight_accident

Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
0,"September 17, 1908",1718,"Fort Myer, Virginia",Military - U.S. Army,Demonstration,Wright Flyer III,,1,2,1,1,1,1,0,0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",,"Juvisy-sur-Orge, France",,Air show,Wright Byplane,SC1,,1,0,1,1,0,0,0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",0630,"Atlantic City, New Jersey",Military - U.S. Navy,Test flight,Dirigible,,,5,0,5,5,0,5,0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",,"Victoria, British Columbia, Canada",Private,,Curtiss seaplane,,,1,0,1,1,0,1,0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",1830,Over the North Sea,Military - German Navy,,Zeppelin L-1 (airship),,,20,,,14,,,0,The airship flew into a thunderstorm and encou...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5003,"March 28, 2021",1835,"Near Butte, Alaska",Soloy Helicopters,Sightseeing Charter,Eurocopter AS350B3 Ecureuil,N351SH,4598,6,5,1,5,4,1,0,The sightseeing helicopter crashed after missi...
5004,"May 21, 2021",1800,"Near Kaduna, Nigeria",Military - Nigerian Air Force,,Beechcraft B300 King Air 350i,NAF203,FL-891,11,7,4,11,7,4,0,"While on final approach, in poor weather condi..."
5005,"June 10, 2021",0800,"Near Pyin Oo Lwin, Myanmar",Military - Myanmar Air Force,Naypyidaw - Anisakan,Beechcraft 1900D,4610,E-325,14,12,2,12,11,1,0,The plane was carrying military personnel and ...
5006,"July 04, 2021",11:30,"Patikul, Sulu, Philippines",Military - Philippine Air Force,Cagayan de Oro-Lumbia - Jolo,Lockheed C-130H Hercules,5125,5125,96,88,8,50,,,3,"While attempting to land at Jolo Airport, the ..."


In [91]:
# Por lo mencionado anteriormente vamos a eliminar la columna del
# For all the explain above, we will remove the fligh_no column
flight_accident.drop(['flight_no'], axis=1, inplace=True)

KeyError: "['flight_no'] not found in axis"