 # Opis notatnika
 Głównym celem w tym notatniku jest zasilenie bazy danych pobranymi danymi 
 > odpowiednie dostosowanie struktury danych z plików źródłowych do formatu zgodnego z `Postgres`, a następnie wgranie ich na serwer. 

## Połączenie z bazą danych
Konfiguracja połączenia

In [24]:
username = 'postgres'
password = '****'

host = 'localhost'
database = 'airlines'
port= '5432'

 Import wymaganych bibliotek

In [25]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from sqlalchemy import text

 Tworzę zmienne `url` oraz `engine`

In [26]:
url = URL.create(
    "postgresql",
    username=username,
    password=password,
    host=host,
    port=port,
    database=database,
)
engine = create_engine(url)

 # Załadowanie ramek do obszaru roboczego
 Implementacja funkcji `load_raw_data`, która przyjmuje jeden parametr:
 * `file_name` - nazwa pliku do zaczytania

 Jej zadaniem jest wczytanie surowego pliku, zmodyfikowanie nazw kolumn z `NAZWA_KOLUMNY` na `nazwa_kolumny` oraz zwrócenie tak zmodyfikowanej ramki danych.


In [27]:
def load_raw_data(file_name):
    file_path = r"..\data\raw\{}.csv".format(file_name)
    df_raw = pd.read_csv(file_path, index_col=0)
    df_raw.columns = df_raw.columns.str.lower()

    return df_raw

 #### Zaczytanie poszczególnych plików do ramek

In [28]:
aircraft_df = load_raw_data('aircraft')
aircraft_df.head(5)

Unnamed: 0,manufacture_year,tail_num,number_of_seats
0,1944,N54514,0.0
1,1945,N1651M,0.0
2,1953,N100CE,0.0
3,1953,N141FL,0.0
4,1953,N151FL,0.0


In [29]:
airport_list_df = load_raw_data('airport_list')
airport_list_df.head(5)

Unnamed: 0,origin_airport_id,display_airport_name,origin_city_name,name
0,11638,Fresno Air Terminal,"Fresno, CA","FRESNO YOSEMITE INTERNATIONAL, CA US"
1,13342,General Mitchell Field,"Milwaukee, WI","MILWAUKEE MITCHELL AIRPORT, WI US"
2,13244,Memphis International,"Memphis, TN","MEMPHIS INTERNATIONAL AIRPORT, TN US"
3,15096,Syracuse Hancock International,"Syracuse, NY","SYRACUSE HANCOCK INTERNATIONAL AIRPORT, NY US"
4,10397,Atlanta Municipal,"Atlanta, GA",ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...


In [30]:
airport_weather_df = load_raw_data('airport_weather')
airport_weather_df.head(5)

Unnamed: 0,wt18,station,name,date,awnd,prcp,snow,snwd,tavg,tmax,...,pgtm,wt10,wesd,sn32,sx32,psun,tsun,tobs,wt07,wt11
0,,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,2019-01-01,4.7,0.14,0.0,0.0,64.0,66.0,...,,,,,,,,,,
1,,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,2019-01-02,4.92,0.57,0.0,0.0,56.0,59.0,...,,,,,,,,,,
2,,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,2019-01-03,5.37,0.15,0.0,0.0,52.0,55.0,...,,,,,,,,,,
3,,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,2019-01-04,12.08,1.44,0.0,0.0,56.0,66.0,...,,,,,,,,,,
4,,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,2019-01-05,13.42,0.0,0.0,0.0,49.0,59.0,...,,,,,,,,,,


In [31]:
flight_df = load_raw_data('flight')
flight_df.head(5)

Unnamed: 0,month,day_of_month,day_of_week,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,...,crs_elapsed_time,actual_elapsed_time,distance,distance_group,year,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,1,20,7,WN,N204WN,682,10397,11292,605,602.0,...,205,204.0,1199,5,2019,,,,,
1,1,20,7,WN,N8682B,2622,10397,11292,2120,2114.0,...,210,205.0,1199,5,2019,,,,,
2,1,20,7,WN,N717SA,2939,10397,11292,1800,1807.0,...,210,220.0,1199,5,2019,4.0,0.0,10.0,0.0,3.0
3,1,20,7,WN,N709SW,3848,10397,11292,1355,1354.0,...,205,204.0,1199,5,2019,,,,,
4,1,20,7,WN,N7864B,1352,10397,11697,1125,1125.0,...,120,124.0,581,3,2019,,,,,


 # Eksport danych na bazę

 Implementacja metody `export_table_to_db`, która przyjmuje dwa argumenty:
 * `df` - ramka danych do eksportu,
 * `table_name` - nazwa ramki na bazie.

 Implementacja metody `export_row_nr_check`, która przyjmuje jeden argument:
 * `table_name` - nazwa ramki na bazie.

In [32]:
def export_table_to_db(df, table_name):
    print(f"Loading data into {table_name}...")
    df.to_sql(name=table_name, con=engine, if_exists='append', index=False, chunksize=1000)
    print(f"Data fully loaded to {table_name}.")

def export_row_nr_check(table_name):
    with engine.connect() as conn:
        result = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}"))
        for row in result:
            return row[0]

 ## Wgrywanie danych

 ### Wgranie `aircraft_df` do tabeli `aircraft`

In [33]:
export_table_to_db(aircraft_df, 'aircraft')

Loading data into aircraft...
Data fully loaded to aircraft.


In [34]:
export_row_nr_check('aircraft')

7383

 ### Wgranie `airport_weather_df` do tabeli `airport_weather`

In [35]:
export_table_to_db(airport_weather_df, 'airport_weather')

Loading data into airport_weather...
Data fully loaded to airport_weather.


In [36]:
export_row_nr_check('airport_weather')

46226

 ### Wgranie `flight_df` do tabeli `flight`
 > Wykonanie tej komórki może zająć kilka-kilknaście minut za względu na ilość danych w ramce.

In [37]:
export_table_to_db(flight_df, 'flight')

Loading data into flight...
Data fully loaded to flight.


In [38]:
export_row_nr_check('flight')

1386120

 ### Wgranie `airport_list_df` do tabeli `airport_list`

In [39]:
export_table_to_db(airport_list_df, 'airport_list')

Loading data into airport_list...
Data fully loaded to airport_list.


In [40]:
export_row_nr_check('airport_list')

97

 # Sprawdzenie poprawności wykonania notatnika
 Kody poniżej sprawdzają, czy ta część została poprawnie wykonana

In [41]:
def test_data_export(table_name, expected_count, expected_schema):
    real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]
    
    real_schema = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 0", engine)
    real_schema = set(real_schema.columns)

    expected_schema = set(expected_schema)

    diff = real_schema.symmetric_difference(expected_schema)

    assert len(diff) == 0, ('Nie zgadzają się kolumny tabel....'
    f'\tOczekiwano: {expected_schema}'
    f'\tOtrzymano: {real_schema}'
    f'\tRóżnica: {diff}')

    assert expected_count == real_count, \
        f'Nie zgadza się liczba wierszy, oczekiwano {expected_count}, otrzymano {real_count} - sprawdź, czy nie dane nie zostały wgrane do tabeli "{table_name}" więcej niż raz.'

 ### Sprawdzenie tabeli `aircraft`

In [42]:
aircraft_expected_count = 7383
aircraft_expected_schema = ['id', 'manufacture_year', 'tail_num', 'number_of_seats']

test_data_export('aircraft', aircraft_expected_count, aircraft_expected_schema)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ### Sprawdzenie tabeli `airport_weather`

In [43]:
airport_weather_expected_count = 46226
airport_weather_expected_schema = [
       'id', 'station', 'name', 'date', 'awnd', 'prcp', 'snow', 'snwd', 'tavg', 
       'tmax', 'tmin', 'wdf2', 'wdf5', 'wsf2', 'wsf5', 'wt01', 'wt08', 'wt02',
       'wt03', 'wt04', 'wt09', 'wt06', 'wt05', 'pgtm', 'wt10', 'wesd', 'sn32',
       'sx32', 'psun', 'tsun', 'tobs', 'wt07', 'wt11', 'wt18']

test_data_export('airport_weather', airport_weather_expected_count, airport_weather_expected_schema)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ### Sprawdzenie tabeli `flight`

In [44]:
flight_expected_count = 1386120
flight_expected_schema = [
       'id', 'month', 'day_of_month', 'day_of_week', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id',
       'crs_dep_time', 'dep_time', 'dep_delay_new', 'dep_time_blk',
       'crs_arr_time', 'arr_time', 'arr_delay_new', 'arr_time_blk',
       'cancelled', 'crs_elapsed_time', 'actual_elapsed_time', 'distance',
       'distance_group', 'year', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay']

test_data_export('flight', flight_expected_count, flight_expected_schema)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ### Sprawdzenie tabeli `airport_list`

In [45]:
airport_list_expected_count = 97
airport_list_expected_schema = ['id', 'origin_airport_id', 'display_airport_name', 'origin_city_name', 'name']

test_data_export('airport_list', airport_list_expected_count, airport_list_expected_schema)


  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 # Podsumowanie
 Baza danych zasilona. Teraz czas na pogłębioną analizę danych którą przeprowadzę w notatnikach `04_Analiza_danych...`.