In [1]:
import numpy as np
import pandas as pd
from psycopg2 import sql
import assets.script as script
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB

### Example usage of `script.py`

In [2]:
# Synthetic demo data: 100 rows of 10 standard-normal features plus an integer
# label drawn uniformly from 1..99 (the upper bound of `integers` is exclusive).
# A locally seeded Generator makes the data identical on every
# "Restart Kernel -> Run All"; the seed value itself is arbitrary.
SEED = 42
rng = np.random.default_rng(SEED)

x = [rng.standard_normal(10) for _ in range(100)]
y = rng.integers(1, 100, size=100)

In [3]:
# Name the ten feature columns after their position, spelled out in words.
feature_names = [
    'zero', 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
]
features = pd.DataFrame(x, columns=feature_names)

# Append the labels as a trailing 'target' column; both sides carry the
# default 0..n-1 index, so the join lines them up row by row.
df = features.join(pd.Series(y, name='target'))

In [4]:
# Pass the whole frame to the project helper, naming 'target' as the label
# column, then unpack the four pieces it returns.
split_parts = script.split_data(df, target='target')
X_train, X_test, y_train, y_test = split_parts


No columns dropped.

Target values: [81 97  5  4 57 26 27 64 92 15 38 70 31 35 38 12 68 87 12 76 78 46 92 50
 96 27 57 85 21 72 91 57 39 53 89 24 38 10 27 35 30 86 35 91 71  9 89 49
 99 17 22  8 60 39 55 95 52 46 91 62 23 74 67 76 41 60 95  4 25 76 34 54
 62 64  8 55 60 85  9 42 13 73  7 31 56 96 59 17 49 31 97 38 30 76 87 17
 94  8 37 91] 

Column(s) remaining: Index(['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
       'nine', 'target'],
      dtype='object')

Data is unscaled.
training sample size:  80
testing sample size:  20


In [5]:
# Example run of the prediction helper with a Gaussian naive Bayes estimator.
classifier = GaussianNB()
y_pred, y_prob = script.get_predictions(classifier, X_train, y_train, X_test)

In [6]:
# Example of the CSV helper: a 200-row pull from `flights`, stored under the
# file name below.
sample_csv = 'test_lee.csv'
query = """
SELECT * FROM flights
LIMIT 200
"""

# Kept as the cell's last expression so the returned DataFrame is displayed.
script.make_csv(query, sample_csv)

File exists. Returning DataFrame...


Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-01-07,DL,DL,DL,1521,DL,N811DZ,1521,13485,MSN,...,311.0,,,,,,,,,
1,2018-01-07,DL,DL,DL,1522,DL,N964DN,1522,14730,SDF,...,321.0,204.0,0.0,0.0,0.0,0.0,,,,
2,2018-01-07,DL,DL,DL,1523,DL,N340NB,1523,10721,BOS,...,632.0,,,,,,,,,
3,2018-01-07,DL,DL,DL,1524,DL,N948AT,1524,13487,MSP,...,596.0,,,,,,,,,
4,2018-01-07,DL,DL,DL,1525,DL,N963DN,1525,13487,MSP,...,1501.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2018-01-07,DL,DL,DL,1699,DL,N152DL,1699,14869,SLC,...,2935.0,,,,,,,,,
196,2018-01-07,DL,DL,DL,1700,DL,N319DN,1700,10721,BOS,...,946.0,,,,,,,,,
197,2018-01-07,DL,DL,DL,1701,DL,N6709,1701,10397,ATL,...,1747.0,,,,,,,,,
198,2018-01-07,DL,DL,DL,1702,DL,N362NW,1702,12953,LGA,...,1076.0,,,,,,,,,


## Midterm Project

#### Goals

The goal is to `predict arrival delays` of commercial flights. Often, there isn't much airlines can do to avoid delays, which are often costly. It is critical for airlines to estimate flight delays as accurately as possible, because the results can be applied to improve customer satisfaction and the income of airline agencies.

#### SQL Query

In [7]:
# Settings for the midterm sample: source table, row cap, and local CSV name.
table_name = 'flights_test'
limit = 10000
filename = 'flights_test_10k_sample.csv'

# The table name is injected as a quoted SQL identifier; the row cap stays a
# `%s` placeholder rather than being formatted into the statement here.
select_template = sql.SQL("SELECT * FROM {table} LIMIT %s")
query = select_template.format(table=sql.Identifier(table_name))

In [8]:
# Display what the project helper returns. In the saved output this is a Series
# of names (flights, flights_test, ...) - presumably the tables available in
# the database; confirm in script.py.
script.read_tables()

0             flights
1        flights_test
2    fuel_comsumption
3         pass_sample
4          passengers
5              sample
6          test_table
dtype: object

In [9]:
# Load the sample through the project helper. `limit` is passed alongside the
# composed query - presumably bound to its `%s` placeholder (confirm in script.py).
# The saved output ("File exists. Returning DataFrame...") suggests the helper
# reuses an existing CSV instead of re-running the query.
df_flights = script.make_csv(query, filename, limit)

File exists. Returning DataFrame...


In [10]:
# Inspect the column dtypes of the loaded sample. In the saved output `fl_date`
# is `object`, i.e. it has not been parsed as a datetime.
df_flights.dtypes

fl_date               object
mkt_unique_carrier    object
branded_code_share    object
mkt_carrier           object
mkt_carrier_fl_num     int64
op_unique_carrier     object
tail_num              object
op_carrier_fl_num      int64
origin_airport_id      int64
origin                object
origin_city_name      object
dest_airport_id        int64
dest                  object
dest_city_name        object
crs_dep_time           int64
crs_arr_time           int64
dup                   object
crs_elapsed_time       int64
flights                int64
distance               int64
dtype: object