In [375]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import airportsdata
from datetime import *
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix, make_scorer, fbeta_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import NearMiss

RSEED = 42

In [376]:
def rmse(y_pred, y_test):
    return np.sqrt(np.sum((y_pred[i]-y_test[i])**2 for i in range(len(y_test)))/len(y_test))

In [377]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [378]:
load_data = pd.DataFrame(airportsdata.load('IATA'))
airport_data = load_data.T.reset_index()
df_new = df_train.merge(airport_data, how= 'left', left_on ='DEPSTN', right_on ='index').merge(airport_data, how= 'left', left_on='ARRSTN', right_on='index')
df_new.columns = df_new.columns.str.replace('_x', '_dep').str.replace('_y', '_arr')

In [379]:
df_new["STD"]=df_new["STD"].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
df_new["STA"]=df_new["STA"].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H.%M.%S'))
df_new['year'] = df_new['STD'].apply(lambda x: x.year).apply(pd.to_numeric)
df_new['dep_date'] = df_new['STD'].apply(lambda x: datetime.strftime(x,'%m-%d'))
df_new['dep_time'] = df_new['STD'].apply(lambda x: datetime.strftime(x,'%H:%M:%S'))
df_new['arr_date'] = df_new['STA'].apply(lambda x: datetime.strftime(x,'%m-%d'))
df_new['arr_time'] = df_new['STA'].apply(lambda x: datetime.strftime(x,'%H:%M:%S'))
df_new['sch_duration'] = ((df_new['STA'] -df_new['STD'])/timedelta(minutes=1))

df_new = df_new.query(' (not (arr_date < dep_date and dep_date != "12-31"))')
df_new['dep_month'] = df_new['STD'].apply(lambda x: x.month).apply(pd.to_numeric)
df_new['arr_month'] = df_new['STA'].apply(lambda x: x.month).apply(pd.to_numeric)
df_new['dep_day'] = df_new['STD'].apply(lambda x: datetime.strftime(x,'%d')).apply(pd.to_numeric)
df_new['arr_day'] = df_new['STA'].apply(lambda x: datetime.strftime(x,'%d')).apply(pd.to_numeric)
df_new['dep_hour'] = df_new['STD'].apply(lambda x: x.hour).apply(pd.to_numeric)
df_new['arr_hour'] = df_new['STA'].apply(lambda x: x.hour).apply(pd.to_numeric)

df_new = df_new.query('DEPSTN != "SXF" and ARRSTN != "SXF"')
df_new.AC = df_new.AC.apply(lambda x: x.split(sep=' ')[0])

df_new['dur_cat'] = pd.cut(df_new.sch_duration, bins=[0, 120, 240, 360, 480, 600, 720, 840, 960, 1080, 1200, 1320, 1000000]).apply(lambda x: x.right).astype(int)

distances = pd.read_csv('data/distance.csv')
df_new = df_new.merge(distances, on='ID')

In [380]:
df_new['c_week'] = df_new['STA'].dt.week.astype(int)

In [381]:
df_new.head()

Unnamed: 0.1,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,...,dep_month,arr_month,dep_day,arr_day,dep_hour,arr_hour,dur_cat,Unnamed: 0,trav_dist,c_week
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,ATA,TU,260.0,...,1,1,3,3,10,12,240,0,1666.949101,53
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16:55:00,ATA,TU,20.0,...,1,1,13,13,15,16,120,1,983.064595,2
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU,0.0,...,1,1,16,16,4,6,240,2,1673.053088,2
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17:00:00,ATA,TU,0.0,...,1,1,17,17,14,17,240,3,1805.107067,2
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15:50:00,ATA,TU,22.0,...,1,1,17,17,14,15,120,4,626.108971,2


In [382]:
df_new[['sch_duration', 'dur_cat']].describe()

Unnamed: 0,sch_duration,dur_cat
count,107161.0,107161.0
mean,198.316057,3680.668154
std,4301.201501,58883.59178
min,10.0,120.0
25%,95.0,120.0
50%,140.0,240.0
75%,165.0,240.0
max,719520.0,1000000.0


In [383]:
df_new.dur_cat

0         240
1         120
2         240
3         240
4         120
         ... 
107156    240
107157    120
107158    480
107159    120
107160    120
Name: dur_cat, Length: 107161, dtype: int64

In [384]:
list1=[]
bestscore=130

In [385]:
new_cols = ['STATUS', 'AC', 'target', 'country_dep', 'country_arr', 'trav_dist', 'year', 'dep_month',  'dep_day' , 'dep_hour', 'arr_month', 'arr_day',  'arr_hour', 'dur_cat', 'c_week']
#new_cols = ['STATUS', 'AC', 'target', 'trav_dist', 'year', 'dep_month',  'dep_day' , 'dep_hour', 'arr_month', 'arr_day',  'arr_hour', 'dur_cat']

In [386]:
df_new = pd.get_dummies(data=df_new[new_cols], prefix='con', prefix_sep='_', drop_first=True)

In [387]:
rel_params = ['arr_month', 'year', 'con_SCH', 'dep_hour', 'arr_hour', 'dep_month', 'dep_day', 'arr_day', 'trav_dist', 'con_AT', 'con_TN', 'c_week', 'target']
df_new = df_new[rel_params]

In [388]:
scaler = StandardScaler()

In [389]:
scaler.fit_transform(df_new.loc[:, df_new.columns != 'target'])

array([[-1.72435204, -1.26225482, -0.37528113, ..., -1.15274783,
         0.86802117,  1.79015541],
       [-1.72435204, -1.26225482, -0.37528113, ..., -1.15274783,
         0.86802117, -1.75474603],
       [-1.72435204, -1.26225482, -0.37528113, ...,  0.86749242,
        -1.15204564, -1.75474603],
       ...,
       [ 1.31112234,  1.19689831,  2.66466904, ...,  0.86749242,
         0.86802117,  1.23409244],
       [-1.72435204,  1.19689831, -0.37528113, ...,  0.86749242,
         0.86802117, -1.61573029],
       [ 1.31112234,  1.19689831,  2.66466904, ...,  0.86749242,
         0.86802117,  1.30360031]])

In [390]:
low_range=df_new.query('trav_dist < 1500')
middle_range=df_new.query('trav_dist > 1500 & trav_dist<3500')
long_range=df_new.query('trav_dist > 3500')

In [391]:
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(low_range.drop(['target'], axis=1), low_range.target, random_state=RSEED)
X_train_middle, X_test_middle, y_train_middle, y_test_middle = train_test_split(middle_range.drop(['target'], axis=1), middle_range.target, random_state=RSEED)
X_train_long, X_test_long, y_train_long, y_test_long = train_test_split(long_range.drop(['target'], axis=1), long_range.target, random_state=RSEED)

In [392]:
lin_reg_low = LinearRegression()
lin_reg_middle = LinearRegression()
lin_reg_long = LinearRegression()

lin_reg_low.fit(X_train_low, y_train_low)
lin_reg_middle.fit(X_train_middle, y_train_middle)
lin_reg_long.fit(X_train_long, y_train_long)

LinearRegression()

In [393]:
y_train_low_pred = lin_reg_low.predict(X_train_low)
y_test_low_pred = lin_reg_low.predict(X_test_low)
y_train_middle_pred = lin_reg_middle.predict(X_train_middle)
y_test_middle_pred = lin_reg_middle.predict(X_test_middle)
y_train_long_pred = lin_reg_long.predict(X_train_long)
y_test_long_pred = lin_reg_long.predict(X_test_long)

In [394]:
print('low range flights: ', rmse(y_train_low_pred, y_train_low.to_numpy()), rmse(y_test_low_pred, y_test_low.to_numpy()))
print('middle range flights: ', rmse(y_train_middle_pred, y_train_middle.to_numpy()), rmse(y_test_middle_pred, y_test_middle.to_numpy()))
print('long range flights: ', rmse(y_train_long_pred, y_train_long.to_numpy()), rmse(y_test_long_pred, y_test_long.to_numpy()))

low range flights:  99.81158262756671 112.21057896468118
middle range flights:  129.43325800665838 138.58324360951266
long range flights:  151.29270701762957 146.18105349529333


In [395]:
X_train_low.columns

Index(['arr_month', 'year', 'con_SCH', 'dep_hour', 'arr_hour', 'dep_month',
       'dep_day', 'arr_day', 'trav_dist', 'con_AT', 'con_AT', 'con_TN',
       'con_TN', 'c_week'],
      dtype='object')

## principal feasability

In [396]:
lin_reg_lowpr = LinearRegression()
lin_reg_middlepr = LinearRegression()
lin_reg_longpr = LinearRegression()

lin_reg_lowpr.fit(low_range.loc[:, low_range.columns != 'target'], low_range.target)
lin_reg_middlepr.fit(middle_range.loc[:, middle_range.columns != 'target'], middle_range.target)
lin_reg_longpr.fit(long_range.loc[:, long_range.columns != 'target'], long_range.target)

LinearRegression()

In [397]:
low_pred = lin_reg_lowpr.predict(low_range.loc[:, low_range.columns != 'target'])
middle_pred = lin_reg_middlepr.predict(middle_range.loc[:, middle_range.columns != 'target'])
long_pred = lin_reg_longpr.predict(long_range.loc[:, long_range.columns != 'target'])

In [398]:
print(rmse(low_pred, low_range.target.to_numpy()), rmse(middle_pred, middle_range.target.to_numpy()), rmse(long_pred, long_range.target.to_numpy()))

103.0463957898299 131.7729141951683 149.96880163140312


In [399]:
pred = []
lowc=0
middlec=0
longc=0
for i in range(df_new.shape[0]):
    rangei = df_new.iloc[i].trav_dist
    if rangei < 1500:
        pred.append(low_pred[lowc])
        lowc+=1
    if rangei <= 3500 and rangei>=1500:
        pred.append(middle_pred[middlec])
        middlec+=1
    if rangei > 3500:
        pred.append(low_pred[longc])
        longc+=1

In [400]:
rmse(pred, df_new.target)

113.93900274680499

In [401]:
lrwhole = LinearRegression()
lrwhole.fit(df_new.loc[:, df_new.columns != 'target'], df_new.target)
wholepred = lrwhole.predict(df_new.loc[:, df_new.columns != 'target'])
rmse(wholepred, df_new.target.to_numpy())

114.06848387817769

## generalization to other columns

In [402]:
def recover_pred(elements, el_preds, dframe, fname):
    pred = []
    n = len(el_preds)
    counter_mat = np.zeros((n), dtype=int)

    for i in range(dframe.shape[0]):

        cat = dframe[fname].iloc[i]

        el_ind = list(elements).index(cat)
        pred.append(el_preds[el_ind][counter_mat[el_ind]])
        counter_mat[el_ind]+=1

    return pred

In [403]:
dataframe = df_new

In [404]:
def critmask(x):
    if x > 3.*60:
        return 1
    else:
        return 0
dataframe['critical'] = dataframe.target.apply(lambda x: critmask(x))

In [405]:
rel_params = ['arr_month', 'year', 'con_SCH', 'dep_hour', 'arr_hour', 'dep_month', 'dep_day', 'dur_cat', 'arr_day', 'trav_dist', 'con_AT', 'con_TN', 'target', 'c_week']

In [406]:
df_new.head()

Unnamed: 0,arr_month,year,con_SCH,dep_hour,arr_hour,dep_month,dep_day,arr_day,trav_dist,con_AT,con_AT.1,con_TN,con_TN.1,c_week,target,critical
0,1,2016,0,10,12,1,3,3,1666.949101,0,0,0,1,53,260.0,1
1,1,2016,0,15,16,1,13,13,983.064595,0,0,0,1,2,20.0,0
2,1,2016,0,4,6,1,16,16,1673.053088,0,0,1,0,2,0.0,0
3,1,2016,0,14,17,1,17,17,1805.107067,0,0,1,0,2,0.0,0
4,1,2016,0,14,15,1,17,17,626.108971,0,0,1,0,2,22.0,0


In [412]:
## first get the unique elements in the list
fname = 'dep_month'

elements = dataframe[fname].unique()
n_uniq = len(elements)

In [413]:
weights = []
el_predictions = []

for element in elements:

    el_frame = dataframe[dataframe[fname]==element]
    X = el_frame.drop('target', axis=1)
    y = el_frame.target

    linreg = LinearRegression()
    linreg.fit(X, y)
    el_predictions.append(linreg.predict(X))
    weights.append(linreg.coef_)

In [414]:
pred_new = recover_pred(elements, el_predictions, dataframe, fname)

In [415]:
min(pred_new)

-64.01998512202772

In [416]:
print(rmse(pred_new, dataframe.target.to_numpy()), rmse(np.abs(np.array(pred_new)/1.0), dataframe.target.to_numpy()))

77.8607842394547 77.85913344626469
