In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Arrival countries retained by the ARR_COUNTRY filter in preprocess_data.
countries = [
    'Finland', 'USA', 'Japan', 'Thailand', 'United Kingdom', 'Germany',
    'China', 'Hong Kong (SAR) China', 'Sweden', 'Singapore', 'Spain',
    'Korea Republic of', 'France', 'Denmark', 'India',
    'Russian Federation', 'Italy', 'Norway', 'Greece', 'Netherlands',
    'Estonia', 'Poland', 'Switzerland', 'Hungary', 'Austria',
    'Czech Republic', 'Portugal',
]

# Two-letter codes retained by the POINT_OF_SALE filter in preprocess_data
# (despite the name, the NATIONALITY column itself is dropped there).
nationalities = [
    'FI', 'SE', 'DE', 'JP', 'US', 'GB', 'CN', 'RU', 'KR', 'HK', 'NO',
    'AU', 'FR', 'EE', 'DK',
]

## Data preprocessing automation

In [None]:
def preprocess_data(df, nationalities, countries):
    """Build a model-ready feature frame and a label frame from raw rows.

    The caller's ``df`` is never modified: all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain every column referenced in the body
        (``SSR_CODE``, ``LDEPTIME_LOCAL``, ``GENDER2``, ``POINT_OF_SALE``,
        ``ARR_COUNTRY``, ``SERVICE_CLASS``, ``FLIGHT_DURATION``,
        ``ROUTE_TYPE``, ``TRANSFER_STATUS``, ``BOOKING_CLASS`` and the
        columns that are dropped).
    nationalities : list-like
        Values of ``POINT_OF_SALE`` to keep.
    countries : list-like
        Values of ``ARR_COUNTRY`` to keep.

    Returns
    -------
    (features, labels) : tuple of pandas.DataFrame
        ``features`` holds the encoded predictors; ``labels`` is a single
        int32 column ``target`` that is 1 where ``SSR_CODE`` was non-null.
        Both share the (filtered) index of ``df``.

    Notes
    -----
    The one-hot columns produced by ``pd.get_dummies`` depend on which
    categories occur in ``df``. Two separately preprocessed samples can
    therefore have different columns; align them (for example with
    ``DataFrame.reindex(columns=...)``) before using the same model on both.
    """
    # Label = "row has any SSR code"; computed before SSR_CODE is dropped.
    #     (alternative label: df['SSR_CODE'] == 'VGML')
    target = df['SSR_CODE'].notna()

    # Explicit copy so nothing below can write into the caller's frame.
    df = df.drop(columns=['DV_LEG_H_ID', 'DV_SEGMENT_H_ID', 'TICKET_TYPE', 'ALDES', 'DEPSTN',
                          'AIRCRAFT_REGISTRATION', 'AIRCRAFT_SUBTYPE', 'FLTDATE_LOCAL',
                          'NATIONALITY', 'SSR_CODE', 'SSR_SEAT_MEAL']).copy()
    df['target'] = target
    df['LDEPTIME_LOCAL'] = pd.to_datetime(df['LDEPTIME_LOCAL'])

    # All row filters are independent, so they are combined into one mask.
    keep = (
        df['GENDER2'].isin(['Male', 'Female'])
        & df['POINT_OF_SALE'].isin(nationalities)
        & df['ARR_COUNTRY'].isin(countries)
        & df['SERVICE_CLASS'].isin(['ECONOMY', 'BUSINESS'])
    )
    # .copy() gives an independent frame, avoiding SettingWithCopyWarning below.
    df = df.loc[keep].drop(columns=['POINT_OF_SALE', 'ARRSTN']).copy()

    # Binary encodings of two-valued categoricals.
    df['GENDER2'] = df['GENDER2'] == 'Male'
    df['SERVICE_CLASS'] = df['SERVICE_CLASS'] == 'BUSINESS'
    df['ROUTE_TYPE'] = df['ROUTE_TYPE'] == 'LH'

    # Presumably converts minutes to hours -- TODO confirm the source unit.
    df['FLIGHT_DURATION'] = df['FLIGHT_DURATION'] / 60

    departure = df['LDEPTIME_LOCAL'].dt
    # Departure hour within 06..18 inclusive.
    df['IS_DAYTIME'] = departure.hour.between(6, 18)
    # Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
    df['SEASON'] = (departure.month % 12) // 3

    df = df.drop(columns=['FLTNBR', 'LDEPTIME_LOCAL'])

    df = pd.get_dummies(df, columns=['ARR_COUNTRY', 'SEASON', 'TRANSFER_STATUS', 'BOOKING_CLASS'])

    labels = df[['target']].astype('int32')
    df = df.drop(columns=['target'])

    return df, labels

In [None]:
train_data = pd.read_csv('ALL1.csv').sample(100000)
train_preprocessed_data, train_labels = preprocess_data(train_data, nationalities, countries)
# train_preprocessed_data = train_preprocessed_data[['FLIGHT_DURATION', 'ROUTE_TYPE', 'IS_FPLUS', 'GENDER2', 'SERVICE_CLASS']]

In [None]:
test_data = pd.read_csv('ALL1.csv').sample(10000)
test_preprocessed_data, test_labels = preprocess_data(test_data, nationalities, countries)
# test_preprocessed_data = test_preprocessed_data[['FLIGHT_DURATION', 'ROUTE_TYPE', 'IS_FPLUS', 'GENDER2', 'SERVICE_CLASS']]

## Model 1 training

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

model_1 = DecisionTreeClassifier()

In [None]:
model_1.fit(train_preprocessed_data.values, train_labels.values)

In [None]:
y_test =  np.squeeze(train_labels.values)
threshold = 0.05

y_pred = np.array(model_1.predict_proba(train_preprocessed_data.values)[:, 1])
y_pred  = y_pred > threshold
y_pred = y_pred.astype(int)  

print(f"XGB accuracy on the test set : {round(np.sum(y_pred == y_test) / len(y_pred), 4)}")

In [None]:
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

print ('Accuracy:', accuracy_score(y_test, y_pred))
print ('F1 score:', f1_score(y_test, y_pred))
print ('Recall:', recall_score(y_test, y_pred))
print ('Precision:', precision_score(y_test, y_pred))
print ('\n clasification report:\n', classification_report(y_test,y_pred))
print ('\n confussion matrix:\n',confusion_matrix(y_test, y_pred))

In [None]:
# Label the importances with the columns model_1 was actually fitted on.
# (The test frame can have a different one-hot layout, which would either
# raise a length mismatch or silently mislabel the rows.)
feature_importances = pd.DataFrame(
    model_1.feature_importances_,
    index=train_preprocessed_data.columns,
    columns=['importances'],
).sort_values('importances', ascending=False)
display(feature_importances.head(7))

## Model 2 training

In [None]:
index = [bool(p) for p in y_pred]
train2_preprocessed_data, train2_labels = train_preprocessed_data[index], train_labels[index]

In [None]:
model_2 = DecisionTreeClassifier()

In [None]:
model_2.fit(train2_preprocessed_data.values, train2_labels.values)

In [None]:
y_test =  np.squeeze(train2_labels.values)
threshold_2 = 0.4

y_pred = np.array(model_2.predict_proba(train2_preprocessed_data.values)[:, 1])
y_pred  = y_pred > threshold_2
y_pred = y_pred.astype(int)  

print(f"XGB accuracy on the test set : {round(np.sum(y_pred == y_test) / len(y_pred), 4)}")

In [None]:
print ('Accuracy:', accuracy_score(y_test, y_pred))
print ('F1 score:', f1_score(y_test, y_pred))
print ('Recall:', recall_score(y_test, y_pred))
print ('Precision:', precision_score(y_test, y_pred))
print ('\n clasification report:\n', classification_report(y_test,y_pred))
print ('\n confussion matrix:\n',confusion_matrix(y_test, y_pred))

In [None]:
# One row per stage-2 training column, ranked by the tree's importance score.
feature_importances = pd.DataFrame(
    {'importances': model_2.feature_importances_},
    index=train2_preprocessed_data.columns,
)
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
display(feature_importances.head(7))

## Model evaluation

In [None]:
# Validation rows must not include rows the models were trained on, so the
# training sample's index labels are removed from the pool before sampling.
val_pool = pd.read_csv('ALL1.csv').drop(index=train_data.index, errors='ignore')
val_data = val_pool.sample(min(10000, len(val_pool)), random_state=42)
val_preprocessed_data, val_labels = preprocess_data(val_data, nationalities, countries)
# The one-hot layout depends on the sample; align to the columns the models
# were fitted on (categories absent from this sample become 0).
val_preprocessed_data = val_preprocessed_data.reindex(columns=train_preprocessed_data.columns, fill_value=0)

In [None]:
y_pred = np.array(model_1.predict_proba(val_preprocessed_data.values)[:, 1])
y_pred = y_pred > threshold
y_pred = y_pred.astype(int)  

In [None]:
index = [bool(p) for p in y_pred]
val2_preprocessed_data, val2_labels = val_preprocessed_data[index], val_labels[index]

In [None]:
y_pred_2 = np.array(model_2.predict_proba(val2_preprocessed_data.values)[:, 1])
y_pred_2 = y_pred_2 > threshold_2
y_pred_2 = y_pred_2.astype(int)  

In [None]:
pred_labels = y_pred
pred_labels[pred_labels == 1] = y_pred_2

In [None]:
y_test = np.squeeze(val_labels.values)

In [None]:
print ('Accuracy:', accuracy_score(y_test, y_pred))
print ('F1 score:', f1_score(y_test, y_pred))
print ('Recall:', recall_score(y_test, y_pred))
print ('Precision:', precision_score(y_test, y_pred))
print ('\n clasification report:\n', classification_report(y_test,y_pred))
print ('\n confussion matrix:\n',confusion_matrix(y_test, y_pred))

## ------------------------
## Outlier Detection

In [None]:
# NOTE: this rebinds train_data / train_preprocessed_data / train_labels from
# the classifier sections above; cells from those sections must not be re-run
# after this point without re-running their own data-loading cells first.
# Fixed seed so the outlier experiments are reproducible.
train_data = pd.read_csv('ALL1.csv').sample(50000, random_state=42)
train_preprocessed_data, train_labels = preprocess_data(train_data, nationalities, countries)
# The outlier detectors use only this small subset of the features.
train_preprocessed_data = train_preprocessed_data[['FLIGHT_DURATION', 'ROUTE_TYPE', 'IS_FPLUS', 'GENDER2', 'SERVICE_CLASS']]

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import confusion_matrix

# Expected outlier share = share of positive labels in the training sample.
# Taken from the 'target' column as a plain scalar; converting a one-element
# Series with float() is deprecated in recent pandas.
# IsolationForest requires this value to lie in (0, 0.5].
positive_rate = float(train_labels['target'].mean())
clf = IsolationForest(contamination=positive_rate, random_state=42).fit(train_preprocessed_data)

In [None]:
# Sample the test rows from rows that were not used to fit the detectors.
test_pool = pd.read_csv('ALL1.csv').drop(index=train_data.index, errors='ignore')
test_data = test_pool.sample(min(10000, len(test_pool)), random_state=42)
test_preprocessed_data, test_labels = preprocess_data(test_data, nationalities, countries)
# Same feature subset the detectors were fitted on.
test_preprocessed_data = test_preprocessed_data[['FLIGHT_DURATION', 'ROUTE_TYPE', 'IS_FPLUS', 'GENDER2', 'SERVICE_CLASS']]

In [None]:
y_pred = clf.predict(test_preprocessed_data)

In [None]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

print ('Confussion matrix:\n', confusion_matrix(test_labels, y_pred))

In [None]:
lof = LocalOutlierFactor(novelty=True)
lof.fit(train_preprocessed_data)

In [None]:
y_pred = lof.predict(test_preprocessed_data)

In [None]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

print ('Confussion matrix:\n', confusion_matrix(test_labels, y_pred))