In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the three raw CSV extracts; each one is labelled as a class further below.
df, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in ('data VGML.csv', 'data not VGML.csv', 'data NULL.csv')
)

In [None]:
# Down-sample each extract with a fixed seed: 20000 rows from the first frame and
# 10000 from each of the other two, so the two target classes end up equal in size.
df, df2, df3 = (
    frame.sample(n_rows, random_state=111)
    for frame, n_rows in ((df, 20000), (df2, 10000), (df3, 10000))
)

In [None]:
# Attach the binary label: rows from the first extract are positives (1),
# rows from the other two extracts are negatives (0).
for frame, label in ((df, 1), (df2, 0), (df3, 0)):
    frame['target'] = label

In [None]:
# Stack the three labelled samples into one frame with a fresh 0..n-1 index and
# parse the local departure timestamp so date parts can be extracted later.
data = pd.concat([df, df2, df3], ignore_index=True).assign(
    LDEPTIME_LOCAL=lambda frame: pd.to_datetime(frame['LDEPTIME_LOCAL'])
)

In [None]:
# Inspect the column names of the combined frame.
data.columns

In [None]:
# Columns removed before modelling. Note that SSR_CODE is dropped here as well;
# preprocess_data() further below derives the label from that same column.
unused_columns = [
    'DV_LEG_H_ID', 'DV_SEGMENT_H_ID', 'DEPSTN', 'ALDES',
    'AIRCRAFT_REGISTRATION', 'AIRCRAFT_SUBTYPE', 'FLTDATE_LOCAL',
    'NATIONALITY', 'SSR_CODE', 'SSR_SEAT_MEAL',
]
data = data.drop(columns=unused_columns)

In [None]:
# Display the frame after the column drop (Jupyter shows a truncated preview).
data

### Gender filtering

In [None]:
# Keep only rows with a known gender. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
data = data[data.GENDER2.isin(['Male', 'Female'])].copy()

# Encode gender as a boolean flag: True for 'Male', False for 'Female'.
# NOTE: this cell is not idempotent - re-running it after the encoding filters
# out every row, because the column no longer contains the strings.
data['GENDER2'] = data['GENDER2'] == 'Male'

In [None]:
# Row counts per gender flag (True = 'Male'), split by target class.
sns.countplot(data=data, x='GENDER2', hue='target')

### POINT_OF_SALE filtering (values with more than 100 rows)

In [None]:
# Number of rows (non-null FLTNBR) per point of sale, most frequent first.
nationality_table = (
    data.groupby('POINT_OF_SALE')[['FLTNBR']]
    .count()
    .sort_values('FLTNBR', ascending=False)
)

# Keep only points of sale that occur more than 100 times; the resulting array is
# reused later as the whitelist passed to preprocess_data().
frequent_pos = nationality_table['FLTNBR'] > 100
nationalities = nationality_table.loc[frequent_pos].index.values
data = data.loc[data['POINT_OF_SALE'].isin(nationalities)]

In [None]:
# Row counts per remaining point of sale, split by target class.
sns.countplot(data=data, x='POINT_OF_SALE', hue='target')

### Filtering ARR_COUNTRY

In [None]:
# Number of rows (non-null FLTNBR) per arrival country, most frequent first.
country_table = (
    data.groupby('ARR_COUNTRY')[['FLTNBR']]
    .count()
    .sort_values('FLTNBR', ascending=False)
)

# Keep only arrival countries that occur more than 40 times; the resulting array
# is reused later as the whitelist passed to preprocess_data().
countries = country_table.loc[country_table['FLTNBR'] > 40].index.values
keep_country = data['ARR_COUNTRY'].isin(countries)

# The arrival station column is dropped once the country filter has been applied.
data = data.loc[keep_country].drop(columns=['ARRSTN'])

In [None]:
# Display the frame after the point-of-sale and arrival-country filters.
data

### SERVICE_CLASS Filtering

In [None]:
# Keep only economy and business rows. The explicit .copy() detaches the result
# from the frame it was sliced from, so this cell's assignment and the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data[data.SERVICE_CLASS.isin(['ECONOMY', 'BUSINESS'])].copy()

# Encode the service class as a boolean flag: True for 'BUSINESS'.
# NOTE: not idempotent - re-running after the encoding filters out every row.
data['SERVICE_CLASS'] = data['SERVICE_CLASS'] == 'BUSINESS'

In [None]:
# Row counts per service-class flag (True = 'BUSINESS'), split by target class.
sns.countplot(data=data, x='SERVICE_CLASS', hue='target')

### FLIGHT_DURATION Filtering

In [None]:
# Rescale the duration by 1/60 (presumably minutes -> hours; TODO confirm the
# source unit). NOTE: re-running this cell divides the column again.
data['FLIGHT_DURATION'] = data['FLIGHT_DURATION'].div(60)

# Histogram of the rescaled durations.
data['FLIGHT_DURATION'].hist()

### ROUTE_TYPE Filtering

In [None]:
# Encode the route type as a boolean flag: True where the value is 'LH'
# (presumably "long haul" - TODO confirm), False for everything else.
data['ROUTE_TYPE'] = data['ROUTE_TYPE'].eq('LH')

# Row counts per route-type flag, split by target class.
sns.countplot(data=data, x='ROUTE_TYPE', hue='target')

### LDEPTIME_LOCAL Filtering

In [None]:
# Derive time features with vectorized datetime accessors instead of a Python
# lambda per row; the resulting values are the same.
departure = data['LDEPTIME_LOCAL']

# True when the local departure hour is between 06 and 18 inclusive.
data['IS_DAYTIME'] = departure.dt.hour.between(6, 18)

# Quarter-like season index from the month:
# Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
data['SEASON'] = (departure.dt.month % 12) // 3

In [None]:
# Row counts per season index (0-3), split by target class.
sns.countplot(data=data, x='SEASON', hue='target')

In [None]:
# The flight number and the raw timestamp are no longer needed once the counts
# and the time features have been derived from them.
data = data.drop(['FLTNBR', 'LDEPTIME_LOCAL'], axis=1)

In [None]:
# Display the fully filtered feature frame (still including the 'target' column).
data

### Categorical features encoding

In [None]:
# Separate the label from the features. `labels` stays a one-column DataFrame.
# NOTE: re-running this cell fails because 'target' has already been removed.
labels = data.loc[:, ['target']]
data = data.drop('target', axis=1)

In [None]:
# One-hot encode the categorical columns; the remaining columns pass through
# unchanged. `train_data.columns` defines the feature layout the models are fitted on.
train_data = pd.get_dummies(data, columns=['POINT_OF_SALE', 'ARR_COUNTRY', 'SEASON'])

X = train_data.values
# `labels` is a one-column DataFrame, so .values alone is a column vector of shape
# (n, 1). scikit-learn expects a 1-D target and warns (DataConversionWarning)
# otherwise, hence the ravel().
y = labels.values.ravel()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=111)
X_train, X_test, y_train, y_test = split

# Report the shapes of all four pieces.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# The global NumPy seed is kept because later cells call DataFrame.sample()
# without a random_state and therefore draw from this global state.
np.random.seed(1111)

# An explicit random_state pins the forest itself, so the fitted model does not
# depend on how much of the global random stream was consumed beforehand.
model = RandomForestClassifier(random_state=1111)

# np.ravel() hands scikit-learn a 1-D target; a column vector of shape (n, 1)
# triggers a DataConversionWarning during fit.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Mean accuracy of the random forest on the held-out 20% split, as a percentage.
rf_accuracy_pct = model.score(X_test, y_test) * 100
print(f"Accuracy of RF {rf_accuracy_pct}%")

In [None]:
# Hard class predictions of the random forest for the held-out test split;
# reused by the metric cells below.
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Provide a thin stand-in with the same call shape so the plotting cells keep
    # working on current releases.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of ``estimator`` on ``(X, y_true)``.

        Forwards all keyword arguments (e.g. ``cmap``) to
        ``ConfusionMatrixDisplay.from_estimator`` and returns the display object.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Confusion matrix of the random forest on the test split
# (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
print(conf_mat)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Building the display from
# an explicit confusion matrix works on old and new releases alike.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, model.predict(X_test)),
    display_labels=model.classes_,
).plot(cmap=plt.cm.Blues)


In [None]:
# Impurity-based importances of the fitted forest, plus the feature positions
# ordered from most to least important.
importances = model.feature_importances_
indices = importances.argsort()[::-1]

In [None]:
print("Feature ranking:")

# One line per feature: rank, positional feature index, importance value.
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

In [None]:
# Same importances as above, but labelled with the feature names and sorted in
# descending order; show the 15 strongest features.
feature_importances = pd.DataFrame(
    {'importances': model.feature_importances_},
    index=train_data.columns,
)
feature_importances = feature_importances.sort_values('importances', ascending=False)
display(feature_importances.head(15))

In [None]:
from matplotlib import pyplot

In [None]:
# Bar chart of the importances in original feature order (x = feature position).
positions = list(range(len(importances)))
pyplot.bar(positions, importances)
pyplot.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Scalar metrics of the random forest on the held-out split, printed in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
):
    print(metric_label, metric_fn(y_test, predictions))

# Per-class report followed by the raw confusion matrix.
print('\n clasification report:\n', classification_report(y_test, predictions))
print('\n confussion matrix:\n', confusion_matrix(y_test, predictions))

### Data preprocessing automation

In [None]:
def preprocess_data(df, nationalities, countries, columns=None):
    """Turn a raw extract into a model-ready feature frame and a label frame.

    Applies the same steps as the exploratory cells above: derives the label from
    ``SSR_CODE``, drops unused columns, filters rows, encodes flags, derives the
    time features and one-hot encodes the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain the columns referenced below (``SSR_CODE``,
        ``LDEPTIME_LOCAL``, ``GENDER2``, ``POINT_OF_SALE``, ``ARR_COUNTRY``,
        ``ARRSTN``, ``SERVICE_CLASS``, ``FLIGHT_DURATION``, ``ROUTE_TYPE``,
        ``FLTNBR`` and the dropped identifier columns). The input is not modified.
    nationalities : list-like
        ``POINT_OF_SALE`` values to keep; all other rows are discarded.
    countries : list-like
        ``ARR_COUNTRY`` values to keep; all other rows are discarded.
    columns : list-like, optional
        Feature columns (and their order) the result must have, typically
        ``train_data.columns``. One-hot encoding only creates columns for the
        categories present in ``df``, so without this the output layout depends on
        the sample. Missing columns are added filled with 0, unexpected ones are
        dropped. When ``None`` (default) the columns are returned as produced.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The feature frame and a one-column ``target`` frame of dtype int32
        (1 where ``SSR_CODE == 'VGML'``, else 0), sharing the same index.
    """
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(target=df['SSR_CODE'] == 'VGML').drop(
        columns=['DV_LEG_H_ID', 'DV_SEGMENT_H_ID', 'ALDES', 'DEPSTN',
                 'AIRCRAFT_REGISTRATION', 'AIRCRAFT_SUBTYPE', 'FLTDATE_LOCAL',
                 'NATIONALITY', 'SSR_CODE', 'SSR_SEAT_MEAL']
    )
    df['LDEPTIME_LOCAL'] = pd.to_datetime(df['LDEPTIME_LOCAL'])

    # All row filters are evaluated on the raw string values, before any encoding.
    keep = (
        df.GENDER2.isin(['Male', 'Female'])
        & df.POINT_OF_SALE.isin(nationalities)
        & df.ARR_COUNTRY.isin(countries)
        & df.SERVICE_CLASS.isin(['ECONOMY', 'BUSINESS'])
    )
    # .copy() detaches the filtered rows so the assignments below do not raise
    # SettingWithCopyWarning.
    df = df[keep].drop(columns=['ARRSTN']).copy()

    # Boolean encodings of the two-valued columns.
    df['GENDER2'] = df['GENDER2'] == 'Male'
    df['SERVICE_CLASS'] = df['SERVICE_CLASS'] == 'BUSINESS'
    df['ROUTE_TYPE'] = df['ROUTE_TYPE'] == 'LH'

    # Same 1/60 rescaling as in the exploratory cells.
    df['FLIGHT_DURATION'] = df['FLIGHT_DURATION'] / 60

    # Vectorized time features: departure hour within 06..18 inclusive, and a
    # season index (Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3).
    departure = df['LDEPTIME_LOCAL']
    df['IS_DAYTIME'] = departure.dt.hour.between(6, 18)
    df['SEASON'] = (departure.dt.month % 12) // 3

    df = df.drop(columns=['FLTNBR', 'LDEPTIME_LOCAL'])
    df = pd.get_dummies(df, columns=['POINT_OF_SALE', 'ARR_COUNTRY', 'SEASON'])

    labels = df[['target']].astype('int32')
    df = df.drop(columns=['target'])

    if columns is not None:
        # Force the layout the model was trained on; absent dummies become 0.
        df = df.reindex(columns=columns, fill_value=0)

    return df, labels

In [None]:
# Draw the evaluation sample with an explicit seed so the reported score is
# reproducible no matter how much of the global NumPy random stream was used before.
test_data = pd.read_csv('ALL.csv').sample(15000, random_state=111)

# Apply the same preprocessing as for training, restricted to the points of sale
# and arrival countries that survived the training-time frequency filters.
test_preprocessed_data, test_labels = preprocess_data(test_data, nationalities, countries)

In [None]:
# Display the preprocessed evaluation features.
test_preprocessed_data

In [None]:
# Align the evaluation features with the training design matrix. get_dummies only
# creates columns for categories present in this sample, so without the reindex
# the column count or order can differ from what the model was fitted on.
# Dummies missing here are added as 0; columns unknown to the model are dropped.
X = test_preprocessed_data.reindex(columns=train_data.columns, fill_value=0).values

# Flatten the one-column label frame into the 1-D vector scikit-learn expects.
y = test_labels.values.ravel()

In [None]:
# Mean accuracy of the random forest on the ALL.csv evaluation sample, in percent.
rf_unseen_accuracy_pct = model.score(X, y) * 100
print(f"Accuracy of RF on unseen data: {rf_unseen_accuracy_pct}%")

In [None]:
# Training feature columns that the preprocessed evaluation frame does not contain
# (in training-column order); an empty list means nothing is missing.
eval_columns = set(test_preprocessed_data.columns.tolist())
[column for column in train_data.columns.tolist() if column not in eval_columns]

# -------------------------------

In [None]:
# Build a second evaluation set from the same three CSVs the training data came
# from. `df`, `df2` and `df3` still hold the training samples under their original
# CSV row labels, so dropping those labels first keeps training rows out of the
# "unseen data" score below.
vgml_pool = pd.read_csv('data VGML.csv').drop(index=df.index)
not_vgml_pool = pd.read_csv('data not VGML.csv').drop(index=df2.index)
null_pool = pd.read_csv('data NULL.csv').drop(index=df3.index)

# Explicit seeds make the sample reproducible. min() guards against a pool that is
# smaller than the requested size once the training rows have been removed.
test_df = vgml_pool.sample(min(10000, len(vgml_pool)), random_state=222)
test_df2 = not_vgml_pool.sample(min(5000, len(not_vgml_pool)), random_state=222)
test_df3 = null_pool.sample(min(5000, len(null_pool)), random_state=222)

In [None]:
# Combine the three evaluation samples and run them through the shared
# preprocessing; the label is derived from SSR_CODE inside preprocess_data().
evaluation_parts = (test_df, test_df2, test_df3)
test_data = pd.concat(evaluation_parts, ignore_index=True)
test_preprocessed_data, test_labels = preprocess_data(test_data, nationalities, countries)

In [None]:
# Align the evaluation features with the training design matrix. get_dummies only
# creates columns for categories present in this sample, so without the reindex
# the column count or order can differ from what the models were fitted on.
# Dummies missing here are added as 0; columns unknown to the models are dropped.
X = test_preprocessed_data.reindex(columns=train_data.columns, fill_value=0).values

# Flatten the one-column label frame into the 1-D vector scikit-learn expects.
y = test_labels.values.ravel()

In [None]:
# Mean accuracy of the random forest on the second evaluation set, in percent.
rf_unseen_accuracy_pct = model.score(X, y) * 100
print(f"Accuracy of RF on unseen data: {rf_unseen_accuracy_pct}%")

# -------------------------------

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Global seed kept as in the original cell, in case other code relies on it.
np.random.seed(1111)

# Shallow tree (depth <= 6) so it can be plotted and read. The explicit
# random_state makes tie-breaking between equally good splits repeatable without
# relying on the global NumPy seed.
model_dt = DecisionTreeClassifier(max_depth=6, random_state=1111)

# np.ravel() passes the 1-D target scikit-learn expects.
model_dt.fit(X_train, np.ravel(y_train))

In [None]:
# Mean accuracy of the decision tree on the current evaluation set (X, y), in percent.
dt_unseen_accuracy_pct = model_dt.score(X, y) * 100
print(f"Accuracy of DT on unseen data: {dt_unseen_accuracy_pct}%")

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Building the display from
# an explicit confusion matrix works on old and new releases alike.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, model_dt.predict(X_test)),
    display_labels=model_dt.classes_,
).plot(cmap=plt.cm.Blues)

In [None]:
# Importances of the decision tree, labelled with the feature names and sorted
# from most to least important. This overwrites the random-forest table above.
feature_importances = pd.DataFrame(
    {'importances': model_dt.feature_importances_},
    index=train_data.columns,
)
feature_importances = feature_importances.sort_values('importances', ascending=False)
print(feature_importances)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted decision tree. feature_names is passed as a plain list because
# some scikit-learn releases validate this argument strictly and reject a pandas
# Index; a list is accepted by every version.
plt.figure(figsize=(25, 10))
plot_tree(
    model_dt,
    feature_names=list(train_data.columns),
    class_names=True,   # label leaves with symbolic class names
    impurity=False,     # hide the impurity value in each node
    fontsize=10,
    filled=True,        # colour nodes by majority class
)
plt.show()

In [None]:
# Hard class predictions of the decision tree for the held-out test split;
# reused by the metric cell below.
prediction=model_dt.predict(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Scalar metrics of the decision tree on the held-out split, printed in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
):
    print(metric_label, metric_fn(y_test, prediction))

# Per-class report followed by the raw confusion matrix.
print('\n clasification report:\n', classification_report(y_test, prediction))
print('\n confussion matrix:\n', confusion_matrix(y_test, prediction))