# CZĘŚĆ 2

## OPIS ZESTAWU DANYCH
Dane składają się z informacji o przylotach i odlotach wszystkich lotów komercyjnych w USA od października 1987 do kwietnia 2008 – przede wszystkim o ich opóźnieniach. \
Zbiór danych jest bardzo duży (120mln rekordów, 12GB danych) – na potrzeby projektu wykorzystamy jedynie dane z roku 2007 co ograniczy rozmiar przetwarzanych danych.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
import zipfile

zf = zipfile.ZipFile("./data_mow.zip")
namelist = zf.namelist()[1:]
dfs = [pd.read_csv(zf.open(f)) for f in namelist]
data = pd.concat(dfs, ignore_index=True)
data = data[data["Year"] == 2007]
with pd.option_context('display.float_format', '{:.2f}'.format, 'display.max_rows', None, 'display.max_columns', None):
    display(data.head())

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2007,1,1,1,1232.0,1225,1341.0,1340,WN,2891,N351,69.0,75.0,54.0,1.0,7.0,SMF,ONT,389,4,11,0,,0,0,0,0,0,0
1,2007,1,1,1,1918.0,1905,2043.0,2035,WN,462,N370,85.0,90.0,74.0,8.0,13.0,SMF,PDX,479,5,6,0,,0,0,0,0,0,0
2,2007,1,1,1,2206.0,2130,2334.0,2300,WN,1229,N685,88.0,90.0,73.0,34.0,36.0,SMF,PDX,479,6,9,0,,0,3,0,0,0,31
3,2007,1,1,1,1230.0,1200,1356.0,1330,WN,1355,N364,86.0,90.0,75.0,26.0,30.0,SMF,PDX,479,3,8,0,,0,23,0,0,0,3
4,2007,1,1,1,831.0,830,957.0,1000,WN,2278,N480,86.0,90.0,74.0,-3.0,1.0,SMF,PDX,479,3,9,0,,0,0,0,0,0,0


In [3]:
data.drop(
    ['UniqueCarrier', 'TailNum', 'Origin', 'Dest', 'CancellationCode', 'FlightNum', 'TaxiIn', 'TaxiOut', 'DepTime',
     'ArrTime', 'ActualElapsedTime', 'AirTime', 'DepDelay', 'ArrDelay'], inplace=True, axis=1)

In [4]:
for column in ['CRSDepTime', 'CRSArrTime']:
    data[column] = data[column] // 100 + (data[column] % 100) / 60

In [5]:
with pd.option_context('display.float_format', '{:.2f}'.format, 'display.max_rows', None, 'display.max_columns', None):
    display(data.describe())

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7452221.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0
mean,2007.0,6.51,15.73,3.93,13.48,15.15,127.25,719.81,0.02,0.0,3.87,0.77,3.78,0.02,5.1
std,0.0,3.43,8.78,1.99,4.66,4.82,70.36,562.31,0.15,0.05,20.84,9.62,16.18,1.08,21.28
min,2007.0,1.0,1.0,1.0,0.0,0.0,-1240.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2007.0,4.0,8.0,2.0,9.5,11.25,77.0,319.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2007.0,7.0,16.0,4.0,13.37,15.33,110.0,569.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2007.0,9.0,23.0,6.0,17.33,19.1,156.0,946.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2007.0,12.0,31.0,7.0,23.98,24.0,1430.0,4962.0,1.0,1.0,2580.0,1429.0,1386.0,382.0,1031.0


In [6]:
data["Year"] = data["Year"].astype("int")

## PROBLEMY Z DANYMI - DANE BRAKUJĄCE, NIEPRAWIDŁOWE, ODSTAJĄCE

### SPRAWDZENIE POPRAWNOŚCI TYPÓW DANYCH

In [7]:
# data = pd.read_parquet("./data_mow_2007.parquet")

In [8]:
data.dtypes

Year                   int32
Month                  int64
DayofMonth             int64
DayOfWeek              int64
CRSDepTime           float64
CRSArrTime           float64
CRSElapsedTime       float64
Distance               int64
Cancelled              int64
Diverted               int64
CarrierDelay           int64
WeatherDelay           int64
NASDelay               int64
SecurityDelay          int64
LateAircraftDelay      int64
dtype: object

Jak widać wyżej - wszystkie dane występują w poprawnym formacie.

### NAPRAWA WIERSZY Z PUSTYMI DANYMI

In [9]:
data["CRSElapsedTime"] = data["CRSElapsedTime"].map(lambda v: np.NAN if v < 0 else v)

In [10]:
data_with_nulls = data.copy()
np.random.seed(42)
data_with_nulls["Distance"] = data_with_nulls["Distance"].mask(np.random.random(data_with_nulls.shape[0]) < 0.1)
data_with_nulls["WeatherDelay"] = data_with_nulls["WeatherDelay"].mask(np.random.random(data_with_nulls.shape[0]) < 0.1)

In [11]:
np.sum(data_with_nulls.isna())

Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
CRSDepTime                0
CRSArrTime                0
CRSElapsedTime          996
Distance             746296
Cancelled                 0
Diverted                  0
CarrierDelay              0
WeatherDelay         743989
NASDelay                  0
SecurityDelay             0
LateAircraftDelay         0
dtype: int64

##### PROSTA METODA - WYPEŁNIANIE 0

In [12]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["CRSElapsedTime"].describe())

count   7452219.00
mean        127.25
std          70.36
min           1.00
25%          77.00
50%         110.00
75%         156.00
max        1430.00
Name: CRSElapsedTime, dtype: float64

In [13]:
data_with_nulls["CRSElapsedTime"].fillna(0, inplace=True)

In [14]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["CRSElapsedTime"].describe())

count   7453215.00
mean        127.23
std          70.37
min           0.00
25%          77.00
50%         110.00
75%         156.00
max        1430.00
Name: CRSElapsedTime, dtype: float64

##### ROZKŁAD ZMIENNEJ

In [15]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data["Distance"].describe())

count   7453215.00
mean        719.81
std         562.31
min          11.00
25%         319.00
50%         569.00
75%         946.00
max        4962.00
Name: Distance, dtype: float64

In [16]:
from scipy.stats import gaussian_kde

# estimate density
density = gaussian_kde(data["Distance"])

# use density to imput missing values
data_with_nulls["Distance"].fillna(density.resample(1)[0][0], inplace=True)

In [17]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["Distance"].describe())

count   7453215.00
mean        893.01
std         744.25
min          11.00
25%         337.00
50%         622.00
75%        1127.00
max        4962.00
Name: Distance, dtype: float64

##### Regresja

In [18]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["WeatherDelay"].describe())

count   6709226.00
mean          0.77
std           9.57
min           0.00
25%           0.00
50%           0.00
75%           0.00
max        1429.00
Name: WeatherDelay, dtype: float64

In [19]:
# build regression model for WeatherDelay
from sklearn.linear_model import LinearRegression

columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
           'CRSElapsedTime', 'Distance', 'Cancelled', 'Diverted', 'CarrierDelay',
           'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

model = LinearRegression()
model = model.fit(data_with_nulls.dropna()[columns], data_with_nulls["WeatherDelay"].dropna())

In [20]:
# impute missing values using regression model
# for each row with missing value in WeatherDelay
data_with_nulls.loc[data_with_nulls["WeatherDelay"].isna(), "WeatherDelay"] = model.predict(
    data_with_nulls[data_with_nulls["WeatherDelay"].isna()][columns])

In [21]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["WeatherDelay"].describe())

count   7453215.00
mean          0.77
std           9.08
min          -6.00
25%           0.00
50%           0.00
75%           0.00
max        1429.00
Name: WeatherDelay, dtype: float64

In [22]:
# if value less than 0, set to 0
data_with_nulls["WeatherDelay"] = data_with_nulls["WeatherDelay"].map(lambda v: 0 if v < 0 else v)

In [23]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(data_with_nulls["WeatherDelay"].describe())

count   7453215.00
mean          0.77
std           9.08
min           0.00
25%           0.00
50%           0.00
75%           0.00
max        1429.00
Name: WeatherDelay, dtype: float64

#### SKALOWANIE DANYCH

In [24]:
data_to_scale = data_with_nulls.copy()

data_minmax = data_to_scale.copy()
columns = [column for column in data_minmax.columns if column != 'Cancelled']
for column in columns:
    data_minmax[column] = (data_minmax[column] - data_minmax[column].min()) / (
                data_minmax[column].max() - data_minmax[column].min())

In [25]:
with pd.option_context('display.float_format', '{:.2f}'.format, 'display.max_rows', None, 'display.max_columns', None):
    display(data_minmax.describe())

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,0.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0
mean,,0.5,0.49,0.49,0.56,0.63,0.09,0.18,0.02,0.0,0.0,0.0,0.0,0.0,0.0
std,,0.31,0.29,0.33,0.19,0.2,0.05,0.15,0.15,0.05,0.01,0.01,0.01,0.0,0.02
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.27,0.23,0.17,0.4,0.47,0.05,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,0.55,0.5,0.5,0.56,0.64,0.08,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,0.73,0.73,0.83,0.72,0.8,0.11,0.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
data_standard = data_to_scale.copy()
columns = [column for column in data_minmax.columns if column != 'Cancelled']
for column in columns:
    data_standard[column] = (data_standard[column] - data_standard[column].mean()) / data_standard[column].std()

In [27]:
with pd.option_context('display.float_format', '{:.2f}'.format, 'display.max_rows', None, 'display.max_columns', None):
    display(data_standard.describe())

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,0.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0,7453215.0
mean,,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.02,0.0,-0.0,0.0,-0.0,-0.0,0.0
std,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.15,1.0,1.0,1.0,1.0,1.0,1.0
min,,-1.61,-1.68,-1.47,-2.89,-3.14,-1.81,-1.19,0.0,-0.05,-0.19,-0.08,-0.23,-0.02,-0.24
25%,,-0.73,-0.88,-0.97,-0.85,-0.81,-0.71,-0.75,0.0,-0.05,-0.19,-0.08,-0.23,-0.02,-0.24
50%,,0.14,0.03,0.03,-0.02,0.04,-0.24,-0.36,0.0,-0.05,-0.19,-0.08,-0.23,-0.02,-0.24
75%,,0.73,0.83,1.04,0.83,0.82,0.41,0.31,0.0,-0.05,-0.19,-0.08,-0.23,-0.02,-0.24
max,,1.6,1.74,1.54,2.25,1.84,18.51,5.47,1.0,20.81,123.6,157.32,85.44,352.05,48.22


In [28]:
# data_standard.to_parquet("./data_mow_2007_standard.parquet")

In [29]:
# data_minmax.to_parquet("./data_mow_2007_minmax.parquet")

### PODZIAŁ DANYCH

In [30]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

columns =['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
       'CRSElapsedTime', 'Distance', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled']

def train_test_split(X, Y, test_size):
    train_pct_index = int((1 - test_size) * len(X))
    train_pct_indexY = int((1 - 0.20) * len(Y))

    trainDF_x = pd.DataFrame(X[:train_pct_index], columns = columns)
    trainDF_y = pd.DataFrame(Y[:train_pct_indexY], columns = columns)
    trainDF = trainDF_x.append(trainDF_y)
    trainDF = trainDF.sample(frac=1, random_state=42)
    trainDF = trainDF.to_numpy()

    testDF_x = pd.DataFrame(X[train_pct_index:], columns = columns)
    testDF_y = pd.DataFrame(Y[train_pct_indexY:], columns = columns)
    testDF = testDF_x.append(testDF_y)
    testDF = testDF.sample(frac=1, random_state=42)
    testDF = testDF.to_numpy()

    return trainDF[:, :-1], testDF[:, :-1], trainDF[:, -1:], testDF[:, -1:]



def splitOfData(dataInput, testSize):
    cancelled = dataInput['Cancelled']==1
    dataCancelled = dataInput[cancelled]
    dataNotCancelled = dataInput[~cancelled]
    return train_test_split(dataNotCancelled, dataCancelled, test_size=testSize)

data_for_training_standard = data_standard[['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime','CRSElapsedTime', 'Distance', 'Diverted', 'CarrierDelay','WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled']]
data_for_training_minmax = data_minmax[['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime','CRSElapsedTime', 'Distance', 'Diverted', 'CarrierDelay','WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled']]

x_train_standard, x_test_standard, y_train_standard, y_test_standard = splitOfData(data_for_training_standard, 0.2)
x_train_minmax, x_test_minmax, y_train_minmax, y_test_minmax = splitOfData(data_for_training_minmax, 0.2)

data_sets = [("standarized", x_train_standard, y_train_standard, x_test_standard, y_test_standard), ("min-max", x_train_minmax, y_train_minmax, x_test_minmax, y_test_minmax)]
print("")
print("x_train_standard: ", x_train_standard.shape)
print("x_test_standard: ", x_test_standard.shape)
print("y_train_standard: ", y_train_standard.shape)
print("y_test_standard: ", y_test_standard.shape)

print("")
print("x_train_minmax.shape: ", x_train_minmax.shape)
print("x_test_minmax.shape: ", x_test_minmax.shape)
print("y_train_minmax.shape: ", y_train_minmax.shape)
print("y_test_minmax.shape: ", y_test_minmax.shape)



x_train_standard:  (5962571, 13)
x_test_standard:  (1490644, 13)
y_train_standard:  (5962571, 1)
y_test_standard:  (1490644, 1)

x_train_minmax.shape:  (5962571, 13)
x_test_minmax.shape:  (1490644, 13)
y_train_minmax.shape:  (5962571, 1)
y_test_minmax.shape:  (1490644, 1)


### MIARY SKUTECZNOŚCI

In [31]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def effectiveness_measures_classifier(y_test, predictions):
    acc=accuracy_score(y_test, predictions)
    print("Prediction Accuracy: ", acc)

    f1=f1_score(y_test, predictions)
    print("F1: ", f1)

    precision=precision_score(y_test, predictions)
    print("Precision: ", precision)

    recall=recall_score(y_test, predictions)
    print("Recall: ", recall)

    return acc, f1, precision, recall


### DECISION TREE CLASSIFIER

In [32]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier

eff_dtc = []

for name, x_train, y_train, x_test, y_test in data_sets:
    dtc = DecisionTreeClassifier(max_depth=10, class_weight="balanced").fit(x_train, y_train)
    pred = dtc.predict(x_test)
    print(f"Decision Tree Classifier ({name} data):")
    # ConfusionMatrixDisplay.from_predictions(y_test,pred, normalize='true')
    eff_dtc.append(effectiveness_measures_classifier(y_test, pred))
    print("")

Decision Tree Classifier (standarized data):
Prediction Accuracy:  0.8353979890570787
F1:  0.07156884632412204
Precision:  0.04074062905219988
Recall:  0.2941524105754277

Decision Tree Classifier (min-max data):
Prediction Accuracy:  0.8298507222381736
F1:  0.07082933405625609
Precision:  0.040142681549398714
Recall:  0.30068429237947125



### GAUSSIAN NB

In [33]:
from sklearn.naive_bayes import GaussianNB
from numpy import ravel

eff_gnb = []

for name, x_train, y_train, x_test, y_test in data_sets:
    y_train_ravel=ravel(y_train)
    gnb = GaussianNB().fit(x_train, y_train_ravel)
    pred = gnb.predict(x_test)
    print(f"Gaussian Naive Bayes ({name} data):")
    # ConfusionMatrixDisplay.from_predictions(y_test,pred, normalize='true')
    eff_gnb.append(effectiveness_measures_classifier(y_test, pred))
    print("")

Gaussian Naive Bayes (standarized data):
Prediction Accuracy:  0.2995805839623679
F1:  0.05800940478467518
Precision:  0.02987116041114194
Recall:  0.9999377916018662

Gaussian Naive Bayes (min-max data):
Prediction Accuracy:  0.30754023093374405
F1:  0.05863710856383043
Precision:  0.03020415142663599
Recall:  0.9999377916018662



### LOGISTIC REGRESSION

In [34]:
from sklearn.linear_model import LogisticRegression
from numpy import ravel

eff_lr = []

for name, x_train, y_train, x_test, y_test in data_sets:
    y_train_ravel=ravel(y_train)
    lr = LogisticRegression(random_state=42, max_iter=5000, class_weight="balanced").fit(x_train, y_train_ravel)
    pred = lr.predict(x_test)
    print(f"Logistic Regression ({name} data):")
    # ConfusionMatrixDisplay.from_predictions(y_test,pred, normalize='true')
    eff_lr.append(effectiveness_measures_classifier(y_test, pred))
    print("")

### RANDOM FOREST

In [35]:
from sklearn.ensemble import RandomForestClassifier
from numpy import ravel

eff_rfc = []

for name, x_train, y_train, x_test, y_test in data_sets:
    y_train_ravel=ravel(y_train)
    rfc = RandomForestClassifier(max_depth=10, random_state=42, class_weight="balanced", n_estimators=12, n_jobs=-1).fit(x_train, y_train_ravel)
    pred = rfc.predict(x_test)
    print(f"Random Forest Classifier ({name} data):")
    # ConfusionMatrixDisplay.from_predictions(y_test,pred, normalize='true')
    eff_rfc.append(effectiveness_measures_classifier(y_test, pred))
    print("")

## WYNIKI

In [36]:
from prettytable import PrettyTable


def add_results(table, normalization_type, model, results):
    (acc, f1, prec, recall)= results
    table.add_row([normalization_type, model, acc, f1, prec, recall])

def create_summary():
    summaryTable = PrettyTable(['normalization type', 'model', 'Accuracy', 'F1', 'Precision', 'Recall'])
    add_results(summaryTable, "standard", "Decision Tree Classifier", eff_dtc[0])
    add_results(summaryTable, "min-max", "Decision Tree Classifier", eff_dtc[1])
    add_results(summaryTable, "standard", "Gaussian Naive Bayes", eff_gnb[0])
    add_results(summaryTable, "min-max", "Gaussian Naive Bayes", eff_gnb[1])
    add_results(summaryTable, "standard", "Logistic Regression", eff_lr[0])
    add_results(summaryTable, "min-max", "Logistic Regression", eff_lr[1])
    add_results(summaryTable, "standard", "Random Forest Classifier", eff_rfc[0])
    add_results(summaryTable, "min-max", "Random Forest Classifier", eff_rfc[1])
    print(summaryTable)


In [37]:
create_summary()

+--------------------+--------------------------+---------------------+---------------------+----------------------+---------------------+
| normalization type |          model           |       Accuracy      |          F1         |      Precision       |        Recall       |
+--------------------+--------------------------+---------------------+---------------------+----------------------+---------------------+
|      standard      | Decision Tree Classifier |  0.8353979890570787 | 0.07156884632412204 | 0.04074062905219988  |  0.2941524105754277 |
|      min-max       | Decision Tree Classifier |  0.8298507222381736 | 0.07082933405625609 | 0.040142681549398714 | 0.30068429237947125 |
|      standard      |   Gaussian Naive Bayes   |  0.2995805839623679 | 0.05800940478467518 | 0.02987116041114194  |  0.9999377916018662 |
|      min-max       |   Gaussian Naive Bayes   | 0.30754023093374405 | 0.05863710856383043 | 0.03020415142663599  |  0.9999377916018662 |
+--------------------+-----

### STRATIFIED CROSS VALIDATION

In [38]:
def cross_validation_split(dataInput):
    cancelled = dataInput['Cancelled']==1
    dataCancelled = dataInput[cancelled]
    dataNotCancelled = dataInput[~cancelled]
    DF_x = pd.DataFrame(dataCancelled[:], columns = columns)
    DF_y = pd.DataFrame(dataNotCancelled[:], columns = columns)
    finalDF = DF_x.append(DF_y)
    finalDF = finalDF.sample(frac=1, random_state=42)
    finalDF = finalDF.to_numpy()
    return finalDF[:, :-1], finalDF[:, -1:]


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
def mp(test, predicted):
    tn=0
    tp=0
    fn=0
    fp=0
    for i in range(len(test)):
        if test[i] == predicted[i]:
            if test[i] == 0:
                tn += 1
            else:
                tp += 1
        else:
            if test[i] == 0:
                fn += 1
            else:
                fp += 1
    return tp, tn, fn, fp

def f1(tp, tn, fn, fp):
    return 2*(2*tp/(2*tp+fp+fn))/2

In [43]:
def cross_validation(df, scaler, k, classifier):
    tp = 0
    tn = 0
    fn = 0
    fp = 0
    sum_f1_score = 0
    mod = len(df) % k
    if(mod != 0):
        df.drop(df.tail(mod).index,inplace=True)
    n = int(len(df) / k)
    resultSum = 0
    for i in range(k):
        test = df[i*n:(i+1)*n + 1]
        x_test, y_test = cross_validation_split(test)
        train = pd.concat([df[:i * n], df[(i+1) * n + 1:]])
        x_train, y_train = cross_validation_split(train)
        std = scaler.fit(x_train)
        x_train = std.transform(x_train)
        x_test = std.transform(x_test)
        if classifier!= DecisionTreeClassifier(max_depth=10, class_weight="balanced"):
            y_train= ravel(y_train)
        classifier.fit(x_train,y_train)
        predictions = classifier.predict(x_test)
        sum_f1_score += f1_score(y_test, predictions)
        tp1, tn1, fn1, fp1 = mp(y_test, predictions)
        tp += tp1
        tn += tn1
        fn += fn1
        fp += fp1
        resultSum += classifier.score(x_test, y_test)
    print("Classification F1 - own implementation: " +  str(f1(tp, tn, fn, fp)))
    print("Classification F1 - build function: " +  str(sum_f1_score/k)))
    return resultSum/k

In [43]:
from sklearn import preprocessing


def create_summary_cross_validation(n):
    summaryTable = PrettyTable(['scaler', 'model', 'K folds', 'cross validation score'])
    summaryTable.add_row(["standard", "Decision Tree Classifier", n,  cross_validation(data_with_nulls, preprocessing.StandardScaler(), n, DecisionTreeClassifier(max_depth=10, class_weight="balanced"))])
    summaryTable.add_row(["min-max", "Decision Tree Classifier", n,  cross_validation(data_with_nulls, preprocessing.MinMaxScaler(), n, DecisionTreeClassifier(max_depth=10, class_weight="balanced"))])
    summaryTable.add_row(["standard", "Gaussian Naive Bayes", n, cross_validation(data_with_nulls, preprocessing.StandardScaler(), n,  GaussianNB())])
    summaryTable.add_row(["min-max", "Gaussian Naive Bayes", n, cross_validation(data_with_nulls, preprocessing.MinMaxScaler(), n,  GaussianNB())])
    print(summaryTable)

In [None]:
create_summary_cross_validation(5)

In [44]:
 #just to verify if cross_validation works
 from sklearn import preprocessing
 cross_validation(data_with_nulls, preprocessing.StandardScaler(), 5,  DecisionTreeClassifier(max_depth=10, class_weight="balanced"))

0.7182504797264018