In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import missingno as msno
from sklearn import metrics

from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedKFold

# Regression inputs; both CSVs are indexed by their "timestamp" column.
df_regression = pd.read_csv(
    "regression_df.csv",
    index_col="timestamp",
)
df_ln_regression = pd.read_csv(
    "ln_regression_df.csv",
    index_col="timestamp",
)

# Cross-validation splitter handed to the TPOT regression searches in this
# cell: 10 folds, repeated 3 times, with a fixed seed.
# NOTE(review): the hold-out split in the functions below is unshuffled, which
# suggests the rows may be time-ordered; confirm that fold-based CV is intended.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

def get_tpot_pipe_regression(test_size=0.70):
    """Run a TPOT regression search on ``df_regression`` and export the result.

    The last column of the module-level ``df_regression`` frame is the target
    and every preceding column is a feature. Rows are split without shuffling,
    so the trailing ``test_size`` share of the rows is held out. The search
    uses the module-level ``cv`` splitter. Hold-out metrics are printed and
    the winning pipeline is written to ``tpot_pipe_regression.py``.

    Parameters
    ----------
    test_size : float or int, default 0.70
        Forwarded unchanged to ``train_test_split``. The default keeps the
        original behaviour (70% of the rows held out, 30% used for fitting).

    Returns
    -------
    None
        Results are reported through ``print`` and the exported file.
    """
    X = df_regression.iloc[:, :-1]
    y = df_regression.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2,
                         random_state=42, cv=cv, n_jobs=-1, config_dict="TPOT light")
    tpot.fit(X_train, y_train)

    y_pred = tpot.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE instead of passing `squared=False`: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6. For the 1-D target
    # used here the two forms give the same value.
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)

    print("The model performance for testing set")
    print("--------------------------------------")
    print('MAE is {}'.format(mae))
    print('MSE is {}'.format(mse))
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))

    # Score as reported by TPOT itself on the held-out rows.
    print("TPOT:")
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_pipe_regression.py')

def get_tpot_pipe_ln_regression(test_size=0.70):
    """Run a TPOT regression search on ``df_ln_regression`` and export the result.

    The last column of the module-level ``df_ln_regression`` frame is the
    target and every preceding column is a feature. Rows are split without
    shuffling, so the trailing ``test_size`` share of the rows is held out.
    The search uses the module-level ``cv`` splitter. Hold-out metrics are
    printed and the winning pipeline is written to
    ``tpot_pipe_ln_regression.py``.

    Parameters
    ----------
    test_size : float or int, default 0.70
        Forwarded unchanged to ``train_test_split``. The default keeps the
        original behaviour (70% of the rows held out, 30% used for fitting).

    Returns
    -------
    None
        Results are reported through ``print`` and the exported file.
    """
    X = df_ln_regression.iloc[:, :-1]
    y = df_ln_regression.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    tpot = TPOTRegressor(generations=5, population_size=20,
                         verbosity=2, random_state=42,
                         cv=cv, n_jobs=-1, config_dict="TPOT light")

    tpot.fit(X_train, y_train)

    y_pred = tpot.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE instead of passing `squared=False`: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6. For the 1-D target
    # used here the two forms give the same value.
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)

    print("The model performance for testing set")
    print("--------------------------------------")
    print('MAE is {}'.format(mae))
    print('MSE is {}'.format(mse))
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))

    # Score as reported by TPOT itself on the held-out rows.
    print("TPOT:")
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_pipe_ln_regression.py')




In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import missingno as msno

# Font sizes referenced by the rcParams overrides below.
large = 22; med = 16; small = 12

# Global matplotlib defaults for every figure in this notebook.
# 'axes.titlesize' used to appear twice in this dict (large, then med). The
# later entry silently won, so only the effective value (med) is kept.
params = {'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Prefer the new name and fall back to
# the legacy one so the cell runs on both old and new matplotlib.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
sns.set_style("white")


from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

# Binary classification input, indexed by its "timestamp" column.
df_classification_binary_4_days = pd.read_csv(
    "binary_danger_window_4_days_df.csv",
    index_col="timestamp",
)

# Features are every column except the last; the last column is the label.
X = df_classification_binary_4_days.iloc[:, :-1]
y = df_classification_binary_4_days.iloc[:, -1].values

# Unshuffled split: the leading 30% of rows is used for fitting and the
# trailing 70% is held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.70,
    shuffle=False,
)

# Stratified 10-fold CV, repeated 3 times with a fixed seed, used by the TPOT search.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

def get_tpot_pipe_binary_4(scoring):
    """Run a TPOT classifier search optimised for ``scoring`` and export it.

    Fits on the module-level ``X_train`` / ``y_train`` with the module-level
    ``cv`` splitter, then evaluates on ``X_test`` / ``y_test``. It prints the
    confusion matrix, the classification report and TPOT's own score, and
    writes the best pipeline to ``tpot_pipe_binary_4_<scoring>.py``.

    Because the data and splitter are read from globals, the split that is
    actually used is whichever one was most recently assigned in the kernel.

    Parameters
    ----------
    scoring : str
        Metric name passed to ``TPOTClassifier(scoring=...)``. It is also
        concatenated into the printed header and the export file name.
    """
    search = TPOTClassifier(
        generations=5,
        population_size=20,
        verbosity=2,
        random_state=42,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    predicted = search.predict(X_test)

    print("Optimized Metric: " + scoring)

    matrix = confusion_matrix(y_test, predicted)
    print("Confusion Matrix:")
    print(matrix)

    report = classification_report(y_test, predicted)
    print("Classification Report:")
    print(report)

    print("TPOT:")
    held_out_score = search.score(X_test, y_test)
    print(held_out_score)

    export_path = 'tpot_pipe_binary_4_%s.py' % (scoring)
    search.export(export_path)
    
# try:
#     get_tpot_pipe_binary_4("precision")
# except:
#     pass
# try:
#     get_tpot_pipe_binary_4("f1")
# except:
#     pass
# try:
#     get_tpot_pipe_binary_4("accuracy")
# except:
#     pass

In [3]:
# Fraction of flagged samples in each group, taken from hard-coded counts.
# NOTE(review): the counts look like confusion-matrix entries; confirm their source.
_not_danger_fraction = (202) / (202 + 127690)
_danger_fraction = 679 / (25511 + 679)

# Scale by 1440 and then by 4, in that order, so float rounding matches the
# original expressions exactly.
# NOTE(review): presumably minutes per day times a 4-day window; TODO confirm.
warnings_not_danger = _not_danger_fraction * 1440 * 4
warnings_danger = _danger_fraction * 1440 * 4

print(warnings_not_danger, warnings_danger)


9.097676164263598 149.33333333333334


In [4]:
# series = pd.Series(y_test) * 1
# series.plot()

In [5]:
# series = pd.Series(y_train) * 1
# series.plot()

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import missingno as msno

# Font sizes referenced by the rcParams overrides below.
large = 22; med = 16; small = 12

# Global matplotlib defaults for every figure in this notebook.
# 'axes.titlesize' used to appear twice in this dict (large, then med). The
# later entry silently won, so only the effective value (med) is kept.
params = {'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Prefer the new name and fall back to
# the legacy one so the cell runs on both old and new matplotlib.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
sns.set_style("white")


from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

# Binary classification input, indexed by its "timestamp" column.
df_classification_binary_4_days = pd.read_csv(
    "binary_danger_window_4_days_df.csv",
    index_col="timestamp",
)

# Features are every column except the last; the last column is the label.
X = df_classification_binary_4_days.iloc[:, :-1]
y = df_classification_binary_4_days.iloc[:, -1].values

# Unshuffled split: the leading 70% of rows is used for fitting and the
# remainder is held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    shuffle=False,
)

# Stratified 10-fold CV, repeated 3 times with a fixed seed, used by the TPOT search.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

def get_tpot_pipe_binary_4_70train(scoring):
    """Run a TPOT classifier search optimised for ``scoring`` and export it.

    Fits on the module-level ``X_train`` / ``y_train`` with the module-level
    ``cv`` splitter, then evaluates on ``X_test`` / ``y_test``. It prints the
    confusion matrix, the classification report and TPOT's own score, and
    writes the best pipeline to ``tpot_pipe_binary_4_<scoring>_70train.py``.

    Because the data and splitter are read from globals, the split that is
    actually used is whichever one was most recently assigned in the kernel.

    Parameters
    ----------
    scoring : str
        Metric name passed to ``TPOTClassifier(scoring=...)``. It is also
        concatenated into the printed header and the export file name.
    """
    search = TPOTClassifier(
        generations=5,
        population_size=20,
        verbosity=3,
        random_state=42,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    predicted = search.predict(X_test)

    print("Optimized Metric: " + scoring)

    matrix = confusion_matrix(y_test, predicted)
    print("Confusion Matrix:")
    print(matrix)

    report = classification_report(y_test, predicted)
    print("Classification Report:")
    print(report)

    print("TPOT:")
    held_out_score = search.score(X_test, y_test)
    print(held_out_score)

    export_path = 'tpot_pipe_binary_4_%s_70train.py' % (scoring)
    search.export(export_path)
    
# Run the search once per metric. A failure for one metric is reported and the
# next metric is still attempted (best-effort, as before), but the error is no
# longer discarded silently. Only `Exception` is caught, so a KeyboardInterrupt
# raised outside TPOT's own handling stops the cell instead of being swallowed.
for scoring_name in ("f1", "accuracy"):
    try:
        get_tpot_pipe_binary_4_70train(scoring_name)
    except Exception as exc:
        print("get_tpot_pipe_binary_4_70train(%r) failed: %r" % (scoring_name, exc))



32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]

Skipped pipeline #4 due to time out. Continuing to the next pipeline.
Skipped pipeline #6 due to time out. Continuing to the next pipeline.
Skipped pipeline #8 due to time out. Continuing to the next pipeline.
Skipped pipeline #10 due to time out. Continuing to the next pipeline.
Skipped pipeline #13 due to time out. Continuing to the next pipeline.
Skipped pipeline #15 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
Skipped pipeline #19 due to time out. Continuing to the next pipeline.
Skipped pipeline #21 due to time out. Continuing to the next pipeline.
Skipped pipeline #23 due to time out. Continuing to the next pipeline.
Skipped pipeline #25 due to time out. Continuing to the next pipeline.
Skipped pipeline #28 due to time out. Continuing to the next pipeline.
Skipped pipeline #30 due to time out. Continuing to the next pipeline.
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 fe

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.


In [9]:
# 679 / (25511 + 679) scaled by 1440; the cell displays the resulting value.
# NOTE(review): 1440 is presumably minutes per day (a per-day rate); TODO confirm.
1440 * 679/(25511+679)

37.333333333333336