In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import seaborn as sns

import time
import rtl_func

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')






In [28]:
def best_params_choice(RTL_frame, label_frame, RTL_grid=RTL_grid, labels_grid=labels_grid):
    best_result = 0
    best_params = []
    for n_i,i in enumerate(label_frame.columns):
        for n_j, j in enumerate(RTL_frame.columns):
            if f1_score(y[i],RTL[j]) > best_result:
                best_params = (n_i, n_j)
                best_result = f1_score(y[i],RTL[j])
                best_prec = precision_score(y[i],RTL[j])
                best_rec = recall_score(y[i],RTL[j])
                
    print "The best F1 score:", best_result 
    print "the corresponding Precision score: %f, Recall score: %f" %(best_prec, best_rec)
    print
    print "The best RTL Params:"
    print "r0: %i" %(RTL_grid[n_j][0])
    print "t0: %i" %(RTL_grid[n_j][1])
    print 
    print "The most successful Label Params:"
    print "M_c: %i" %(labels_grid[n_i][0])
    print "r_c: %i" %(labels_grid[n_i][1])
    print "delta_c: %i" %(labels_grid[n_i][2])
    print "t_c: %i" %(labels_grid[n_i][3])

    

In [19]:
data = pd.read_csv('rtl_on_grid.csv',delimiter=",")
y = pd.read_csv("labels.csv")
y.drop(["class","diff_in_days_before_first_eqarthquake"],axis = 1, inplace = True)

In [20]:
r0 = [10,25,50]
t0 = [30,90,180,365,730]
RTL_grid = rtl_func.make_grid((r0,t0))
RTL_names = []
for params in RTL_grid:
    r_name = "r_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    t_name = "t_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    l_name = "l_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    RTL_names.append((r_name,t_name,l_name))
    


In [21]:
RTL_ind = []
for x,param in enumerate(RTL_names):
    data["RTL"+str(x)] = data[param[0]].values * data[param[1]].values * data[param[2]].values
    #data["RTL"+str(x)] = data["RTL"+str(x)].values - pd.rolling_mean(data["RTL"+str(x)].values, 5)
    RTL_ind.append(("RTL"+str(x)))
RTL = rtl_func.make_anomaly_on_q_level(data[RTL_ind].copy(), y, 0.95, 0.05)

Find best RTL score and write corresponding params

In [29]:
best_params_choice(RTL, y)

The best F1 score: 0.574647530068
the corresponding Precision score: 0.786313, Recall score: 0.452768

The best RTL Params:
r0: 50
t0: 730

The most successful Label Params:
M_c: 6
r_c: 200
delta_c: 90
t_c: 365


Create two additional features using "OR" law and "Major choice" law

In [31]:
RTL["major_RTL"] = RTL.sum(axis=1) 
RTL["major_RTL"] = (RTL["major_RTL"].astype(float) / 15).astype(int)
RTL["or_RTL"] = RTL.sum(axis=1) 
RTL["or_RTL"] = (RTL["or_RTL"].astype(float) > 0).astype(int)

Print Scores for "major_RTL" and "or_RTL"

In [35]:
best_params_choice(pd.DataFrame(RTL["major_RTL"]), y)
print
best_params_choice(pd.DataFrame(RTL["or_RTL"]), y)

The best F1 score: 0.575098591549
the corresponding Precision score: 0.889013, Recall score: 0.425022

The best RTL Params:
r0: 10
t0: 30

The most successful Label Params:
M_c: 6
r_c: 200
delta_c: 90
t_c: 365

The best F1 score: 0.548810334358
the corresponding Precision score: 0.613133, Recall score: 0.496702

The best RTL Params:
r0: 10
t0: 30

The most successful Label Params:
M_c: 6
r_c: 200
delta_c: 90
t_c: 365
