In [1]:
from rtl_func import *
import numpy as np
import pandas as pd 

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

import seaborn as sns

import time

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings('ignore')


In [2]:
data = pd.read_csv('test_features.csv',delimiter=",")
y = pd.read_csv("labels.csv")
data.drop(["lat","l_r","long","class","diff_in_days_before_first_eqarthquake"],axis = 1, inplace = True)

make list RTL_names with names of grid parameters

In [3]:
r0 = [10,25,50,100]
t0 = [30,90,180,365]
grid_params = make_grid((r0,t0))
RTL_names = []

for params in grid_params:
    r_name = "r_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    t_name = "t_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    l_name = "l_where:_r0=" + str(params[0]) + "_t0=" + str(params[1])
    RTL_names.append((r_name,t_name,l_name))

compute the values of RTL's
RTL_ind - list with names of RTL's

In [4]:
RTL_ind = []

for x,param in enumerate(RTL_names):
    data["RTL"+str(x)] = data[param[0]].values * data[param[1]].values * data[param[2]].values
    #data["RTL"+str(x)] = data["RTL"+str(x)].values - pd.rolling_mean(data["RTL"+str(x)].values, 5)
    RTL_ind.append(("RTL"+str(x)))

transform RTL statistics to {0,1}-features using 5% quantiles 

In [16]:
RTL = make_anomaly_on_q_level(data[RTL_ind].copy(), y, 0.9, 0.00)

In [6]:
best_y = y["index_where:_m_c=5.0_r_c=50.0_delta_c=10.0_t_c180.0"]

make grid for labels 

In [7]:
m_c = [5.,6.]
r_c = [50.,100.,200.]
delta_c = [10.,30.,90.]
t_c = [180.,365.]

labels_grid = make_grid((m_c,r_c,delta_c,t_c))

find the best values for label and RTL

In [8]:
best_result = 0 

for i in y.columns:
    for j in RTL.columns:
        if roc_auc_score(y[i],RTL[j]) > best_result:
            best_params = (i, j) 
            best_result = f1_score(y[i],RTL[j])
        


In [9]:
print best_params[0], labels_grid[6]

y = y[best_params[0]]
y_pred = RTL[best_params[1]]
    
print f1_score(y, y_pred)
print precision_score(y, y_pred), recall_score(y, y_pred) 

index_where:_m_c=5.0_r_c=50.0_delta_c=10.0_t_c180.0 [   5.  100.   10.  180.]
0.636825768668
0.689384525651 0.591713464803


Now train models using all RTL's and the best label grid

In [12]:
models = []

logres_parameters = [{"penalty":["l2"],
                      "C":[1]}]
models.append(["LogRegression", LogisticRegression(), logres_parameters])

GB_parameters =[{"max_depth":[5],
                  "n_estimators":[100]}]

models.append(["GBClassifier", GradientBoostingClassifier(), GB_parameters])

rf_parameters = [{"n_estimators": [10,100]}]
models.append(["RandomForest",
                    RandomForestClassifier(),
                    rf_parameters])

In [17]:
best_scores = []
best_params = []

for name, model, parameters in models:
    for params in parameters:
        estimator = model
        cv = KFold(n_splits=4, shuffle=True, random_state=13)
        clf = GridSearchCV(estimator, param_grid=params,scoring = 'f1', n_jobs=6, cv=cv)
        clf.fit(RTL, best_y)
        best_scores.append(clf.best_score_)
        best_params.append(clf.best_params_)
        
        print(name, clf.best_score_, clf.best_params_)

('LogRegression', 0.63712284450718482, {'penalty': 'l2', 'C': 1})
('GBClassifier', 0.66819158279805457, {'n_estimators': 100, 'max_depth': 5})
('RandomForest', 0.66791207092191651, {'n_estimators': 100})


In [9]:
best_scores = []
best_params = []

for name, model, parameters in models:
    for params in parameters:
        estimator = model
        cv = KFold(n_splits=4, shuffle=True, random_state=13)
        clf = GridSearchCV(estimator, param_grid=params,scoring = 'f1', n_jobs=6, cv=cv)
        clf.fit(RTL, best_y)
        best_scores.append(clf.best_score_)
        best_params.append(clf.best_params_)
        
        print(name, clf.best_score_, clf.best_params_)

('GBClassifier', 0.65573670534552242, {'n_estimators': 100, 'max_depth': 5})
('RandomForest', 0.65684801160735296, {'n_estimators': 10})


In [10]:
best_scores = []
best_params = []

for name, model, parameters in models:
    for params in parameters:
        estimator = model
        cv = KFold(n_splits=4, shuffle=True, random_state=13)
        clf = GridSearchCV(estimator, param_grid=params,scoring = 'f1', n_jobs=6, cv=cv)
        clf.fit(data[RTL_ind], best_y)
        best_scores.append(clf.best_score_)
        best_params.append(clf.best_params_)
        
        print(name, clf.best_score_, clf.best_params_)

('GBClassifier', 0.72520882974290057, {'n_estimators': 100, 'max_depth': 5})
('RandomForest', 0.82368754566453284, {'n_estimators': 100})


In [None]:
rf = RandomForestClassifier(n_estimators = 10)
rf.fit(RTL,best_y)