In [1]:
import glob
import os
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
%matplotlib inline

In [2]:
def aggregate_data(data_path:str,save_path:str,file_name:str)-> pd.DataFrame:
    """
    Aggregate every segment file into one row of per-sensor summary statistics.

    Each CSV matched by ``data_path`` is read, missing values are replaced
    with 0, and every column is reduced to its sum, mean, std, max, min, skew
    and kurtosis. The resulting table is written to ``save_path + file_name``
    and also returned.

    Parameters:
        data_path: glob pattern of the segment files, e.g. 'train/*.csv'
        save_path: prefix that is concatenated with file_name to build the
            output path (so a directory must end with a path separator)
        file_name: name of the CSV file to write
    Returns:
        pd.DataFrame with one row per segment file, indexed by the file's
        base name up to the first '.', and one '<column>_<statistic>' column
        per input column and statistic
    """
    output_path = os.path.abspath(save_path+file_name)

    df_dict = {}
    for filename in glob.glob(data_path):
        # The output is often written into the directory that is globbed;
        # skip it so a re-run does not treat the old result as a segment.
        if os.path.abspath(filename) == output_path:
            continue
        file = pd.read_csv(filename).fillna(0)
        # Derive the id from the base name so nested directories and
        # absolute paths do not break the lookup.
        ind = os.path.basename(filename).split('.')[0]
        # Series.append was removed in pandas 2.0: collect the pieces and
        # concatenate them once.
        df_dict[ind] = pd.concat([
            file.sum().add_suffix('_sum'),
            file.mean().add_suffix('_mean'),
            file.std().add_suffix('_std'),
            file.max().add_suffix('_max'),
            file.min().add_suffix('_min'),
            file.skew().add_suffix('_skew'),
            file.kurtosis().add_suffix('_kurtosis'),
        ])
    df = pd.DataFrame(df_dict).transpose()
    df.to_csv(save_path+file_name)
    return df

def quantile_data(data_path: str, save_path: str, file_name: str) -> pd.DataFrame:
    """
    Reduce every segment file to one row of per-sensor upper quantiles.

    Each CSV matched by ``data_path`` is read, missing values are replaced
    with 0, and the 0.8, 0.9 and 0.99 quantiles of every column are computed.
    The resulting table is written to ``save_path + file_name`` and also
    returned.

    Parameters:
        data_path: glob pattern of the segment files, e.g. 'train/*.csv'
        save_path: prefix that is concatenated with file_name to build the
            output path (so a directory must end with a path separator)
        file_name: name of the CSV file to write
    Returns:
        pd.DataFrame with one row per segment file, indexed by the file's
        base name up to the first '.', and one '<column>_<quantile>' column
        per input column and quantile
    """
    quantile_levels = (0.8, 0.9, 0.99)
    output_path = os.path.abspath(save_path+file_name)

    df_dict = {}
    for filename in tqdm.tqdm(list(glob.glob(data_path))):
        # The output is often written into the directory that is globbed;
        # skip it so a re-run does not treat the old result as a segment.
        if os.path.abspath(filename) == output_path:
            continue
        file = pd.read_csv(filename).fillna(0)
        # Derive the id from the base name so nested directories and
        # absolute paths do not break the lookup.
        ind = os.path.basename(filename).split('.')[0]
        # Series.append was removed in pandas 2.0: collect the pieces and
        # concatenate them once.
        df_dict[ind] = pd.concat([
            file.quantile(level).add_suffix('_{}'.format(level))
            for level in quantile_levels
        ])
    df = pd.DataFrame(df_dict).transpose()
    df.to_csv(save_path+file_name)
    return df
    

def split_train_test_with_sklearn(df:pd.DataFrame)->"tuple[pd.DataFrame, pd.DataFrame]":
    """
    Split a data frame row-wise into a train part and a test part.

    Delegates to sklearn's train_test_split with test_size=0.33 and a fixed
    random_state=42, so the same input always produces the same split.

    Parameters:
        df: data frame to split
    Returns:
        train, test: the two parts, in the order train_test_split returns them
    """
    train, test = train_test_split(df,test_size=0.33,random_state=42)
    return train, test

def prepare_for_modeling(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.Series]":
    """
    Split a data frame into the feature matrix and the target column.

    Parameters:
        df: data frame that contains a 'time_to_eruption' column
    Returns:
        X: every column of df except 'time_to_eruption'
        y: the 'time_to_eruption' column
    Raises:
        KeyError: if df has no 'time_to_eruption' column
    """
    y = df['time_to_eruption']
    # drop returns a new frame, so the caller's df is left untouched
    X = df.drop(columns=['time_to_eruption'])
    return X,y

https://www.kaggle.com/jesperdramsch/introduction-to-volcanology-seismograms-and-lgbm

http://eqseis.geosc.psu.edu/cammon/HTML/Classes/IntroQuakes/Notes/seismometers.html

# Aggregate data and calculate percentile

In [None]:
# One-off preprocessing, kept commented out: uncomment to rebuild the feature
# CSVs from the raw segment files.
# NOTE(review): these calls would write 'train/aggredated_train.csv', while the
# loading cell reads 'aggregated_train.csv' from the working directory -
# confirm file name and location before re-running.
#aggregate_data('train/*.csv','train/','aggredated_train.csv')
#aggregate_data('test/*.csv','test/','aggredated_test.csv')

#quantile_data('train/*.csv','train/','quantiled_train.csv')
#quantile_data('test/*.csv','test/','quantiled_test.csv')

# Read train data

In [3]:
# Load the targets and both feature tables, then join everything on segment_id.
# The feature files store the segment id in their unnamed first column.
time = pd.read_csv('train.csv')
df1 = pd.read_csv('aggregated_train.csv').rename(columns={'Unnamed: 0': 'segment_id'})
df2 = pd.read_csv('quantiled_train.csv').rename(columns={'Unnamed: 0': 'segment_id'})
df = (
    time
    .merge(df1, on=['segment_id'])
    .merge(df2, on='segment_id')
    .set_index('segment_id')
)
df.head()

Unnamed: 0_level_0,time_to_eruption,sensor_1_sum,sensor_2_sum,sensor_3_sum,sensor_4_sum,sensor_5_sum,sensor_6_sum,sensor_7_sum,sensor_8_sum,sensor_9_sum,...,sensor_1_0.99,sensor_2_0.99,sensor_3_0.99,sensor_4_0.99,sensor_5_0.99,sensor_6_0.99,sensor_7_0.99,sensor_8_0.99,sensor_9_0.99,sensor_10_0.99
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1136037770,12262005,-96621.0,276834.0,213587.0,121201.0,0.0,-583141.0,423609.0,76103.0,87904.0,...,704.0,1755.0,733.0,904.0,0.0,1518.0,1247.0,1420.0,899.0,1375.0
1969647810,32739612,85569.0,149069.0,-167659.0,-102036.0,43927.0,-538513.0,352219.0,-65211.0,188411.0,...,1108.0,1740.0,902.0,904.0,737.0,845.0,1207.0,846.0,1022.0,2494.0
1895879680,14965999,150278.0,326988.0,-95314.0,-69051.0,0.0,-799715.0,-117460.0,-246701.0,-110002.0,...,613.0,2240.0,527.0,711.0,0.0,1037.0,828.0,774.0,646.0,1231.0
2068207140,26469720,129950.0,-22403.0,-161942.0,-79013.0,18528.0,-50214.0,-50589.0,-10519.0,-201797.0,...,556.0,1116.0,467.0,579.0,424.0,732.0,716.0,766.0,563.0,1339.0
192955606,31072429,4429.0,0.0,119935.0,14705.0,-26483.0,132341.0,-287066.0,-103821.0,97376.0,...,636.0,0.0,553.0,635.0,469.0,1007.0,691.0,940.0,682.0,1334.0


In [9]:
# (rows, columns) of the merged training frame
df.shape

(4431, 101)

In [6]:
# Column labels of the merged training frame: the target plus the feature columns
df.columns

Index(['time_to_eruption', 'sensor_1_sum', 'sensor_2_sum', 'sensor_3_sum',
       'sensor_4_sum', 'sensor_5_sum', 'sensor_6_sum', 'sensor_7_sum',
       'sensor_8_sum', 'sensor_9_sum',
       ...
       'sensor_1_0.99', 'sensor_2_0.99', 'sensor_3_0.99', 'sensor_4_0.99',
       'sensor_5_0.99', 'sensor_6_0.99', 'sensor_7_0.99', 'sensor_8_0.99',
       'sensor_9_0.99', 'sensor_10_0.99'],
      dtype='object', length=101)

In [None]:
# Disabled experiment: for every feature suffix (the part after the last '_',
# e.g. 'sum' or 'mean'), add one column holding the row-wise mean of all
# columns that end with that suffix.
#column_name_for_sum = set([col.split('_')[-1] for col in df.columns if col != 'time_to_eruption'])
#for col_name in column_name_for_sum:
        #df[col_name] = df[[col for col in df.columns if col.endswith('_{}'.format(col_name))]].mean(axis=1)
    


# Train model with feature selection or without

In [10]:
# split train/test
train,test = split_train_test_with_sklearn(df)
print('Train shape is {}, test size is {}'.format(train.shape,test.shape))

X_train,y_train = prepare_for_modeling(train)
X_test,y_test = prepare_for_modeling(test)

# Fixed seed for every stochastic estimator, so the results table and the
# selected features are the same on every Restart & Run All.
SEED = 42

# list of models
lin_reg = LinearRegression()
lasso = Lasso(alpha = 0.5)
dec_tree = DecisionTreeRegressor(random_state=SEED)
ran_for = RandomForestRegressor(random_state=SEED)
xgb_meta = XGBRegressor(colsample_bytree=0.4,
                        gamma=0,
                        learning_rate=0.07,
                        max_depth=3,
                        min_child_weight=1.5,
                        n_estimators=1000,
                        reg_alpha=0.75,
                        reg_lambda=0.45,
                        subsample=0.6,
                        random_state=2)  # documented sklearn-API name for the former seed=2
models = [lin_reg,lasso,dec_tree,ran_for,xgb_meta ]

# model name -> [MAE on train, MAE on test], both truncated to int
model_res = {}
for model in models:
    # The selector is seeded too, so every model is evaluated on the same
    # feature subset (features with importance >= median).
    clf = Pipeline([('feature_selection', SelectFromModel(RandomForestRegressor(random_state=SEED),threshold='median')),
                      ('classification', model)
                    ])
    clf.fit(X_train,y_train)
    #pred for train
    y_pred_train = clf.predict(X_train)
    mae_train = mean_absolute_error(y_train,y_pred_train)

    # pred for test
    y_pred = clf.predict(X_test)
    mae_test = mean_absolute_error(y_test,y_pred)
    model_res[type(model).__name__] = [int(mae_train),int(mae_test)]

    


  positive)


In [11]:
# One row per model, one column per data split.
pd.DataFrame(
    model_res,
    index=['MAE train with feature importance','MAE test with feature importance'],
).T

Unnamed: 0,MAE train with feature importance,MAE test with feature importance
LinearRegression,9893229,10584854
Lasso,9953408,10541933
DecisionTreeRegressor,0,4967707
RandomForestRegressor,1635996,4389219
XGBRegressor,2140012,4863029


Linear Regression and Lasso have a large Mean Absolute Error on both the train and the test set. The two errors are similar and very high, which indicates underfitting. In this case we can try a more complex model, look for better features, or reduce the regularization of the model.

In the Decision Tree Regressor case the MAE on the train set is 0, which means the model has overfitted the training data. Its results are still better than those of Linear Regression and Lasso on both the train and the test set.

The train and test results for Random Forest and XGBRegressor show that these models are overfitted as well.
In this case we need to regularize the model, use a less complex model or reduce the number of features.

# Check which features are important

In [12]:
# Names of the features kept by the feature-selection step.
# When the cells are run in order, `clf` is the pipeline fitted last in the
# model loop, i.e. the one wrapping the XGBRegressor.
selected_mask = clf.named_steps['feature_selection'].get_support()
X_train.columns[selected_mask]

Index(['sensor_1_std', 'sensor_2_std', 'sensor_3_std', 'sensor_4_std',
       'sensor_5_std', 'sensor_6_std', 'sensor_8_std', 'sensor_9_std',
       'sensor_1_max', 'sensor_2_max', 'sensor_3_max', 'sensor_5_max',
       'sensor_6_max', 'sensor_1_min', 'sensor_2_min', 'sensor_3_min',
       'sensor_5_min', 'sensor_6_min', 'sensor_9_min', 'sensor_1_kurtosis',
       'sensor_2_kurtosis', 'sensor_5_kurtosis', 'sensor_6_kurtosis',
       'sensor_9_kurtosis', 'sensor_10_kurtosis', 'sensor_1_0.8',
       'sensor_2_0.8', 'sensor_3_0.8', 'sensor_4_0.8', 'sensor_5_0.8',
       'sensor_6_0.8', 'sensor_7_0.8', 'sensor_8_0.8', 'sensor_10_0.8',
       'sensor_1_0.9', 'sensor_2_0.9', 'sensor_3_0.9', 'sensor_5_0.9',
       'sensor_6_0.9', 'sensor_7_0.9', 'sensor_8_0.9', 'sensor_9_0.9',
       'sensor_10_0.9', 'sensor_1_0.99', 'sensor_2_0.99', 'sensor_5_0.99',
       'sensor_6_0.99', 'sensor_7_0.99', 'sensor_8_0.99', 'sensor_9_0.99'],
      dtype='object')

Shown here are the features kept by the selection step: the 50% of all features whose importance is at or above the median.

# Cross validation

In [None]:
# Cross-validated MAE as an alternative to the single train/test split.
X_df_cross,y_df_cross = prepare_for_modeling(df)
scores = cross_val_score(
    ran_for,
    X_df_cross,
    y_df_cross,
    scoring='neg_mean_absolute_error',
    cv=5,
)
# The scorer yields negated MAE values, so flip the sign before summarising.
print('Mean MAE for cross val score is {}'.format(np.mean(-scores)))
print('Std for MAE for cross val score is {}'.format(np.std(-scores)))

Alternative to train/test set

# Grid search for parameters

In [None]:
# Quantile features only: join the targets with the quantile table.
df_x = pd.merge(time,df2, on=['segment_id'])
df_x = df_x.set_index('segment_id')

train,test = split_train_test_with_sklearn(df_x)

X_train,y_train = prepare_for_modeling(train)
X_test,y_test = prepare_for_modeling(test)

# Every list holds a single value, so the search evaluates exactly one
# configuration with 5-fold cross-validation; add values to widen the search.
param_grid = [{
    'n_estimators':[150],
    'max_depth':[15],
    'min_child_weight': [1],
    'subsample': [1],
    'colsample_bytree': [1],
    'learning_rate':[0.10],
    # Other parameters
    'objective':['reg:squarederror']}]


xgb_cl = XGBRegressor()
grid_search = GridSearchCV(xgb_cl,param_grid,cv=5, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train,y_train)

# best_estimator_ is already refit on the whole training split
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test)
mae = mean_absolute_error(y_test,y_pred)
print('MAE: {}'.format(mae))

# Prepare data for submission


In [None]:
#read test file from kaggle
# Test-specific names: df1/df2 keep holding the train tables that earlier
# cells read, so those cells stay re-runnable.
# NOTE(review): the aggregated file is read from the working directory and the
# quantile file from 'test/' - confirm both paths.
aggregated_test = pd.read_csv('aggredated_test.csv').rename(columns = {'Unnamed: 0':'segment_id'})
quantiled_test = pd.read_csv('test/quantiled_test.csv').rename(columns = {'Unnamed: 0':'segment_id'})
df_test = pd.merge(aggregated_test,quantiled_test,on='segment_id')
df_test = df_test.set_index('segment_id')

#whole dataset for train and test from kaggle
X_df,y_df = prepare_for_modeling(df)

# Give the model exactly the training columns in the training order; a missing
# column fails here with a clear KeyError instead of inside predict().
df_test = df_test[X_df.columns]

# Separate name so the `clf` pipeline inspected by the feature-importance cell
# is not replaced.
submission_clf = Pipeline([
  #('feature_selection', SelectFromModel(RandomForestRegressor(),threshold=0.012)),
  ('classification', XGBRegressor())
])

submission_clf.fit(X_df,y_df)
y_pred = submission_clf.predict(df_test)

# results submission 
data = {'segment_id':df_test.index,
        'time_to_eruption':y_pred}
forsub = pd.DataFrame(data)
forsub = forsub.astype(int)
sub = pd.read_csv('sample_submission.csv')
sub = sub.drop('time_to_eruption',axis=1)
submission_file = pd.merge(sub,forsub, on='segment_id')
# The inner merge silently drops segments without a prediction; fail loudly instead.
assert len(submission_file) == len(sub), 'merge changed the number of submission rows'
# index=False keeps the pandas row index out of the file
#submission_file.to_csv('submission_file_XGB_3.csv', index=False)

The best submission was XGBoost trained on the quantile features only.
Leaderboard position: 196

# Trash -> First version for model training

In [None]:
# split train/test
train,test = split_train_test_with_sklearn(df)
print('Train shape is {}, test size is {}'.format(train.shape,test.shape))

X_train,y_train = prepare_for_modeling(train)
X_test,y_test = prepare_for_modeling(test)

lin_reg = LinearRegression()
lasso = Lasso(alpha = 0.5)
dec_tree = DecisionTreeRegressor()
ran_for = RandomForestRegressor()
xgb_meta = XGBRegressor(colsample_bytree=0.4,
                        gamma=0,
                        learning_rate=0.07,
                        max_depth=3,
                        min_child_weight=1.5,
                        n_estimators=1000,
                        reg_alpha=0.75,
                        reg_lambda=0.45,
                        subsample=0.6,
                        seed=2)

# Fit each baseline on the train split and print its MAE on the test split,
# in the same order and with the same messages as before.
for template, estimator in [
    ('Linear Regression MSA: {}', lin_reg),
    ('Lasso MSA: {}', lasso),
    ('Decision Tree MSA: {}', dec_tree),
    ('Random Forest Regressor MSA: {}', ran_for),
    ('XGBRegressor MSA: {}', xgb_meta),
]:
    estimator.fit(X_train,y_train)
    y_pred = estimator.predict(X_test)
    MSA = mean_absolute_error(y_test,y_pred)
    print(template.format(MSA))