# model TPH

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import timedelta
import re
from dateutil.parser import parse
import string
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_percentage_error
import missingno as msno
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
import scipy
import researchpy as rp
from matplotlib.offsetbox import AnchoredText
import pacmap
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split 
# Import Halving Grid Search
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
import xgboost as xgb
from scipy.stats import kurtosis,skew
from numpy import mean,sqrt,square
from scipy.fftpack import fft, fftfreq
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import PredefinedSplit
# Notebook-wide display settings: show every DataFrame column, allow up to
# 100 characters per cell, and draw seaborn/matplotlib plots on a dark grid.
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', 100)
sns.set_style(style="darkgrid")

In [2]:
# Filters tags (features) by high pairwise correlation
def corrkill(dataframe, features, corr_cut=0.8):
    """Report feature pairs whose absolute correlation exceeds ``corr_cut``.

    The correlation matrix of ``dataframe[features]`` is computed with
    ``DataFrame.corr()`` and every unordered pair of distinct features is
    inspected exactly once (strictly-upper triangle of the matrix).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding at least the columns listed in ``features``.
    features : list
        Column labels to correlate against each other.
    corr_cut : float, default 0.8
        A pair is reported when ``abs(correlation) > corr_cut``.

    Returns
    -------
    tuple
        ``(kill_list, pairs)`` where ``kill_list`` is the list of the second
        member (``Feat2``) of every reported pair -- a feature appears once
        per pair it takes part in, so duplicates are possible -- and
        ``pairs`` is a DataFrame with columns ``Feat1``, ``Feat2`` and
        ``Val`` (absolute correlation), sorted by ``Val`` descending.
    """
    corr = dataframe[features].corr()

    # Pick the strictly-upper triangle by POSITION (k=1 skips the diagonal).
    # Filtering by value instead (dropping every 0 and 1) would also discard
    # perfectly correlated pairs, e.g. duplicated columns, which are the
    # strongest candidates for removal.
    fila, columna = np.nonzero(np.triu(np.ones(corr.shape, dtype=bool), k=1))

    dfcorr = pd.DataFrame({
        'Feat1': corr.index.to_numpy()[fila],
        'Feat2': corr.columns.to_numpy()[columna],
        'Val': np.abs(corr.to_numpy()[fila, columna]),
    })

    # NaN correlations (e.g. constant columns) compare False and drop out here.
    dfcorr = dfcorr[dfcorr['Val'] > corr_cut]

    return list(dfcorr['Feat2']), dfcorr.sort_values(by="Val",ascending=False)

In [3]:
# Columns handed to the MinMaxScaler step of the preprocessing
# ColumnTransformer. That transformer is built without a `remainder`
# argument, so (with scikit-learn's default of dropping the remainder)
# these are the only columns that reach the regressor.
tag_select = [
    'HH TPH',
    'min_speed_3',
    'min_solid percentage_10',
    'rms_bornite law_3',
    'sag power index_(t-2)',
    'min_charge cell_5',
    'rms_chalcopyrite law_3',
    'min_ball work index_5',
    'max_chalcocite law_3',
    'rms_covelin law_3',
    'Edad',
    'max_granulometry_10',
    'max_crusher index_3',
    'var_speed_10',
    'var_ball work index_3',
    'min_pyrite law_3',
    'var_charge cell_10',
    'var_solid percentage_3',
    'var_pyrite law_5',
    'var_pyrite law_3',
    'var_chalcopyrite law_3',
    'var_solid percentage_5',
]

In [4]:
# Lag sweep: for every lag (`desfase`) from 0 to 15 rows, shift the targets so
# each row is paired with the TPH observed `desfase` rows later, refit the
# preprocessing + XGBoost pipeline and store the test-set metrics.
# The five lists below are the outputs of this cell (one entry per lag).
list_r2=[]
lis_rmse=[]
list_std=[]
list_error_mean=[]
list_desfases=[]

# Inclusive (start, end) partial-date ranges of each split. They are applied
# with `.loc[start:end]`; selecting rows with `df["2020-05"]` was removed in
# pandas 2.0 and raises KeyError there.
PERIODOS_ENTRENAMIENTO=[("2020-01","2020-03"),("2020-05","2020-05"),("2020-07","2020-12"),
                        ("2021-01","2021-08"),("2021-10","2021-10"),("2021-12","2021-12")]
# Months kept out of both training and test. This cell does not consume them.
PERIODOS_VALIDACION=[("2020-06","2020-06"),("2021-11","2021-11")]
PERIODOS_TEST=[("2020-04","2020-04"),("2021-09","2021-09"),("2022","2022")]

# Read the full feature CSV once (it does not depend on the lag) and sort it
# chronologically BEFORE any shift, so the lag always runs along time order.
df_raw=pd.read_csv('../../data/consolidated data/df_features_10November2022.csv', parse_dates=['Timestamp'], index_col='Timestamp').sort_index()

# Column filter (also lag-independent): drop every column whose name contains
# "water" or "power", but keep the "sag power index" columns.
columnas_sin_agua_ni_potencia=[c for c in df_raw.columns if (not re.match(".*water",c)) and (not re.match(".*power",c))]
columnas_sag_power_index=[c for c in df_raw.columns if re.match(".*sag power index",c)]
columnas_modelo=columnas_sag_power_index+columnas_sin_agua_ni_potencia

for desfase in range(16):
    print(desfase)  # progress marker: iteration started

    # Shift both targets `desfase` rows upward; the trailing rows left without
    # a target become NaN and are removed by dropna (which, as before, looks at
    # every column of the file before the column filter is applied).
    df_features=df_raw.copy()
    df_features["TPH"]=df_features["TPH"].shift(-desfase)
    df_features["loss of TPH"]=df_features["loss of TPH"].shift(-desfase)
    df_features=df_features.dropna()[columnas_modelo]

    # Training / test splits by calendar period
    entrenamiento=pd.concat([df_features.loc[inicio:fin] for inicio,fin in PERIODOS_ENTRENAMIENTO])
    test=pd.concat([df_features.loc[inicio:fin] for inicio,fin in PERIODOS_TEST])

    tags=df_features.columns.to_list()
    tags.remove("loss of TPH")

    # Oversampling: RandomOverSampler duplicates rows of the minority
    # "loss of TPH" class. "TPH" is still a column of the resampled frame, so
    # the regression target is taken from it afterwards.
    ros=RandomOverSampler(random_state=0)
    Xtrain, Ytrain=ros.fit_resample(entrenamiento[tags],entrenamiento["loss of TPH"])

    # Regression target / features for training
    Ytrain=Xtrain["TPH"]
    Xtrain=Xtrain.drop(columns=["TPH"])

    # Regression target / features for test
    Ytest=test["TPH"]
    Xtest=test.drop(columns=["TPH"])

    # Preprocessing: min-max scale the selected tags
    preprocessing_transformer = ColumnTransformer(
        transformers=[
            ('MinMax', MinMaxScaler(), tag_select),
        ])

    pipe = Pipeline(
        [
            ("preprocesamiento", preprocessing_transformer),
            ("clf", xgb.XGBRegressor(seed=1,eval_metric='rmse',gamma= 12.525, learning_rate= 0.112, n_estimators= 100)),
        ]
    )
    pipe.fit(Xtrain, Ytrain)

    Ypred= pipe.predict(Xtest)
    error_abs=(Ytest-Ypred).abs()

    list_r2.append(r2_score(Ytest, Ypred))
    # RMSE via an explicit square root: the `squared=False` flag of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    lis_rmse.append(np.sqrt(mean_squared_error(Ytest, Ypred)))
    list_std.append(error_abs.std())
    list_error_mean.append(error_abs.mean())
    list_desfases.append(desfase)

    # Release the per-iteration frames before the next lag is processed
    del df_features,entrenamiento,test,Xtrain,Xtest,Ytrain,Ytest,error_abs
    print(desfase)  # progress marker: iteration finished

0
0
1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15


In [6]:
# One subplot row per metric, each drawn against the lag values collected in
# `list_desfases`. The order of this list fixes both the row order and the
# subplot titles.
metricas=[
    ("R2", list_r2),
    ("RMSE", lis_rmse),
    ("Mean error", list_error_mean),
    ("STD", list_std),
]
titulos=tuple(nombre for nombre,_ in metricas)

fig = make_subplots(rows=len(metricas), cols=1, subplot_titles=titulos)

# Every trace uses the same line width and shares legend group '1'.
for fila,(nombre,valores) in enumerate(metricas, start=1):
    fig.add_trace(
        go.Scatter(
            x=list_desfases,
            y=valores,
            mode='lines',
            name=nombre,
            line=dict(width=3),
            legendgroup='1',
        ),
        row=fila,
        col=1,
    )

fig.update_layout(height=1000, width=1500, title_text="Metrics", hovermode="x unified")

fig.show()