# Clusters para mejora de recomendaciones

In [2]:
import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so this also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import timedelta
import seaborn as sns
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
import pacmap

In [5]:
# Load the engineered-features CSV with its parsed 'Timestamp' column as the
# index, drop every row that contains a missing value, and order rows by index.
df_features = (
    pd.read_csv(
        '../../data/processed data/df_feature_engineering_18December2022.csv',
        parse_dates=['Timestamp'],
        index_col='Timestamp',
    )
    .dropna()
    .sort_index()
)
# Displayed as the cell output: (rows, columns) after cleaning.
df_features.shape

(859335, 403)

In [7]:
# Full list of the 32 selected feature columns, kept in their original order
# (the trailing comments give the 1-based position range of each row).
tag_select = [
    'min_water_3', 'min_solid percentage_10', 'HH TPH', 'max_delta LL charge cell_10',  # 1-4
    'power', 'rms_delta LL charge cell_3', 'var_delta HH charge cell_10', 'max_covelin law_10',  # 5-8
    'LL charge cell_(t-2)', 'min_granulometry_5', 'max_bornite law_10', 'min_charge cell_5',  # 9-12
    'chalcocite law_(t-2)', 'max_sag power index_5', 'min_speed_3', 'var_bornite law_3',  # 13-16
    'var_speed_3', 'min_pyrite law_10', 'crusher index_(t-5)', 'var_power_3',  # 17-20
    'var_chalcocite law_3', 'var_bornite law_5', 'var_solid percentage_3', 'var_speed_10',  # 21-24
    'ball work index_(t-1)', 'var_chalcocite law_5', 'var_water_3', 'chalcopyrite law_(t-5)',  # 25-28
    'var_crusher index_10', 'var_chalcopyrite law_3', 'var_granulometry_3', 'var_delta HH charge cell_3',  # 29-32
]

In [89]:
# Raw process variables carried next to the clustering features so they can be
# inspected per cluster afterwards.
# Left out on purpose in the original cell: "recommendation base model carl",
# "recommendation base model acn" and "power". 'power' already appears in
# tag_cluster, so listing it here too would duplicate that column label in the
# `tag_cluster + variables_interes` selection.
variables_interes = [
    'pyrite law',
    'chalcopyrite law',
    'chalcocite law',
    'covelin law',
    'crusher index',
    'sag power index',
    'ball work index',
    'bornite law',
    'charge cell',
    'speed',
    'HH TPH',
    'granulometry',
    'Edad',
    'loss of TPH',
    "solid percentage",
    "HH charge cell",
    "LL charge cell",
    "TPH",
    "water",
]

In [90]:
# Feature columns actually fed to the scaler / PaCMAP / K-Means steps.
# This is tag_select minus seven entries that the original cell had commented
# out: 'HH TPH', 'max_delta LL charge cell_10', 'rms_delta LL charge cell_3',
# 'var_delta HH charge cell_10', 'LL charge cell_(t-2)', 'min_charge cell_5'
# and 'var_delta HH charge cell_3'.
# NOTE(review): presumably these were excluded because they are charge-cell /
# TPH derived — confirm the rationale with the author.
tag_cluster = [
    'min_water_3', 'min_solid percentage_10', 'power',
    'max_covelin law_10', 'min_granulometry_5', 'max_bornite law_10',
    'chalcocite law_(t-2)', 'max_sag power index_5', 'min_speed_3',
    'var_bornite law_3', 'var_speed_3', 'min_pyrite law_10',
    'crusher index_(t-5)', 'var_power_3', 'var_chalcocite law_3',
    'var_bornite law_5', 'var_solid percentage_3', 'var_speed_10',
    'ball work index_(t-1)', 'var_chalcocite law_5', 'var_water_3',
    'chalcopyrite law_(t-5)', 'var_crusher index_10', 'var_chalcopyrite law_3',
    'var_granulometry_3',
]

# Clusters: reducción de dimensionalidad (PaCMAP) y K-Means

In [147]:
# Work on a 5000-row random subset of the clustering features plus the
# variables of interest. The draw is seeded (same seed the PaCMAP and K-Means
# steps use) so the embedding, elbow curve and cluster labels are reproducible
# on a fresh kernel; without a seed every run clustered a different sample.
# Column selection with a list already returns a new frame, so no explicit
# .copy() is needed before dropna()/sample().
df_sample = (
    df_features[tag_cluster + variables_interes]
    .dropna()
    .sample(n=5000, random_state=0)
)

In [148]:
# Scaler applied to the clustering features only.
MinMax = MinMaxScaler()

# Only the tag_cluster columns are passed to the scaler; with the
# ColumnTransformer defaults the remaining columns are not forwarded.
preprocessing_transformer = ColumnTransformer(
    transformers=[('MinMax', MinMax, tag_cluster)]
)

# Two-step pipeline: scale the features, then project them to 2-D with PaCMAP.
pipeline = Pipeline(
    [
        ('Preprocessing', preprocessing_transformer),
        ('pacmap', pacmap.PaCMAP(n_components=2, n_neighbors=10, random_state=0)),
    ]
)

# Fit the pipeline on the sample and collect the 2-D coordinates in a frame.
embedding = pipeline.fit_transform(df_sample)
features_pacmap = pd.DataFrame(embedding, columns=['x_pacmap', 'y_pacmap'])

# Attach the original (unscaled) columns positionally: .values drops the
# Timestamp index, so row i of df_sample lands on row i of the embedding.
features_pacmap[tag_cluster + variables_interes] = df_sample.values

In [149]:
# Scatter plot of the 2-D PaCMAP embedding; hovering a point shows the
# values of every clustering feature for that row.
fig = px.scatter(
    features_pacmap,
    x='x_pacmap',
    y='y_pacmap',
    hover_data=tag_cluster,
)

layout_options = dict(
    title="Visualización de baja dimensionalidad",
    xaxis_title='x_pacmap',
    yaxis_title='y_pacmap',
    height=500,
    width=1200,
)
fig.update_layout(**layout_options)

fig.show()

In [150]:
# Elbow method: fit a scaler + K-Means pipeline for each k in 2..9 and record
# the inertia of the fitted K-Means step.
inertia_rows = []
for n_clusters in range(2, 10):
    elbow_pipeline = Pipeline(
        [
            ('Preprocessing', preprocessing_transformer),
            ('k-means', KMeans(n_clusters=n_clusters, random_state=0)),
        ]
    )
    elbow_pipeline.fit(df_sample)
    # Step 1 of the pipeline is the fitted KMeans estimator.
    inertia_rows.append([n_clusters, elbow_pipeline[1].inertia_])

# Number of clusters and inertia, one row per fitted model.
inertias = pd.DataFrame(inertia_rows, columns=["n° clusters", "inertia"])

# Elbow curve.
fig = px.line(
    inertias,
    x="n° clusters",
    y="inertia",
    title="Método del Codo con K-Means",
)
fig.update_layout(height=500, width=1200)
fig.show()

In [151]:
# Scaler + K-Means pipeline. The cluster count lives in one named constant so
# the comment and the code cannot drift apart (the old comment said 6 while
# the code fitted 7).
N_CLUSTERS = 7
pipeline1 = Pipeline(
    [
        ('Preprocessing', preprocessing_transformer),
        ('k-means', KMeans(n_clusters=N_CLUSTERS, random_state=0)),
    ]
)

# fit_predict returns one label per df_sample row, in df_sample row order.
cluster_ids = pipeline1.fit_predict(df_sample)

# Assign the labels through an index-aligned Series rather than a bare array.
# features_pacmap is sorted in place below, so on a re-run of this cell its
# rows are no longer in df_sample order; a positional array assignment would
# then attach labels to the wrong rows. Aligning on the original 0..n-1 index
# labels keeps each label on its own row no matter how often the cell runs.
features_pacmap["Labels"] = pd.Series(
    cluster_ids, index=pd.RangeIndex(len(cluster_ids))
).astype(str)

# Sort by label so the plot legend lists the clusters in order.
features_pacmap.sort_values(by=["Labels"], inplace=True)

# Scatter of the 2-D embedding coloured by cluster label.
fig = px.scatter(
    features_pacmap,
    x='x_pacmap',
    y='y_pacmap',
    hover_data=tag_cluster,
    color="Labels",
)

fig.update_layout(
    title="Visualización de baja dimensionalidad",
    xaxis_title='x_pacmap',
    yaxis_title='y_pacmap',
    height=500,
    width=1200,
)

fig.show()

In [155]:
# Density-normalised histogram of 'HH charge cell', one colour per cluster.
# Needs the "Labels" column that the K-Means cell adds to features_pacmap.
fig = px.histogram(
    features_pacmap,
    x="HH charge cell",
    color="Labels",
    histnorm='probability density',
)

histogram_layout = dict(
    title="Distribución de clusters",
    height=500,
    width=1200,
)
fig.update_layout(**histogram_layout)
fig.show()

In [156]:
# Per-cluster summary of 'HH charge cell'.
# The columns used to be labelled 'min' / 'max' / 'mean' although they hold
# the 5th percentile, the 90th percentile and the median; the labels now say
# what is actually computed. The result is only displayed, never assigned.
# NOTE(review): the lower bound is the 5th percentile but the upper bound is
# the 90th — confirm this asymmetry is intended (0.95 may have been meant).
features_pacmap[["HH charge cell", "Labels"]].groupby("Labels").agg(
    [
        ('p05', lambda x: np.quantile(x, 0.05)),
        ('p90', lambda x: np.quantile(x, 0.9)),
        ('median', lambda x: np.median(x)),
    ]
)

Unnamed: 0_level_0,HH charge cell,HH charge cell,HH charge cell
Unnamed: 0_level_1,min,max,mean
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,720.0,930.0,830.0
1,710.0,910.0,830.0
2,730.0,930.0,830.0
3,710.0,920.0,840.0
4,730.0,900.0,840.0
5,710.0,910.0,810.0
6,705.0,905.0,810.0
