# Sesión 06 - Manejo de Modelos de Clasificación para entornos operativos

*Objetivo*

- Aplicar algoritmos de clasificación para identificar eventos operacionales críticos en la industria petrolera.
- Evaluar el rendimiento de diferentes clasificadores.
- Interpretar los resultados y la matriz de confusión.
- Analizar la eficiencia en pozos productores.

In [102]:
!python --version


Python 3.12.1


## 0. Carga de Librerías

In [103]:
# Instalación de dependencias del nuevo codespace
!pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [104]:
# Importación de Librerías
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_squared_error


## 1. Carga de Datos

In [105]:
# Carga desde la carpeta data/raw/

df = pd.read_csv("/workspaces/ml-bootcamp-labs/data/raw/Operational_events.csv")

## 2. EDA (Medidas de Tendencia Central, Análisis de Nulos)

In [106]:
# Métodos info(), describe()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Well_ID               475 non-null    float64
 1   Date                  475 non-null    object 
 2   Time                  475 non-null    object 
 3   Pressure              475 non-null    float64
 4   Temperature           475 non-null    float64
 5   Flow_Rate             475 non-null    float64
 6   Pump_Speed            475 non-null    float64
 7   Gas_Oil_Ratio         475 non-null    float64
 8   Water_Cut             475 non-null    float64
 9   Vibration             475 non-null    float64
 10  Event_Type            500 non-null    object 
 11  Maintenance_Required  475 non-null    float64
 12  Downtime              475 non-null    float64
dtypes: float64(10), object(3)
memory usage: 50.9+ KB


In [107]:
df.describe()


Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime
count,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0
mean,10.362105,2960.284279,202.865593,490.951223,1474.688338,826.067573,0.517296,4.767155,0.193684,9.716888
std,5.803338,1169.230972,56.146715,285.666948,855.630251,371.162601,0.293007,2.82583,0.395601,25.604104
min,1.0,1001.771126,100.270725,0.011635,3.08844,204.183743,0.001992,0.031409,0.0,0.0
25%,5.0,1920.023988,153.73048,253.45397,692.447336,496.80744,0.275245,2.22855,0.0,0.0
50%,10.0,3021.461026,204.310197,486.173654,1537.964259,825.174981,0.519032,4.80906,0.0,0.0
75%,15.0,3966.9374,251.558604,729.514778,2177.116863,1136.238405,0.756875,7.076513,0.0,0.0
max,20.0,4990.997556,299.361836,995.734452,2999.141412,1495.535353,0.997518,9.966969,1.0,152.279829


In [108]:
df.loc[ df.Pressure.isnull()  ]

Unnamed: 0,Well_ID,Date,Time,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
5,8.0,2023-08-01,15:44:17,,217.350233,965.255307,1821.102743,558.798937,0.296274,1.652669,Normal,0.0,0.0
21,6.0,2023-04-09,08:36:23,,210.552993,572.292469,2940.994751,297.950133,0.305697,1.90911,Normal,0.0,0.0
36,4.0,2023-02-22,22:15:56,,286.185821,974.248209,2987.793726,272.632501,0.737036,5.459158,Normal,0.0,0.0
85,13.0,2023-07-20,01:52:08,,245.637751,495.491316,2065.207189,765.27554,0.246402,8.191023,Normal,0.0,0.0
128,2.0,2023-03-23,12:40:22,,263.566843,120.209055,1633.467294,207.486259,0.324586,3.664615,Normal,0.0,0.0
134,17.0,2023-10-25,23:35:19,,261.19851,550.226542,130.237599,1023.096788,0.951403,6.016118,Leak,1.0,42.905084
184,3.0,2023-12-31,02:26:49,,201.410221,,100.150805,672.801062,0.732635,8.203775,Normal,0.0,0.0
194,15.0,,05:52:35,,148.378766,12.404036,1406.303896,591.644839,0.598357,2.972378,Normal,0.0,0.0
201,,2023-12-07,20:48:41,,286.350744,397.015041,261.2783,1002.186471,0.113838,3.452231,Normal,0.0,0.0
215,3.0,2023-09-09,17:39:21,,,201.312108,1185.41323,251.297393,0.475546,5.430246,Normal,0.0,0.0


In [109]:
df.isnull().sum()

Well_ID                 25
Date                    25
Time                    25
Pressure                25
Temperature             25
Flow_Rate               25
Pump_Speed              25
Gas_Oil_Ratio           25
Water_Cut               25
Vibration               25
Event_Type               0
Maintenance_Required    25
Downtime                25
dtype: int64

## 3. Transformaciones (Imputación, Encoding)

In [110]:
df= df.drop(columns=["Event_Type_n"])

KeyError: "['Event_Type_n'] not found in axis"

In [None]:
df

Unnamed: 0,Well_ID,Date,Time,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
0,7.0,2023-12-15,14:42:07,3394.633937,131.203728,155.994520,174.250837,1326.028990,0.601115,7.080726,2,0.0,0.000000
1,2.0,2023-12-10,11:29:37,1003.115063,298.442312,617.481510,1834.959481,,0.023062,5.247747,2,0.0,0.000000
2,16.0,2023-09-28,14:50:43,3056.937754,218.482914,46.450413,1822.634556,421.681361,,9.488855,0,1.0,59.737430
3,9.0,2023-12-12,20:01:19,1964.101864,236.652704,609.996658,2499.584735,425.374050,0.391061,1.822361,2,0.0,0.000000
4,6.0,2023-02-23,09:03:53,1125.253170,268.456955,449.754133,1185.450708,1404.656526,0.727272,3.265408,2,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2.0,2023-11-10,17:22:01,1224.855622,249.504127,850.852757,858.405127,1032.581456,0.314162,9.579711,2,0.0,0.000000
496,9.0,2023-09-16,13:54:10,1870.915599,224.245306,48.078968,1450.723757,219.041265,0.921144,9.760860,2,0.0,0.000000
497,8.0,2023-01-16,16:50:53,4580.306237,214.785827,350.777623,2498.081239,841.428711,0.911633,9.280142,2,0.0,0.000000
498,11.0,2023-04-13,08:41:49,3258.376325,150.464549,915.289940,797.056695,1364.560451,,6.565627,0,1.0,18.799964


In [None]:
# Cambiar los valores nulos o faltantes para lo que es necesario completar

df.isnull().sum()

Well_ID                 25
Date                    25
Time                    25
Pressure                25
Temperature             25
Flow_Rate               25
Pump_Speed              25
Gas_Oil_Ratio           25
Water_Cut               25
Vibration               25
Event_Type               0
Maintenance_Required    25
Downtime                25
dtype: int64

In [None]:
# Los valores deben ser a corde del atributo que se esta midiendo

df.Pressure.describe()


count     475.000000
mean     2960.284279
std      1169.230972
min      1001.771126
25%      1920.023988
50%      3021.461026
75%      3966.937400
max      4990.997556
Name: Pressure, dtype: float64

In [None]:
df.Pressure.to_frame().boxplot()


<Axes: >

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder = LabelEncoder()

df['Event_Type'] = label_encoder.fit_transform(df.Event_Type)

# df[["Event_Type_n","Event_Type"]].value_counts()

In [None]:
# Instruccion para descartar columnas que no sean numericas

df.select_dtypes(exclude=['object']).rename(columns={"Event_Type_n":"Event_Type"})

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
0,7.0,3394.633937,131.203728,155.994520,174.250837,1326.028990,0.601115,7.080726,2,0.0,0.000000
1,2.0,1003.115063,298.442312,617.481510,1834.959481,,0.023062,5.247747,2,0.0,0.000000
2,16.0,3056.937754,218.482914,46.450413,1822.634556,421.681361,,9.488855,0,1.0,59.737430
3,9.0,1964.101864,236.652704,609.996658,2499.584735,425.374050,0.391061,1.822361,2,0.0,0.000000
4,6.0,1125.253170,268.456955,449.754133,1185.450708,1404.656526,0.727272,3.265408,2,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
495,2.0,1224.855622,249.504127,850.852757,858.405127,1032.581456,0.314162,9.579711,2,0.0,0.000000
496,9.0,1870.915599,224.245306,48.078968,1450.723757,219.041265,0.921144,9.760860,2,0.0,0.000000
497,8.0,4580.306237,214.785827,350.777623,2498.081239,841.428711,0.911633,9.280142,2,0.0,0.000000
498,11.0,3258.376325,150.464549,915.289940,797.056695,1364.560451,,6.565627,0,1.0,18.799964


In [None]:
# Y a parte renombra la columna codificada por el nombre original

df_encoded = df.select_dtypes(exclude=['object']).rename(columns={"Event_Type_n":"Event_Type"})

In [None]:
# Carga de libreria de simpleImputer
from sklearn.impute import SimpleImputer


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Well_ID               475 non-null    float64
 1   Date                  475 non-null    object 
 2   Time                  475 non-null    object 
 3   Pressure              475 non-null    float64
 4   Temperature           475 non-null    float64
 5   Flow_Rate             475 non-null    float64
 6   Pump_Speed            475 non-null    float64
 7   Gas_Oil_Ratio         475 non-null    float64
 8   Water_Cut             475 non-null    float64
 9   Vibration             475 non-null    float64
 10  Event_Type            500 non-null    int64  
 11  Maintenance_Required  475 non-null    float64
 12  Downtime              475 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 50.9+ KB


In [None]:
imputer = SimpleImputer(strategy="mean")

df_imputado = pd.DataFrame( imputer.fit_transform(df_encoded), columns = df_encoded.columns)

In [None]:
df_imputado

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
0,7.0,3394.633937,131.203728,155.994520,174.250837,1326.028990,0.601115,7.080726,2.0,0.0,0.000000
1,2.0,1003.115063,298.442312,617.481510,1834.959481,826.067573,0.023062,5.247747,2.0,0.0,0.000000
2,16.0,3056.937754,218.482914,46.450413,1822.634556,421.681361,0.517296,9.488855,0.0,1.0,59.737430
3,9.0,1964.101864,236.652704,609.996658,2499.584735,425.374050,0.391061,1.822361,2.0,0.0,0.000000
4,6.0,1125.253170,268.456955,449.754133,1185.450708,1404.656526,0.727272,3.265408,2.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
495,2.0,1224.855622,249.504127,850.852757,858.405127,1032.581456,0.314162,9.579711,2.0,0.0,0.000000
496,9.0,1870.915599,224.245306,48.078968,1450.723757,219.041265,0.921144,9.760860,2.0,0.0,0.000000
497,8.0,4580.306237,214.785827,350.777623,2498.081239,841.428711,0.911633,9.280142,2.0,0.0,0.000000
498,11.0,3258.376325,150.464549,915.289940,797.056695,1364.560451,0.517296,6.565627,0.0,1.0,18.799964


## 4. Modelamiento

In [None]:
# Carga de librerías para modelos

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
X = df_imputado.drop(columns=["Event_Type"])
y = df_imputado.Event_Type
X_train, X_test, y_train,y_test = train_test_split(X,y, train_size= 0.8, random_state=23)

In [None]:
y_train

61     0.0
13     2.0
183    2.0
38     2.0
479    2.0
      ... 
438    0.0
457    2.0
40     0.0
230    2.0
83     2.0
Name: Event_Type, Length: 400, dtype: float64

In [None]:
model_dtc = DecisionTreeClassifier()
model_dtc.fit(X_train,y_train)

In [None]:
model_knc = KNeighborsClassifier()
model_knc.fit(X_train,y_train)

In [None]:
model_svc = SVC()
model_svc.fit(X_train,y_train)

## 5. Evaluación

In [113]:
model_dtc.predict(X_test)
model_knc.predict(X_test)
model_svc.predict(X_test)

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [117]:
predict_dtc = model_dtc.predict(X_test)

In [121]:
dtc_accuracy = accuracy_score(y_pred= predict_dtc, y_true=y_test)
dtc_precision = precision_score(y_pred=predict_dtc,y_true=y_test,average='weighted')
dtc_recall = recall_score(y_pred=predict_dtc, y_true=y_test,average='weighted')
dtc_f1=f1_score(y_pred=predict_dtc,y_true=y_test,average='weighted')
dtc_cmatrix= confusion_matrix(y_pred=predict_dtc,y_true=y_test)

In [122]:
print(dtc_accuracy)
print(dtc_precision)
print(dtc_recall)
print(dtc_f1)


0.92
0.9229999999999999
0.92
0.9208888888888889


In [123]:
dtc_cmatrix

array([[ 1,  2,  0,  1],
       [ 3,  3,  0,  0],
       [ 0,  0, 87,  0],
       [ 1,  1,  0,  1]])

In [128]:
predict_knc = model_knc.predict(X_test)

In [125]:
knc_accuracy = accuracy_score(y_pred= predict_knc, y_true=y_test)
knc_precision = precision_score(y_pred=predict_knc,y_true=y_test,average='weighted')
knc_recall = recall_score(y_pred=predict_knc, y_true=y_test,average='weighted')
knc_f1=f1_score(y_pred=predict_knc,y_true=y_test,average='weighted')
knc_cmatrix= confusion_matrix(y_pred=predict_knc,y_true=y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [126]:
print(knc_accuracy)
print(knc_precision)
print(knc_recall)
print(knc_f1)

0.84
0.7534020618556702
0.84
0.7943478260869564


In [127]:
knc_cmatrix

array([[ 0,  0,  4,  0],
       [ 0,  0,  6,  0],
       [ 2,  1, 84,  0],
       [ 0,  0,  3,  0]])

In [129]:
predict_svc = model_svc.predict(X_test)

In [130]:
svc_accuracy = accuracy_score(y_pred= predict_svc, y_true=y_test)
svc_precision = precision_score(y_pred=predict_svc,y_true=y_test,average='weighted')
svc_recall = recall_score(y_pred=predict_svc, y_true=y_test,average='weighted')
svc_f1=f1_score(y_pred=predict_svc,y_true=y_test,average='weighted')
svc_cmatrix= confusion_matrix(y_pred=predict_svc,y_true=y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [131]:
print(svc_accuracy)
print(svc_precision)
print(svc_recall)
print(svc_f1)

0.87
0.7569
0.87
0.8095187165775402


In [132]:
svc_cmatrix

array([[ 0,  0,  4,  0],
       [ 0,  0,  6,  0],
       [ 0,  0, 87,  0],
       [ 0,  0,  3,  0]])