#  **Preparación del Entorno
# Importar las librerías necesarias (Pandas, NumPy, Scikit-learn, Matplotlib/Seaborn, SHAP).
# Cargar un dataset y un modelo entrenado (se proporcionará un ejemplo, o se usará el resultado de sesiones anteriores).

In [32]:
!python --version

Python 3.12.1


In [33]:
# Instalación de dependencias del nuevo codespace
!pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: '../requirements.txt'[0m[31m
[0m

## 0. Carga de Librerías

In [1]:
# Importación de Librerías
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Modelamiento
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix # Modelos de Clasificación
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_squared_error # Modelos de Regresión

## 1. Carga de Datos

In [2]:
# Carga desde la carpeta data/raw/

# Ruta Absoluta en Linux o Mac
df = pd.read_csv("/workspaces/ml-bootcamp-labs/data/raw/Operational_events.csv")

In [3]:
df

Unnamed: 0,Well_ID,Date,Time,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
0,7.0,2023-12-15,14:42:07,3394.633937,131.203728,155.994520,174.250837,1326.028990,0.601115,7.080726,Normal,0.0,0.000000
1,2.0,2023-12-10,11:29:37,1003.115063,298.442312,617.481510,1834.959481,,0.023062,5.247747,Normal,0.0,0.000000
2,16.0,2023-09-28,14:50:43,3056.937754,218.482914,46.450413,1822.634556,421.681361,,9.488855,Blockage,1.0,59.737430
3,9.0,2023-12-12,20:01:19,1964.101864,236.652704,609.996658,2499.584735,425.374050,0.391061,1.822361,Normal,0.0,0.000000
4,6.0,2023-02-23,09:03:53,1125.253170,268.456955,449.754133,1185.450708,1404.656526,0.727272,3.265408,Normal,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2.0,2023-11-10,17:22:01,1224.855622,249.504127,850.852757,858.405127,1032.581456,0.314162,9.579711,Normal,0.0,0.000000
496,9.0,2023-09-16,13:54:10,1870.915599,224.245306,48.078968,1450.723757,219.041265,0.921144,9.760860,Normal,0.0,0.000000
497,8.0,2023-01-16,16:50:53,4580.306237,214.785827,350.777623,2498.081239,841.428711,0.911633,9.280142,Normal,0.0,0.000000
498,11.0,2023-04-13,08:41:49,3258.376325,150.464549,915.289940,797.056695,1364.560451,,6.565627,Blockage,1.0,18.799964


## 2. EDA (Medidas de Tendencia Central, Análisis de Nulos)

In [4]:
# Métodos info(), describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Well_ID               475 non-null    float64
 1   Date                  475 non-null    object 
 2   Time                  475 non-null    object 
 3   Pressure              475 non-null    float64
 4   Temperature           475 non-null    float64
 5   Flow_Rate             475 non-null    float64
 6   Pump_Speed            475 non-null    float64
 7   Gas_Oil_Ratio         475 non-null    float64
 8   Water_Cut             475 non-null    float64
 9   Vibration             475 non-null    float64
 10  Event_Type            500 non-null    object 
 11  Maintenance_Required  475 non-null    float64
 12  Downtime              475 non-null    float64
dtypes: float64(10), object(3)
memory usage: 50.9+ KB


In [5]:
df.describe()

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime
count,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0,475.0
mean,10.362105,2960.284279,202.865593,490.951223,1474.688338,826.067573,0.517296,4.767155,0.193684,9.716888
std,5.803338,1169.230972,56.146715,285.666948,855.630251,371.162601,0.293007,2.82583,0.395601,25.604104
min,1.0,1001.771126,100.270725,0.011635,3.08844,204.183743,0.001992,0.031409,0.0,0.0
25%,5.0,1920.023988,153.73048,253.45397,692.447336,496.80744,0.275245,2.22855,0.0,0.0
50%,10.0,3021.461026,204.310197,486.173654,1537.964259,825.174981,0.519032,4.80906,0.0,0.0
75%,15.0,3966.9374,251.558604,729.514778,2177.116863,1136.238405,0.756875,7.076513,0.0,0.0
max,20.0,4990.997556,299.361836,995.734452,2999.141412,1495.535353,0.997518,9.966969,1.0,152.279829


In [6]:
df.loc[ df.Date.isnull()  ]

Unnamed: 0,Well_ID,Date,Time,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Event_Type,Maintenance_Required,Downtime
17,9.0,,02:18:47,4386.644569,,404.508127,2663.310296,1306.206983,0.935635,7.853407,Normal,0.0,0.0
69,4.0,,23:36:07,3605.468844,254.249334,374.43537,206.764618,300.517385,0.104247,8.404396,Blockage,1.0,15.85989
70,5.0,,13:35:53,4349.963156,166.429283,311.444264,682.186863,990.262712,0.379306,7.442497,Normal,0.0,0.0
75,1.0,,22:38:02,2493.237265,154.148935,643.999543,1226.202513,233.002262,0.156153,7.159722,Normal,0.0,0.0
98,18.0,,15:12:27,1304.523438,270.241383,495.146527,1441.759732,970.13012,0.824681,3.478092,Normal,0.0,0.0
127,9.0,,08:24:47,4009.71105,118.096339,288.685596,327.328895,1045.665142,0.765191,9.256748,Normal,0.0,0.0
149,11.0,,01:59:22,1608.099413,112.269925,780.761586,1379.401271,275.612934,0.994866,0.577806,Normal,0.0,0.0
158,8.0,,09:37:46,1328.68667,,295.444777,2307.669444,1012.06264,0.38194,2.056873,Normal,0.0,0.0
163,15.0,,17:30:14,3023.410772,268.140606,,1626.711616,967.451998,0.508361,2.975485,Normal,0.0,0.0
194,15.0,,05:52:35,,148.378766,12.404036,1406.303896,591.644839,0.598357,2.972378,Normal,0.0,0.0


## 3. Transformaciones (Encoding, Imputación)

In [7]:
df.isnull().sum()

Well_ID                 25
Date                    25
Time                    25
Pressure                25
Temperature             25
Flow_Rate               25
Pump_Speed              25
Gas_Oil_Ratio           25
Water_Cut               25
Vibration               25
Event_Type               0
Maintenance_Required    25
Downtime                25
dtype: int64

In [8]:
# Método de Imputación "Simple"
# Cuando tenemos pocas variables podemos completar el valor faltante con una medida de tendencia central, como la media
df.Temperature.fillna(df.Temperature.mean())
df.Pressure.fillna(df.Pressure.mean())

0      3394.633937
1      1003.115063
2      3056.937754
3      1964.101864
4      1125.253170
          ...     
495    1224.855622
496    1870.915599
497    4580.306237
498    3258.376325
499    1760.911723
Name: Pressure, Length: 500, dtype: float64

In [9]:
from sklearn.preprocessing import LabelEncoder


In [10]:
label_encoder = LabelEncoder()

# Crear una columna de tipo numérica que va a asignar un número con base a Event_Type
df['Event_Type_n'] = label_encoder.fit_transform(df.Event_Type)

df[["Event_Type_n","Event_Type"]].value_counts()

Event_Type_n  Event_Type  
2             Normal          403
0             Blockage         41
1             Leak             31
3             Pump Failure     25
Name: count, dtype: int64

In [11]:
# Diccionario Manual para hacer la codificacion
manual_encode = {
    "Normal" : 0,
    "Blockage" : 1,
    "Leak" : 2,
    "Pump Failure": 3
}

df.Event_Type.map(manual_encode)


0      0
1      0
2      1
3      0
4      0
      ..
495    0
496    0
497    0
498    1
499    0
Name: Event_Type, Length: 500, dtype: int64

In [12]:
# Instrucción para descartar columnas que no sean numéricas
# Y aparte renombra la columna codificada por el nombre original
df_encoded = df.select_dtypes(exclude=['object']).rename(columns={"Event_Type_n":"Event_Type"})

In [13]:
# Carga de librería de SimpleImputer
from sklearn.impute import SimpleImputer

In [14]:
imputer = SimpleImputer(strategy="mean")

df_imputado = pd.DataFrame( imputer.fit_transform(df_encoded), columns= df_encoded.columns  )


In [15]:
"""
Event_Type_n  Event_Type  
2             Normal          403
0             Blockage         41
1             Leak             31
3             Pump Failure     25
Name: count, dtype: int64
"""

'\nEvent_Type_n  Event_Type  \n2             Normal          403\n0             Blockage         41\n1             Leak             31\n3             Pump Failure     25\nName: count, dtype: int64\n'

In [16]:
df_imputado.Event_Type.value_counts()

Event_Type
2.0    403
0.0     41
1.0     31
3.0     25
Name: count, dtype: int64

In [17]:
# Regla de balanceo de clases
(41+31+25)/3 * 1.1

35.56666666666667

In [18]:
majority_class_df_sample = df_imputado.loc[ df_imputado.Event_Type == 2 ].sample(35)

In [19]:
df_imputado.loc[ df_imputado.Event_Type == 2 ].describe()

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime,Event_Type
count,403.0,403.0,403.0,403.0,403.0,403.0,403.0,403.0,403.0,403.0,403.0
mean,10.42017,2942.621347,202.55766,488.745052,1501.895679,831.211214,0.523728,4.662769,0.009612,0.458116,2.0
std,5.704338,1157.05027,54.862889,277.016238,827.785615,356.558956,0.283777,2.784483,0.042116,2.062073,0.0
min,1.0,1001.771126,101.810832,0.011635,3.08844,204.183743,0.001992,0.031409,0.0,0.0,2.0
25%,6.0,1920.023988,158.692183,260.185968,780.150099,551.626912,0.29955,2.131812,0.0,0.0,2.0
50%,10.362105,2960.284279,202.865593,490.951223,1498.645006,826.067573,0.517296,4.767155,0.0,0.0,2.0
75%,15.0,3955.884754,248.920758,699.242711,2172.211041,1110.742217,0.752962,6.796059,0.0,0.0,2.0
max,20.0,4990.997556,299.361836,995.734452,2999.141412,1495.535353,0.997518,9.963343,0.193684,9.716888,2.0


In [20]:
majority_class_df_sample.describe()

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime,Event_Type
count,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0
mean,10.649263,2972.646902,200.513855,525.72711,1782.59157,793.415426,0.488453,3.818023,0.011068,0.277625,2.0
std,5.470531,1043.716384,48.648105,287.898759,672.379057,337.563355,0.256565,2.552185,0.045613,1.642454,0.0
min,1.0,1240.56937,102.102645,3.837146,251.174814,204.183743,0.058455,0.107153,0.0,0.0,2.0
25%,6.0,2106.788289,178.749617,287.668715,1381.937822,505.230863,0.297465,1.907691,0.0,0.0,2.0
50%,11.0,3056.908045,202.865593,490.951223,1897.6216,736.40298,0.517296,3.687072,0.0,0.0,2.0
75%,15.0,3776.265364,223.006868,764.929936,2305.649267,1037.05337,0.611502,5.334686,0.0,0.0,2.0
max,20.0,4741.188394,288.411644,995.734452,2971.515426,1396.898821,0.994082,9.508636,0.193684,9.716888,2.0


In [21]:
majority_class_df_sample

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime,Event_Type
306,9.0,3764.847238,220.607911,618.991565,2822.319899,1208.684993,0.127937,3.475071,0.0,0.0,2.0
314,10.362105,1919.306903,210.532454,372.173255,1985.001779,383.590606,0.570877,1.852704,0.0,0.0,2.0
345,11.0,2218.409423,152.903161,279.892652,833.157262,246.709237,0.517296,2.136981,0.193684,0.0,2.0
262,5.0,4741.188394,270.136052,981.905698,1474.688338,521.763095,0.162815,1.869144,0.0,0.0,2.0
429,17.0,1823.177508,248.313222,546.677265,2653.436183,920.191232,0.534559,1.95458,0.0,0.0,2.0
281,10.362105,2380.919815,182.840185,732.413865,2644.987447,357.944008,0.634639,5.821438,0.0,0.0,2.0
158,8.0,1328.68667,202.865593,295.444777,2307.669444,1012.06264,0.38194,2.056873,0.0,0.0,2.0
430,5.0,4560.946487,122.981269,3.837146,1506.024593,958.06617,0.578965,0.545119,0.0,0.0,2.0
51,16.0,2382.669133,279.357682,473.96164,2002.673216,424.015833,0.192289,4.767155,0.0,0.0,2.0
140,12.0,3371.247967,236.026386,230.829189,1370.575935,447.422322,0.517296,3.784764,0.0,0.0,2.0


In [22]:
minority_class_df_sample = df_imputado.loc[ df_imputado.Event_Type != 2]

In [23]:
minority_class_df_sample

Unnamed: 0,Well_ID,Pressure,Temperature,Flow_Rate,Pump_Speed,Gas_Oil_Ratio,Water_Cut,Vibration,Maintenance_Required,Downtime,Event_Type
2,16.0,3056.937754,218.482914,46.450413,1822.634556,421.681361,0.517296,9.488855,1.0,59.737430,0.0
8,15.0,1943.939679,151.213665,40.433590,2131.988669,344.158067,0.439337,2.017192,1.0,92.453312,3.0
9,4.0,2641.531692,251.110228,228.798165,230.939729,576.676889,0.161221,9.296977,1.0,31.869765,1.0
19,16.0,3170.160922,202.865593,636.332618,751.385456,966.832102,0.978893,4.867422,1.0,35.801239,0.0
30,10.0,4425.959365,231.738726,162.934427,211.706242,1035.145062,0.026511,5.857756,1.0,44.830347,0.0
...,...,...,...,...,...,...,...,...,...,...,...
477,6.0,4200.550331,148.331784,490.951223,950.712413,546.832914,0.505926,7.777279,1.0,145.660428,3.0
481,13.0,1727.054391,200.826568,322.809793,774.322930,803.181043,0.133284,9.277457,1.0,125.897364,3.0
485,7.0,2546.785800,264.228485,969.490899,1361.778409,859.099506,0.572172,9.323481,1.0,48.566562,3.0
489,10.0,3861.289242,153.628646,315.668481,375.753693,1052.162963,0.039797,5.976823,1.0,152.279829,3.0


In [24]:
df_balanced = pd.concat([ majority_class_df_sample, minority_class_df_sample])

## 4. Modelamiento

In [25]:
# Carga de librerías para modelos
from sklearn.tree import DecisionTreeClassifier

In [26]:
X = df_balanced.drop(columns=["Event_Type"])
y = df_balanced.Event_Type
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size= 0.8, random_state=23)

In [27]:
model_dtc = DecisionTreeClassifier()
model_dtc.fit(X_train,y_train)

## 5. Evaluación

In [28]:
predict_dtc = model_dtc.predict(X_test)

In [29]:
# Evaluación DTC
dtc_accuracy = accuracy_score(y_pred= predict_dtc, y_true= y_test) 
dtc_precision = precision_score(y_pred= predict_dtc, y_true= y_test,average='weighted')
dtc_recall = recall_score(y_pred= predict_dtc, y_true= y_test,average='weighted')
dtc_f1 = f1_score(y_pred= predict_dtc, y_true= y_test,average='weighted')
dtc_cmatrix = confusion_matrix(y_pred= predict_dtc, y_true= y_test)

In [30]:
print(dtc_accuracy)
print(dtc_precision)
print(dtc_recall)
print(dtc_f1)

0.6296296296296297
0.7376543209876544
0.6296296296296297
0.6197530864197531


In [31]:
dtc_cmatrix

array([[ 3,  3,  1,  4],
       [ 1,  2,  0,  0],
       [ 0,  1, 10,  0],
       [ 0,  0,  0,  2]])

## 2.  **Cálculo de la Importancia de las Características (30 minutos)**
# *   Utilizar métodos inherentes al modelo (ej: `feature_importances_` en Árboles de Decisión).
# *   Implementar permutation importance (Scikit-learn).
# *   Introducción a SHAP (SHapley Additive exPlanations) para una interpretación más completa (opcional, si el tiempo lo permite).