In [66]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [67]:
# Read the cleaned CSV; its "Unnamed: 0" column is used as the DataFrame index.
file_path = "yr2010_present_clean.csv"
oilspill_df = pd.read_csv(filepath_or_buffer=file_path, index_col="Unnamed: 0")
oilspill_df

Unnamed: 0,REPORT_NUMBER,NAME,DATE_OF_INCIDENT,LOCATION_LATITUDE,LOCATION_LONGITUDE,COMMODITY_RELEASED_TYPE,TOTAL_BBLS_RELEASED,FATALITY_IND,INJURY_IND,SHUTDOWN_DUE_ACCIDENT_IND,...,RELEASE_TYPE,TOTAL_EST_COST,ACCIDENT_PRESSURE,SCADA_DETECTION_IND,CPM_DETECTION_IND,WATER_CONTAM_IND,CAUSE,CAUSE_DETAILS,YEAR_OF_INCIDENT,AGE_OF_FACILITY
0,20210333,MARATHON PIPE LINE LLC,2021-11-11,40.676195,-81.347826,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,0.50,NO,NO,YES,...,LEAK,2253.0,PRESSURE DID NOT EXCEED MOP,NO,NO,NO,EQUIPMENT FAILURE,"FAILURE OF EQUIPMENT BODY (EXCEPT PUMP), TANK ...",2021,4.0
1,20210335,"CASPER CRUDE TO RAIL, LLC",2021-11-10,42.944716,-106.442801,CRUDE OIL,0.36,NO,NO,NO,...,LEAK,11260.0,PRESSURE DID NOT EXCEED MOP,NO,,NO,EQUIPMENT FAILURE,PUMP OR PUMP-RELATED EQUIPMENT,2021,2.0
2,20210331,WEST TEXAS GULF PIPELINE CO,2021-11-01,31.777937,-96.400398,CRUDE OIL,2.00,NO,NO,NO,...,LEAK,40168.0,PRESSURE DID NOT EXCEED MOP,NO,NO,NO,CORROSION FAILURE,INTERNAL CORROSION,2021,3.0
3,20210334,"FLINT HILLS RESOURCES, LC",2021-11-01,29.834136,-94.892281,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,2.20,NO,NO,YES,...,LEAK,505.0,PRESSURE DID NOT EXCEED MOP,NO,NO,NO,EQUIPMENT FAILURE,OTHER EQUIPMENT FAILURE,2021,3.0
4,20210330,CR PERMIAN PROCESSING LLC,2021-10-27,31.510493,-103.503215,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,10244.18,NO,NO,YES,...,OTHER,1035359.0,PRESSURE DID NOT EXCEED MOP,,,YES,CORROSION FAILURE,EXTERNAL CORROSION,2021,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,20100030,"PLAINS PIPELINE, L.P.",2010-01-07,33.582660,-96.648810,CRUDE OIL,700.00,NO,NO,NO,...,MECHANICAL PUNCTURE,29650.0,PRESSURE DID NOT EXCEED MOP,NO,NO,NO,EXCAVATION DAMAGE,EXCAVATION DAMAGE BY THIRD PARTY,2010,46.0
4636,20100260,"ENBRIDGE ENERGY, LIMITED PARTNERSHIP",2010-01-06,46.689300,-92.061200,CRUDE OIL,0.48,NO,NO,,...,LEAK,11540.0,,,,NO,NATURAL FORCE DAMAGE,TEMPERATURE,2010,
4637,20100038,"PETROLOGISTICS OLEFINS, LLC",2010-01-05,30.182400,-93.352400,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,2.00,NO,NO,,...,LEAK,200.0,PRESSURE DID NOT EXCEED MOP,NO,NO,NO,EQUIPMENT FAILURE,DEFECTIVE OR LOOSE TUBING OR FITTING,2010,
4638,20100254,PORTLAND PIPELINE CORP,2010-01-04,43.940280,-70.493360,CRUDE OIL,0.12,NO,NO,,...,OTHER,4008.0,,,,NO,EQUIPMENT FAILURE,PUMP OR PUMP-RELATED EQUIPMENT,2010,


In [68]:
# Count how many rows fall under each CAUSE value.
oilspill_df.loc[:, 'CAUSE'].value_counts()

EQUIPMENT FAILURE                   2124
CORROSION FAILURE                    944
INCORRECT OPERATION                  674
MATERIAL FAILURE OF PIPE OR WELD     310
NATURAL FORCE DAMAGE                 217
EXCAVATION DAMAGE                    165
OTHER INCIDENT CAUSE                 114
OTHER OUTSIDE FORCE DAMAGE            91
Name: CAUSE, dtype: int64

In [69]:
# CAUSE values of the rows whose AGE_OF_FACILITY is exactly 0, and how often
# each one occurs. Note: despite its name, `age` holds CAUSE values, not ages.
is_age_zero = oilspill_df['AGE_OF_FACILITY'] == 0
age = oilspill_df.loc[is_age_zero, 'CAUSE']
age.value_counts()

EQUIPMENT FAILURE                   198
INCORRECT OPERATION                 122
NATURAL FORCE DAMAGE                 11
OTHER INCIDENT CAUSE                  8
OTHER OUTSIDE FORCE DAMAGE            6
MATERIAL FAILURE OF PIPE OR WELD      5
CORROSION FAILURE                     3
EXCAVATION DAMAGE                     2
Name: CAUSE, dtype: int64

In [70]:
# Number of missing values in each column.
oilspill_df.isna().sum()

REPORT_NUMBER                   0
NAME                            0
DATE_OF_INCIDENT                0
LOCATION_LATITUDE               0
LOCATION_LONGITUDE              0
COMMODITY_RELEASED_TYPE         0
TOTAL_BBLS_RELEASED             0
FATALITY_IND                    0
INJURY_IND                      0
SHUTDOWN_DUE_ACCIDENT_IND     209
IGNITE_IND                      0
EXPLODE_IND                     0
ON_OFF_SHORE                    0
INCIDENT_AREA_TYPE             34
DEPTH_OF_COVER               3389
ITEM_INVOLVED                   2
PIPE_DIAMETER                3530
INSTALLATION_YEAR            1579
RELEASE_TYPE                    1
TOTAL_EST_COST                  1
ACCIDENT_PRESSURE             738
SCADA_DETECTION_IND          1269
CPM_DETECTION_IND            2740
WATER_CONTAM_IND                2
CAUSE                           1
CAUSE_DETAILS                   0
YEAR_OF_INCIDENT                0
AGE_OF_FACILITY              1579
dtype: int64

In [71]:
# Keep only the three columns that will be used for the model.
oilspill_df = oilspill_df.loc[:, ['CAUSE', 'AGE_OF_FACILITY', 'ACCIDENT_PRESSURE']]
oilspill_df.head()

Unnamed: 0,CAUSE,AGE_OF_FACILITY,ACCIDENT_PRESSURE
0,EQUIPMENT FAILURE,4.0,PRESSURE DID NOT EXCEED MOP
1,EQUIPMENT FAILURE,2.0,PRESSURE DID NOT EXCEED MOP
2,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP
3,EQUIPMENT FAILURE,3.0,PRESSURE DID NOT EXCEED MOP
4,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP


In [72]:
# Number of missing values in each of the remaining columns.
oilspill_df.isna().sum()

CAUSE                   1
AGE_OF_FACILITY      1579
ACCIDENT_PRESSURE     738
dtype: int64

In [73]:
# Discard every row that has a missing value in any column.
oilspill_df = oilspill_df.dropna(axis=0, how='any')
oilspill_df

Unnamed: 0,CAUSE,AGE_OF_FACILITY,ACCIDENT_PRESSURE
0,EQUIPMENT FAILURE,4.0,PRESSURE DID NOT EXCEED MOP
1,EQUIPMENT FAILURE,2.0,PRESSURE DID NOT EXCEED MOP
2,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP
3,EQUIPMENT FAILURE,3.0,PRESSURE DID NOT EXCEED MOP
4,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP
...,...,...,...
4628,MATERIAL FAILURE OF PIPE OR WELD,41.0,PRESSURE DID NOT EXCEED MOP
4633,MATERIAL FAILURE OF PIPE OR WELD,29.0,PRESSURE DID NOT EXCEED MOP
4634,MATERIAL FAILURE OF PIPE OR WELD,54.0,PRESSURE DID NOT EXCEED MOP
4635,EXCAVATION DAMAGE,46.0,PRESSURE DID NOT EXCEED MOP


In [74]:
# One-hot encode the text features with get_dummies(); the result is the
# feature matrix X used by the following steps.
X = pd.get_dummies(data=oilspill_df)
X.head(n=10)

Unnamed: 0,AGE_OF_FACILITY,CAUSE_CORROSION FAILURE,CAUSE_EQUIPMENT FAILURE,CAUSE_EXCAVATION DAMAGE,CAUSE_INCORRECT OPERATION,CAUSE_MATERIAL FAILURE OF PIPE OR WELD,CAUSE_NATURAL FORCE DAMAGE,CAUSE_OTHER INCIDENT CAUSE,CAUSE_OTHER OUTSIDE FORCE DAMAGE,ACCIDENT_PRESSURE_PRESSURE DID NOT EXCEED MOP,ACCIDENT_PRESSURE_PRESSURE EXCEEDED 110% OF MOP,"ACCIDENT_PRESSURE_PRESSURE EXCEEDED MOP, BUT DID NOT EXCEED 110% OF MOP"
0,4.0,0,1,0,0,0,0,0,0,1,0,0
1,2.0,0,1,0,0,0,0,0,0,1,0,0
2,3.0,1,0,0,0,0,0,0,0,1,0,0
3,3.0,0,1,0,0,0,0,0,0,1,0,0
4,3.0,1,0,0,0,0,0,0,0,1,0,0
5,2.0,0,1,0,0,0,0,0,0,1,0,0
6,0.0,0,0,0,0,0,0,1,0,1,0,0
7,56.0,0,0,0,0,0,0,1,0,1,0,0
8,9.0,0,1,0,0,0,0,0,0,1,0,0
9,51.0,0,0,0,0,0,0,0,1,1,0,0


In [75]:
# Standardize every feature column with StandardScaler().
# X is rebound here: it goes in as the dummy-encoded DataFrame and comes out
# as the scaled NumPy array.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[-0.85441917, -0.52074889,  1.16077243, ...,  0.15298217,
        -0.13016728, -0.07903095],
       [-0.93315882, -0.52074889,  1.16077243, ...,  0.15298217,
        -0.13016728, -0.07903095],
       [-0.893789  ,  1.92031136, -0.8614953 , ...,  0.15298217,
        -0.13016728, -0.07903095],
       ...,
       [ 1.11407225, -0.52074889, -0.8614953 , ...,  0.15298217,
        -0.13016728, -0.07903095],
       [ 0.79911363, -0.52074889, -0.8614953 , ...,  0.15298217,
        -0.13016728, -0.07903095],
       [-0.97252865, -0.52074889, -0.8614953 , ..., -6.53670953,
        -0.13016728, 12.65327085]])

In [76]:
# Use PCA to reduce the standardized features to three principal components.
# random_state is pinned so the projection is reproducible on every re-run:
# with the default svd_solver="auto", scikit-learn may (depending on its
# version and the data shape) select the randomized SVD solver, which is
# otherwise seeded from the global random state.
pca = PCA(n_components=3, random_state=0)
X_pca = pca.fit_transform(X)
X_pca

array([[-0.23947415, -1.59984114,  0.20611832],
       [-0.23121285, -1.64240873,  0.20828819],
       [-0.3843033 ,  0.77489572,  1.54182888],
       ...,
       [-0.43657717,  1.51338547, -2.22738046],
       [-0.44726001,  1.43385441, -1.40100741],
       [ 9.51022278,  0.37628989,  1.20226649]])

In [77]:
# Wrap the three principal components in a DataFrame that reuses
# oilspill_df's index, so each row still lines up with its source record.
pcs_df = pd.DataFrame(X_pca, index=oilspill_df.index, columns=['PC1', 'PC2', 'PC3'])
pcs_df.head(n=10)

Unnamed: 0,PC1,PC2,PC3
0,-0.239474,-1.599841,0.206118
1,-0.231213,-1.642409,0.208288
2,-0.384303,0.774896,1.541829
3,-0.235343,-1.621125,0.207203
4,-0.384303,0.774896,1.541829
5,-0.231213,-1.642409,0.208288
6,-0.006989,-0.063056,-0.756419
7,-0.238305,1.128836,-0.817175
8,-0.260127,-1.493422,0.200694
9,-0.487856,1.269365,-1.145981


In [78]:
# Elbow curve: fit K-Means for k = 1..10 on the principal components and
# record each fit's inertia to help choose the best value for K.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Plot inertia against k with hvPlot.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [79]:
# Build the K-Means model with k=4 (chosen from the elbow graph) and fit it
# on the principal components; fit() returns the fitted estimator itself.
model = KMeans(n_clusters=4, random_state=5).fit(pcs_df)

# Predict the cluster of every row.
predictions = model.predict(pcs_df)
predictions

array([1, 1, 2, ..., 0, 0, 3], dtype=int32)

In [80]:
# Build a DataFrame holding the oil spill features, the principal components
# and the predicted cluster of every row.
# oilspill_df and pcs_df are placed side by side (axis=1), aligned on their
# index; the "Class" column holds the K-Means predictions.
clustered_df = pd.concat(objs=[oilspill_df, pcs_df], axis=1).assign(Class=predictions)

# Print the shape of clustered_df, then preview the first rows.
print(clustered_df.shape)
clustered_df.head(n=10)

(3061, 7)


Unnamed: 0,CAUSE,AGE_OF_FACILITY,ACCIDENT_PRESSURE,PC1,PC2,PC3,Class
0,EQUIPMENT FAILURE,4.0,PRESSURE DID NOT EXCEED MOP,-0.239474,-1.599841,0.206118,1
1,EQUIPMENT FAILURE,2.0,PRESSURE DID NOT EXCEED MOP,-0.231213,-1.642409,0.208288,1
2,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP,-0.384303,0.774896,1.541829,2
3,EQUIPMENT FAILURE,3.0,PRESSURE DID NOT EXCEED MOP,-0.235343,-1.621125,0.207203,1
4,CORROSION FAILURE,3.0,PRESSURE DID NOT EXCEED MOP,-0.384303,0.774896,1.541829,2
5,EQUIPMENT FAILURE,2.0,PRESSURE DID NOT EXCEED MOP,-0.231213,-1.642409,0.208288,1
6,OTHER INCIDENT CAUSE,0.0,PRESSURE DID NOT EXCEED MOP,-0.006989,-0.063056,-0.756419,0
7,OTHER INCIDENT CAUSE,56.0,PRESSURE DID NOT EXCEED MOP,-0.238305,1.128836,-0.817175,0
8,EQUIPMENT FAILURE,9.0,PRESSURE DID NOT EXCEED MOP,-0.260127,-1.493422,0.200694,1
9,OTHER OUTSIDE FORCE DAMAGE,51.0,PRESSURE DID NOT EXCEED MOP,-0.487856,1.269365,-1.145981,0


In [81]:
# Number of rows assigned to each cluster.
clustered_df.loc[:, 'Class'].value_counts()

1    1559
0     781
2     651
3      70
Name: Class, dtype: int64

In [82]:
# Pull out the class 1 rows, drop the numeric columns, and count each
# distinct combination of the remaining columns.
in_class1 = clustered_df['Class'] == 1
class1 = clustered_df.loc[in_class1].drop(columns=['AGE_OF_FACILITY', 'PC1', 'PC2', 'PC3'])
class1.value_counts()

CAUSE                ACCIDENT_PRESSURE            Class
EQUIPMENT FAILURE    PRESSURE DID NOT EXCEED MOP  1        1290
INCORRECT OPERATION  PRESSURE DID NOT EXCEED MOP  1         269
dtype: int64

In [83]:
# Pull out the class 0 rows, drop the numeric columns, and count each
# distinct combination of the remaining columns.
in_class0 = clustered_df['Class'] == 0
class0 = clustered_df.loc[in_class0].drop(columns=['AGE_OF_FACILITY', 'PC1', 'PC2', 'PC3'])
class0.value_counts()

CAUSE                             ACCIDENT_PRESSURE            Class
MATERIAL FAILURE OF PIPE OR WELD  PRESSURE DID NOT EXCEED MOP  0        262
NATURAL FORCE DAMAGE              PRESSURE DID NOT EXCEED MOP  0        147
EXCAVATION DAMAGE                 PRESSURE DID NOT EXCEED MOP  0        134
INCORRECT OPERATION               PRESSURE DID NOT EXCEED MOP  0        107
OTHER INCIDENT CAUSE              PRESSURE DID NOT EXCEED MOP  0         66
OTHER OUTSIDE FORCE DAMAGE        PRESSURE DID NOT EXCEED MOP  0         65
dtype: int64

In [84]:
# Pull out the class 2 rows, drop the numeric columns, and count each
# distinct combination of the remaining columns.
in_class2 = clustered_df['Class'] == 2
class2 = clustered_df.loc[in_class2].drop(columns=['AGE_OF_FACILITY', 'PC1', 'PC2', 'PC3'])
class2.value_counts()

CAUSE              ACCIDENT_PRESSURE            Class
CORROSION FAILURE  PRESSURE DID NOT EXCEED MOP  2        651
dtype: int64

In [85]:
# Pull out the class 3 rows, drop the numeric columns, and count each
# distinct combination of the remaining columns.
in_class3 = clustered_df['Class'] == 3
class3 = clustered_df.loc[in_class3].drop(columns=['AGE_OF_FACILITY', 'PC1', 'PC2', 'PC3'])
class3.value_counts()

CAUSE                             ACCIDENT_PRESSURE                                      Class
INCORRECT OPERATION               PRESSURE EXCEEDED 110% OF MOP                          3        35
                                  PRESSURE EXCEEDED MOP, BUT DID NOT EXCEED 110% OF MOP  3        12
EQUIPMENT FAILURE                 PRESSURE EXCEEDED 110% OF MOP                          3        10
                                  PRESSURE EXCEEDED MOP, BUT DID NOT EXCEED 110% OF MOP  3         4
MATERIAL FAILURE OF PIPE OR WELD  PRESSURE EXCEEDED 110% OF MOP                          3         3
CORROSION FAILURE                 PRESSURE EXCEEDED 110% OF MOP                          3         1
                                  PRESSURE EXCEEDED MOP, BUT DID NOT EXCEED 110% OF MOP  3         1
EXCAVATION DAMAGE                 PRESSURE EXCEEDED 110% OF MOP                          3         1
NATURAL FORCE DAMAGE              PRESSURE EXCEEDED MOP, BUT DID NOT EXCEED 110% OF MOP  3       

In [88]:
# 3D scatter of the three principal components, coloured by cluster; hovering
# a point shows its cause, facility age and accident pressure.
fig = px.scatter_3d(
    clustered_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Class",
    hover_name="CAUSE",
    hover_data=["AGE_OF_FACILITY", "ACCIDENT_PRESSURE"],
    width=900,
)
# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [87]:
# Export the clustered data (features, principal components and Class labels)
# to a CSV file, keeping the index. This saves the labelled rows, not the
# fitted model object.
clustered_df.to_csv(path_or_buf='ml_model.csv', index=True)