In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [16]:
alerts_df = pd.read_excel('datasets/alerts.xlsx')
alerts_df.head()

Unnamed: 0,alert_timestamp,Alert/Action
0,2022-03-24 07:00:05.723000+00:00,Alert type 1
1,2022-03-24 07:00:05.813000+00:00,Alert type 5
2,2022-03-24 07:00:07.723000+00:00,Alert type 5
3,2022-03-24 07:00:09.723000+00:00,Alert type 6
4,2022-03-24 07:00:11.723000+00:00,Alert type 5


In [17]:
#The number of alerts seems to be repeated in some cases: type 1 = type 4, type 10 = type 11 = type 13, etc.
alerts_df['Alert/Action'].value_counts().sort_index()

Action          3
Alert type 1    1
Alert type 5    6
Alert type 6    1
Name: Alert/Action, dtype: int64

In [18]:
#Validating null values
alerts_df.isnull().sum()

alert_timestamp    0
Alert/Action       0
dtype: int64

In [19]:
alerts_df["Alert/Action"] = alerts_df["Alert/Action"].astype("category")

In [20]:
#Converting to datetime timezone aware without msecs
alerts_df["alert_timestamp"] = pd.to_datetime(alerts_df["alert_timestamp"], infer_datetime_format=True, utc=True).dt.floor('3T')
print(alerts_df.head())

            alert_timestamp  Alert/Action
0 2022-03-24 07:00:00+00:00  Alert type 1
1 2022-03-24 07:00:00+00:00  Alert type 5
2 2022-03-24 07:00:00+00:00  Alert type 5
3 2022-03-24 07:00:00+00:00  Alert type 6
4 2022-03-24 07:00:00+00:00  Alert type 5


In [23]:
alerts_df = pd.get_dummies(alerts_df)
print(alerts_df.head())

            alert_timestamp  Alert/Action_Action  Alert/Action_Alert type 1  \
0 2022-03-24 07:00:00+00:00                    0                          1   
1 2022-03-24 07:00:00+00:00                    0                          0   
2 2022-03-24 07:00:00+00:00                    0                          0   
3 2022-03-24 07:00:00+00:00                    0                          0   
4 2022-03-24 07:00:00+00:00                    0                          0   

   Alert/Action_Alert type 5  Alert/Action_Alert type 6  
0                          0                          0  
1                          1                          0  
2                          1                          0  
3                          0                          1  
4                          1                          0  


In [24]:
pivot = pd.pivot_table(alerts_df, index=['alert_timestamp'], aggfunc=np.max, fill_value=0)
print(pivot)

                           Alert/Action_Action  Alert/Action_Alert type 1  \
alert_timestamp                                                             
2022-03-24 07:00:00+00:00                    0                          1   
2022-03-24 07:03:00+00:00                    1                          0   
2022-03-24 07:06:00+00:00                    1                          0   

                           Alert/Action_Alert type 5  \
alert_timestamp                                        
2022-03-24 07:00:00+00:00                          1   
2022-03-24 07:03:00+00:00                          1   
2022-03-24 07:06:00+00:00                          0   

                           Alert/Action_Alert type 6  
alert_timestamp                                       
2022-03-24 07:00:00+00:00                          1  
2022-03-24 07:03:00+00:00                          0  
2022-03-24 07:06:00+00:00                          0  


In [25]:
#Creating logistic regression model
logreg = LogisticRegression()

In [26]:
#Segregate features and labels into separate variables
X = pivot.loc[:, pivot.columns != "Alert/Action_Action"]
y = pivot.loc[:, pivot.columns == "Alert/Action_Action"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [32]:
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

logreg.fit(X_train, y_train.values.ravel())
y_pred = logreg.predict(X_test)
np.exp(logreg.coef_)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1