In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
np.random.seed(42)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore')

%matplotlib inline

In [37]:
class LoadData:
    """Load every ``.csv`` file in a directory into one concatenated DataFrame.

    While loading, each file has its ``Timestamp`` column parsed as day-first
    datetimes, its third column renamed to ``synthetic_data``, its ``Anomaly``
    column cast to ``int64`` and its ``original_signal`` column dropped.

    Parameters
    ----------
    data_path : str or path-like, optional
        Directory that contains the CSV files. When omitted, the module-level
        variable ``data_path`` is used, so existing ``LoadData()`` calls keep
        working.

    Attributes
    ----------
    data_path
        Directory that was scanned.
    full_data : pandas.DataFrame
        Rows of all CSV files, concatenated in sorted file-name order.

    Raises
    ------
    ValueError
        If no directory is available or it contains no ``.csv`` file.
    """

    def __init__(self, data_path=None):
        if data_path is None:
            # Backward-compatible fallback: earlier revisions read a global
            # named `data_path` instead of taking an argument.
            data_path = globals().get('data_path')
        if data_path is None:
            raise ValueError(
                "data_path was not given and no module-level 'data_path' is defined"
            )
        self.data_path = data_path
        self.full_data = self.csv_files()

    def csv_files(self):
        """Read, clean and concatenate all CSV files found in ``self.data_path``.

        Returns
        -------
        pandas.DataFrame
            One frame with a fresh ``RangeIndex`` covering every file.
        """
        # os.listdir() returns names in arbitrary order; sorting makes the row
        # order (and therefore any seeded split downstream) reproducible.
        csv_names = sorted(
            name for name in os.listdir(self.data_path) if name.endswith('.csv')
        )
        if not csv_names:
            raise ValueError(f"No CSV files found in {self.data_path!r}")

        frames = []
        for name in csv_names:
            frame = pd.read_csv(os.path.join(self.data_path, name))
            frame['Timestamp'] = pd.to_datetime(frame['Timestamp'], dayfirst=True)
            # The third column is renamed positionally, whatever it is called.
            # NOTE(review): a later cell selects 'synthetic_signal', not
            # 'synthetic_data' — confirm which name downstream code expects.
            frame = frame.rename(columns={frame.columns[2]: 'synthetic_data'})
            frame['Anomaly'] = frame['Anomaly'].astype('int64')
            frame = frame.drop('original_signal', axis=1)
            frames.append(frame)

        return pd.concat(frames, ignore_index=True)

    def display_null_count_and_dtypes(self):
        """Print the per-column null count followed by the per-column dtype."""
        print('Null Count:')
        for col, count in self.full_data.isnull().sum().items():
            print(f'{col}: {count}')

        print('\nData Types:')
        for col, dtype in self.full_data.dtypes.items():
            print(f'{col}: {dtype}')

    def period_of_time(self):
        """Print the earliest and latest ``Timestamp`` and the whole days between them."""
        date_min = self.full_data['Timestamp'].min()
        date_max = self.full_data['Timestamp'].max()
        # `.days` truncates the timedelta to whole days.
        date_range = (date_max - date_min).days

        print('\nDate Range:')
        print(f'Start:\t{date_min}')
        print(f'End:\t{date_max}')
        print(f'Days:\t{date_range} days')

In [38]:
# Directory scanned for the CSV files that LoadData concatenates.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere until this is pointed at a local copy of the data.
data_path = "/media/magesh/HardDisk/Thesis/anomaly_detection/data/processed/ml_data"

# Called without arguments, so the loader picks up the module-level `data_path` above.
data = LoadData()

# Print the first and last timestamp in the loaded data and the whole days between them.
data.period_of_time()


Date Range:
Start:	2023-05-11 00:10:00
End:	2023-06-10 11:10:00
Days:	30 days


In [None]:
# Parse the timestamp column once (day-first format) and store it back on the frame.
parsed_ts = pd.to_datetime(combined_df['Timestamp'], dayfirst=True)
combined_df['Timestamp'] = parsed_ts

# Expand the timestamp into calendar / time-of-day columns, appended in this order.
for part_name in ('Day', 'Month', 'Year', 'Hour', 'Minute'):
    combined_df[part_name] = getattr(parsed_ts.dt, part_name.lower())

# Final column layout: time parts, the signal, the windowed features grouped by
# family, and the target label last. `Timestamp` itself is not kept.
time_cols = ['Day', 'Month', 'Year', 'Hour', 'Minute']
step_cols = ['step_variable_ws5', 'step_variable_ws10', 'step_variable_ws15']
std_cols = ['std_anomaly_ws5', 'std_anomaly_ws10', 'std_anomaly_ws15']
iqr_cols = ['iqr_anomaly_ws5', 'iqr_anomaly_ws10', 'iqr_anomaly_ws15']
columns_order = [
    *time_cols,
    'synthetic_signal',
    *step_cols,
    *std_cols,
    *iqr_cols,
    'Anomaly',
]

combined_df = combined_df.loc[:, columns_order]

In [None]:
# Number of missing values in each column of the feature table.
combined_df.isna().sum()

In [None]:
# Bar chart of how many rows fall into each `Anomaly` class.
ax = sns.countplot(data=combined_df, x='Anomaly', palette='viridis')

# Label every bar with its own height. Taking the labels from value_counts()
# instead would order them by frequency while the bars are ordered by class
# value, so the two could be paired up wrongly.
labels = [int(bar.get_height()) for bar in ax.patches]

for bar, label in zip(ax.patches, labels):
    ax.text(
        bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
        bar.get_height(),                   # just above the top edge
        f'{label}',
        ha='center',
        va='bottom'
    )

plt.title('Count of Anomalies')
plt.show()


In [None]:
# Feature matrix: every column of the table except the target label.
X = combined_df.drop(columns=['Anomaly'])
X

In [None]:
# Target vector: the `Anomaly` label column.
y = combined_df['Anomaly']
y

In [None]:
# Hold out 20% of all rows as the test set, preserving the class ratio of `y`.
# A fixed random_state keeps the split identical no matter which cells ran before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Take the validation set from the remaining 80%: 0.25 x 0.8 = 0.2 of all rows,
# giving a 60/20/20 train/val/test split. Stratified like the first split so all
# three parts share the same anomaly ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=42
)

In [None]:
# Sanity check: (rows, columns) of the train, validation and test feature sets.
X_train.shape,X_val.shape,X_test.shape

In [None]:
def build_simple_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a default ``SVC`` and report its validation and test performance.

    Prints the ROC AUC on the validation and test sets, prints a classification
    report for the test set and shows the test confusion matrix as a heatmap.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model.
    X_val, y_val : validation features and labels (scored only).
    X_test, y_test : test features and labels (scored and reported).

    Returns
    -------
    SVC
        The fitted classifier.
    """
    # Initialize and train the model
    model = SVC()
    model.fit(X_train, y_train)

    # ROC AUC is a ranking metric, so it is computed from the continuous
    # decision values; feeding it hard 0/1 predictions collapses the curve to
    # a single operating point.
    val_auc = roc_auc_score(y_val, model.decision_function(X_val))
    print(f"Validation ROC score: {val_auc:.4f}")

    test_auc = roc_auc_score(y_test, model.decision_function(X_test))
    print(f"Test ROC score: {test_auc:.4f}")

    # Thresholded predictions are still needed for the report and the matrix.
    y_test_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # The caller assigns the result, so hand back the fitted model.
    return model

# Baseline: SVC with default hyperparameters on unscaled features.
model = build_simple_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
def build_simple_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a default ``RandomForestClassifier`` and report its performance.

    Prints the ROC AUC on the validation and test sets, prints a classification
    report for the test set and shows the test confusion matrix as a heatmap.

    Note: this definition replaces any earlier function of the same name in the
    running kernel.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model.
    X_val, y_val : validation features and labels (scored only).
    X_test, y_test : test features and labels (scored and reported).

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    # Initialize and train the model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # ROC AUC needs a continuous score: use the predicted probability of the
    # second class (column 1 of predict_proba) rather than hard 0/1 labels.
    val_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    print(f"Validation ROC score: {val_auc:.4f}")

    test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f"Test ROC score: {test_auc:.4f}")

    # Thresholded predictions are still needed for the report and the matrix.
    y_test_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # The caller assigns the result, so hand back the fitted model.
    return model

# Baseline: random forest classifier with default hyperparameters.
model = build_simple_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
def build_model_scaler(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a ``RobustScaler`` + ``SVC`` pipeline and report its performance.

    Prints the ROC AUC on the validation and test sets, prints a classification
    report for the test set and shows the test confusion matrix as a heatmap.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the pipeline.
    X_val, y_val : validation features and labels (scored only).
    X_test, y_test : test features and labels (scored and reported).

    Returns
    -------
    Pipeline
        The fitted scaler + classifier pipeline.
    """
    # Scaling lives inside the pipeline so it is fitted on the training data
    # only and then re-applied unchanged to validation and test data.
    model = Pipeline([
        ('scaler', RobustScaler()),
        ('svc', SVC()),
    ])
    model.fit(X_train, y_train)

    # ROC AUC is a ranking metric, so it is computed from the continuous
    # decision values (the pipeline forwards decision_function to the SVC)
    # instead of hard 0/1 predictions.
    val_auc = roc_auc_score(y_val, model.decision_function(X_val))
    print(f"Validation ROC score: {val_auc:.4f}")

    test_auc = roc_auc_score(y_test, model.decision_function(X_test))
    print(f"Test ROC score: {test_auc:.4f}")

    # Thresholded predictions are still needed for the report and the matrix.
    y_test_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # The caller assigns the result, so hand back the fitted pipeline.
    return model

# SVC model with Robust Scaler
model = build_model_scaler(X_train, X_val, X_test, y_train, y_val, y_test)


In [None]:
def build_model_with_hyperparameter_tuning(X_train, X_val, X_test, y_train, y_val, y_test):
    """Tune a ``RobustScaler`` + ``SVC`` pipeline with ``RandomizedSearchCV``.

    The search uses 3-fold cross-validation on the training data, scored by ROC
    AUC. The best pipeline is then scored on the validation and test sets, a
    classification report is printed and the test confusion matrix is shown.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_val, y_val : validation features and labels (scored only).
    X_test, y_test : test features and labels (scored and reported).

    Returns
    -------
    Pipeline
        The best pipeline found, refitted on the full training data.
    """
    # Candidate values sampled by RandomizedSearchCV; the `svc__` prefix routes
    # each parameter to the pipeline step named 'svc'.
    param_distributions = {
        'svc__C': [0.1, 1, 10, 100],
        'svc__kernel': ['linear', 'rbf'],
        'svc__class_weight': [None, 'balanced']
    }

    # Construct the pipeline. probability=True is kept so the returned model
    # still exposes predict_proba.
    pipe = Pipeline([
        ('scaler', RobustScaler()),
        ('svc', SVC(probability=True))
    ])

    # A fixed random_state makes the sampled candidates reproducible.
    random_search = RandomizedSearchCV(
        pipe,
        param_distributions,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
    )

    # Train the model with RandomizedSearchCV
    random_search.fit(X_train, y_train)

    # Best model from RandomizedSearchCV
    best_model = random_search.best_estimator_

    # ROC AUC is a ranking metric, so it is computed from the continuous
    # decision values instead of hard 0/1 predictions.
    val_auc = roc_auc_score(y_val, best_model.decision_function(X_val))
    print(f"Validation ROC score: {val_auc:.4f}")

    test_auc = roc_auc_score(y_test, best_model.decision_function(X_test))
    print(f"Test ROC score: {test_auc:.4f}")

    # Thresholded predictions are still needed for the report and the matrix.
    y_test_pred = best_model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    print(f"Best SVC Hyperparameters: {random_search.best_params_}")
    return best_model

# SVC model with hyperparameter tuning
best_model = build_model_with_hyperparameter_tuning(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
def build_model_with_hyperparameter_tuning(X_train, X_val, X_test, y_train, y_val, y_test):
    """Tune a ``RandomForestClassifier`` with ``RandomizedSearchCV``.

    The search uses 5-fold cross-validation on the training data, scored by ROC
    AUC. The best forest is then scored on the validation and test sets, a
    classification report is printed and the test confusion matrix is shown.

    Note: this definition replaces any earlier function of the same name in the
    running kernel.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_val, y_val : validation features and labels (scored only).
    X_test, y_test : test features and labels (scored and reported).

    Returns
    -------
    RandomForestClassifier
        The best forest found, refitted on the full training data.
    """
    # Candidate values sampled by RandomizedSearchCV. max_features starts at 1:
    # an integer value of 0 is rejected by the estimator, so sampling it only
    # produced failed fits.
    param_distributions = {
        'n_estimators': list(range(100, 110)),
        'max_depth': list(range(10, 15)),
        'max_features': list(range(1, 14)),
    }

    # A fixed random_state makes the sampled candidates reproducible.
    model = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        model,
        param_distributions,
        cv=5,
        scoring='roc_auc',
        random_state=42,
    )

    # Train the model with RandomizedSearchCV
    random_search.fit(X_train, y_train)

    # Best model from RandomizedSearchCV
    best_model = random_search.best_estimator_

    # ROC AUC needs a continuous score: use the predicted probability of the
    # second class (column 1 of predict_proba) rather than hard 0/1 labels.
    val_auc = roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1])
    print(f"Validation ROC score: {val_auc:.4f}")

    test_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    print(f"Test ROC score: {test_auc:.4f}")

    # Thresholded predictions are still needed for the report and the matrix.
    y_test_pred = best_model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    print(f"Best RF Hyperparameters: {random_search.best_params_}")
    # Previously returned the undefined name `best_model_`, which raised NameError.
    return best_model

# Random forest with hyperparameter tuning
rf_hyperparameter = build_model_with_hyperparameter_tuning(X_train, X_val, X_test, y_train, y_val, y_test)
rf_hyperparameter


In [None]:

    # # Calculate the ROC curve
    # fpr, tpr, thresholds = roc_curve(y_test, y_test_probs)
    
    # # Plot the ROC curve
    # plt.figure()
    # plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
    # plt.xlim([0.0, 1.0])
    # plt.ylim([0.0, 1.05])
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('Receiver Operating Characteristic (ROC) Curve')
    # plt.legend(loc='lower right')
    # plt.show()