In [5]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib
import random
import logging

# Configure logging
logging.basicConfig(filename='grid_search.log', level=logging.INFO, format='%(asctime)s %(message)s')

class LoggingCallback:
    def __call__(self, *args, **kwargs):
        logging.info('GridSearchCV: {}'.format(kwargs.get('msg', '')))

class WindowRandomForest:
    def __init__(self, n_list, n_estimators=100, random_state=42):
        if not all(n % 2 != 0 for n in n_list):
            raise ValueError("All values in n_list must be odd numbers.")
        self.n_list = n_list
        self.model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
        self.scaler = StandardScaler()
        self.random_state = random_state
        random.seed(self.random_state)
        print("Initialized WindowRandomForest with n_list:", n_list)

    def prepare_data(self, data, random_seed=None):
        if random_seed is not None:
            random.seed(random_seed)
        
        X = []
        y = []
        
        i = 0
        data_len = len(data)
        min_window = min(self.n_list)
        
        print("Preparing data...")
        while i <= data_len - min_window:
            current_n = random.choice(self.n_list)
            if i + current_n > data_len:
                break
            
            window = data.iloc[i:i+current_n, :-1]
            label_window = data.iloc[i:i+current_n, -1]
            
            majority_label = mode(label_window)[0]
            aggregated_features = self.aggregate_features(window)
            
            X.append(aggregated_features)
            y.append(majority_label)
            
            i += current_n
    
        print(f"Prepared {len(X)} windows.")
        return np.array(X), np.array(y)

    def aggregate_features(self, window):
        #print("Aggregating features for window of size:", len(window))
        return window.mean(axis=0)

    def fit(self, train_data):
        print("Fitting model...")
        X, y = self.prepare_data(train_data)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        self.scaler = StandardScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_val = self.scaler.transform(X_val)

        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }

        grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train, callbacks=[LoggingCallback()])

        self.model = grid_search.best_estimator_
        print("Best parameters found by GridSearchCV:", grid_search.best_params_)

        y_val_pred = []
        y_val_pred_proba = []

        for row in X_val:
            y_val_pred.append(self.predict_single_row(row))
            y_val_pred_proba.append(self.model.predict_proba(self.scaler.transform(row.reshape(1, -1)))[0][1])
        
        print('Validation Classification Report:')
        print(classification_report(y_val, y_val_pred))
        
        roc_auc = roc_auc_score(y_val, y_val_pred_proba)
        print(f'ROC AUC Score: {roc_auc}')

        fpr, tpr, _ = roc_curve(y_val, y_val_pred_proba)
        plt.figure()
        plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='best')
        plt.show()

    def evaluate(self, test_data):
        print("Evaluating model...")
        X_test, y_test = self.prepare_data(test_data)

        y_test_pred = []
        y_test_pred_proba = []

        for row in X_test:
            y_test_pred.append(self.predict_single_row(row))
            y_test_pred_proba.append(self.model.predict_proba(self.scaler.transform(row.reshape(1, -1)))[0][1])
        
        print('Test Classification Report:')
        print(classification_report(y_test, y_test_pred))
        
        roc_auc = roc_auc_score(y_test, y_test_pred_proba)
        print(f'ROC AUC Score: {roc_auc}')

        fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
        plt.figure()
        plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='best')
        plt.show()

    def predict(self, data, row_index):
        if row_index < min(self.n_list) - 1:
            raise ValueError(f"Not enough data to form a window of {min(self.n_list)} rows for prediction.")
        
        window = data.iloc[row_index-(min(self.n_list)-1):row_index+1, :-1]
        aggregated_features = self.aggregate_features(window)
        
        aggregated_features = self.scaler.transform([aggregated_features])
        prediction = self.model.predict(aggregated_features)
        return prediction[0]

    def predict_single_row(self, row):
        row = self.scaler.transform(row.reshape(1, -1))
        return self.model.predict(row)[0]

    def save_model(self, model_path):
        joblib.dump({
            'model': self.model,
            'scaler': self.scaler,
            'n_list': self.n_list
        }, model_path)
        print(f'Model saved to {model_path}')

    def load_model(self, model_path):
        saved_objects = joblib.load(model_path)
        self.model = saved_objects['model']
        self.scaler = saved_objects['scaler']
        self.n_list = saved_objects['n_list']
        print(f'Model loaded from {model_path}')


In [6]:
import pandas as pd
df=pd.read_csv('Normal_Abnormal_eval_0.2_Eval_1sec_hamming.csv')

n_list = [1,3, 5, 7, 9] 
wrf = WindowRandomForest(n_list)

# Train the model
wrf.fit(df)

# Evaluate the model on the test set
wrf.evaluate()

# Save the model to a file
model_path = 'wrf_model.pkl'
wrf.save_model(model_path)

# Create a new instance of the model
wrf_loaded = WindowRandomForest(n=n)

# Load the model from the file
wrf_loaded.load_model(model_path)

single_row_index = 1  
single_row = df.iloc[single_row_index, :-1].values 

predicted_label = wrf_loaded.predict_single_row(single_row)
print(f'Predicted Label for row {single_row_index}: {predicted_label}')

Initialized WindowRandomForest with n_list: [1, 3, 5, 7, 9]
Fitting model...
Preparing data...


  0%|          | 0/2160 [02:23<?, ?it/s]


Prepared 92693 windows.
Fitting 10 folds for each of 216 candidates, totalling 2160 fits


ValueError: 
All the 2160 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KARAN\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KARAN\anaconda3\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
TypeError: BaseForest.fit() got an unexpected keyword argument 'callbacks'
