# Random Forest Model for Anomaly Detection
This notebook demonstrates the process of building a Random Forest classifier to detect anomalies in user interactions with movie items. The dataset contains interactions such as 'like', 'dislike', 'neutral', and 'watched' ratings.
### Steps:
- Data Loading
- Feature Engineering
- Model Training (Random Forest)
- Model Evaluation (AUC, Accuracy, and Confusion Matrix)


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the dataset
data = np.load('first_batch_multi_labels.npz')
X = pd.DataFrame(data['X'], columns=['user', 'item', 'rating'])
yy = pd.DataFrame(data['yy'], columns=['user', 'label'])

# Feature Engineering: User-level Aggregation
user_features = X.groupby('user').agg({
    'rating': ['count', lambda x: (x == 10).sum(), lambda x: (x == -10).sum(), lambda x: (x == 0).sum(), lambda x: (x == 1).sum()]
}).reset_index()
user_features.columns = ['user', 'interaction_count', 'like_count', 'dislike_count', 'neutral_count', 'watched_count']

# Create ratios for better features
user_features['like_ratio'] = user_features['like_count'] / user_features['interaction_count']
user_features['dislike_ratio'] = user_features['dislike_count'] / user_features['interaction_count']
user_features['neutral_ratio'] = user_features['neutral_count'] / user_features['interaction_count']
user_features['watched_ratio'] = user_features['watched_count'] / user_features['interaction_count']

# Merge with labels
merged_data = pd.merge(user_features, yy, on='user')


### Train-Test Split and Random Forest Model

In [3]:
# Split data into features and labels
X = merged_data.drop('label', axis=1)
y = merged_data['label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train RandomForestClassifier
clf = RandomForestClassifier(class_weight='balanced', n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)
clf.fit(X_train, y_train)


RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_split=5, n_estimators=200, random_state=42)

### Model Evaluation

In [4]:
# Predict probabilities and calculate AUC
y_prob = clf.predict_proba(X_test)
auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print(f"AUC Score: {auc}")

# Predict labels and calculate accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


AUC Score: 0.9862023809523809
Accuracy: 0.9590909090909091


### Classification Report

In [5]:


# Classification Report
report = classification_report(y_test, y_pred, target_names=['Normal', 'Anomaly 1', 'Anomaly 2'])
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

      Normal       0.98      0.97      0.98       200
   Anomaly 1       0.60      0.60      0.60        10
   Anomaly 2       0.91      1.00      0.95        10

    accuracy                           0.96       220
   macro avg       0.83      0.86      0.84       220
weighted avg       0.96      0.96      0.96       220

