In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Step 1: Load your dataset
file_path = 'Featured_dataset.csv'  # Specify your full file path here

try:
    # Load dataset
    data = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
    print(data.head())
    
except FileNotFoundError:
    print(f"File '{file_path}' not found. Please check the file path.")
    exit()  # Exit the script if the file is not found

# Step 2: Data Preprocessing
# Check the column names and inspect the data types
print("\nColumn Names in the Dataset:\n", data.columns)

# Convert non-numeric columns (e.g., timestamps or categorical data) to numeric values
# Example: Convert timestamps to numerical representation (e.g., seconds since a reference date)

# If 'timestamp' column exists, process it
if 'timestamp' in data.columns:
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data['timestamp'] = (data['timestamp'] - data['timestamp'].min()).dt.total_seconds()
else:
    print("Warning: No 'timestamp' column found. Skipping timestamp processing.")

# Handle categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns

# For simplicity, let's encode categorical columns to numeric using LabelEncoder
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Separate features and target label
X = data.drop(columns=['label'])  # Features (all columns except 'label')
y = data['label']  # Labels (0 - normal, 1 - anomaly)

# Scale features (important for models like IsolationForest or tree-based models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Handle Class Imbalance (if applicable)
# Compute class weights for balancing the data during training
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

# Step 4: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 5: Model Training using RandomForestClassifier (can also be Isolation Forest or One-Class SVM)
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weight_dict)
model.fit(X_train, y_train)

# Step 6: Evaluate the model (Before Tuning)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of the positive class (anomalous)

# Get initial classification report (Precision, Recall, F1 Score)
initial_class_report = classification_report(y_test, y_pred, output_dict=True)
initial_precision = initial_class_report['1']['precision']
initial_recall = initial_class_report['1']['recall']
initial_f1 = initial_class_report['1']['f1-score']

print("Initial Classification Report (Before Tuning):")
print(classification_report(y_test, y_pred))

# Step 7: Model Tuning with GridSearchCV (optional, improves model performance)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get best parameters and best model
best_model = grid_search.best_estimator_
print(f"Best model parameters: {grid_search.best_params_}")

# Step 8: Evaluate the tuned model
y_pred_tuned = best_model.predict(X_test)
y_pred_proba_tuned = best_model.predict_proba(X_test)[:, 1]

# Get tuned classification report
tuned_class_report = classification_report(y_test, y_pred_tuned, output_dict=True)
tuned_precision = tuned_class_report['1']['precision']
tuned_recall = tuned_class_report['1']['recall']
tuned_f1 = tuned_class_report['1']['f1-score']

print("Tuned Classification Report (After Tuning):")
print(classification_report(y_test, y_pred_tuned))

# Step 9: Log the Changes in Precision, Recall, F1-Score
# Plot comparison of metrics before and after tuning
metrics = ['Precision', 'Recall', 'F1-Score']
before = [initial_precision, initial_recall, initial_f1]
after = [tuned_precision, tuned_recall, tuned_f1]

# Create a bar plot for metric comparison
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35  # width of the bars
x = np.arange(len(metrics))  # x-ticks
rects1 = ax.bar(x - width/2, before, width, label='Before Tuning', color='lightblue')
rects2 = ax.bar(x + width/2, after, width, label='After Tuning', color='lightgreen')

# Add some text for labels, title and custom x-axis tick labels
ax.set_xlabel('Metrics')
ax.set_title('Comparison of Evaluation Metrics: Before vs After Tuning')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# Label with label_type 'center' to display values inside the bars
def add_labels(rects):
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

add_labels(rects1)
add_labels(rects2)

plt.tight_layout()
plt.show()

# Optionally, you could also log the change in metrics as a print statement
print(f"\n--- Changes in Metrics (Before vs After Tuning) ---")
print(f"Change in Precision: {tuned_precision - initial_precision:.2f}")
print(f"Change in Recall: {tuned_recall - initial_recall:.2f}")
print(f"Change in F1-Score: {tuned_f1 - initial_f1:.2f}")


Dataset loaded successfully!
   Longitude   Latitude  Speed  Distance                 Time     Acc X  \
0  73.822661  18.501627    0.0       0.0  1900-01-01 18:45:12  0.046402   
1  73.822661  18.501627    0.0       0.0  1900-01-01 18:45:12 -0.136978   
2  73.822661  18.501627    0.0       0.0  1900-01-01 18:45:13 -0.045355   
3  73.822661  18.501627    0.0       0.0  1900-01-01 18:45:13  0.242089   
4  73.822661  18.501627    0.0       0.0  1900-01-01 18:45:13 -0.230234   

      Acc Y     Acc Z  Heading    gyro_x  ...  Gyro_Change  Net_Displacement  \
0 -0.137178 -0.282934      352 -0.036306  ...     0.000000               0.0   
1  0.365242  0.108889      352  0.035776  ...    -0.005758               0.0   
2 -0.103340 -0.534985      352 -0.011871  ...    -0.007818               0.0   
3  0.072761 -0.350396      352 -0.017980  ...    -0.011512               0.0   
4  0.011765 -0.494085      352  0.011342  ...    -0.005381               0.0   

   Speed_Change  Heading_Change  Rollin

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.