In [None]:
# Uncomment and run the following lines if you need to install any of these packages
# !pip install pandas numpy scikit-learn imbalanced-learn matplotlib seaborn joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib

# ---------------------------- 1. Load Data ----------------------------

# The dataset is an Excel workbook read through the openpyxl engine.
# Point DATA_PATH at your own file if it lives somewhere else.
DATA_PATH = "data/dataset.xlsx"
df = pd.read_excel(DATA_PATH, engine="openpyxl")

# Quick sanity check: show the first rows of what was loaded
print("Initial Data Sample:")
display(df.head())

# --------------------- 2. Handle Decimal Separators ---------------------


def _comma_decimal_to_float(series):
    """Return *series* as floats, treating ',' as the decimal separator."""
    as_text = series.astype(str)
    return as_text.str.replace(',', '.', regex=False).astype(float)


# Every column except the identifier, the timestamp and the two boolean flags
# is treated as numerical.
non_numerical = ['Num', 'Timestamp', 'Robot_ProtectiveStop', 'grip_lost']
numerical_cols = df.columns.drop(non_numerical)

# Convert one column at a time so each column is replaced by its float version
for col in numerical_cols:
    df[col] = _comma_decimal_to_float(df[col])

# Show the resulting dtypes so the conversion can be checked by eye
print("\nData Types After Conversion:")
print(df.dtypes)

# --------------------- 3. Encode Categorical Variables ---------------------

# Booleans become 0/1; any value that is not a key of the mapping becomes NaN.
bool_mapping = {False: 0, True: 1}
for flag_col in ('Robot_ProtectiveStop', 'grip_lost'):
    df[flag_col] = df[flag_col].map(bool_mapping)

# Report how many distinct values each encoded column now holds
print("\nUnique values after encoding 'Robot_ProtectiveStop' and 'grip_lost':")
print(df[['Robot_ProtectiveStop', 'grip_lost']].nunique())

# ------------------------- 4. Process Timestamp -------------------------

# Parse 'Timestamp' into datetime values
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Derive clock-component features; the tuple order fixes the column order.
timestamp_parts = df['Timestamp'].dt
for feature_name, part in (
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('Second', 'second'),
    ('Microsecond', 'microsecond'),
):
    df[feature_name] = getattr(timestamp_parts, part)

# Order rows chronologically, then measure seconds since the earliest sample
df = df.sort_values(by='Timestamp')
first_timestamp = df['Timestamp'].min()
df['Elapsed_Time'] = (df['Timestamp'] - first_timestamp).dt.total_seconds()

# The raw timestamp is no longer needed once the features exist
df = df.drop(columns='Timestamp')

# Show the frame with the new time-based columns
print("\nData Sample After Timestamp Processing:")
display(df.head())

# ----------------------- 5. Handle Missing Values -----------------------

# Count nulls per column before cleaning so the report shows what will be lost
missing_values = df.isna().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Simplest strategy: discard every row that contains at least one null
df = df.dropna(axis=0, how='any')

# Re-count to confirm nothing is left
remaining_missing = df.isna().sum()
print("\nMissing Values After Dropping:")
print(remaining_missing)

# ------------------------ 6. Feature Selection -------------------------

# 'grip_lost' is the target; 'Num' is excluded from the features as well.
target_col = 'grip_lost'
excluded_cols = ['Num', target_col]

# Everything that is not excluded is a feature, in the frame's column order
feature_cols = df.columns.drop(excluded_cols)
y = df[target_col]
X = df[feature_cols]

print("\nFeature Columns:")
print(list(feature_cols))

# -------------------- 7. Exploratory Data Analysis (EDA) ----------------

# Heatmap of the pairwise correlations between feature columns
plt.figure(figsize=(16, 12))
feature_corr = df[feature_cols].corr()
corr_ax = sns.heatmap(feature_corr, annot=False, cmap='coolwarm')
corr_ax.set_title('Correlation Matrix of Features')
plt.show()

# Bar chart of how often each target value occurs; a class imbalance,
# if present, is visible directly from the bar heights.
target_ax = sns.countplot(x=y)
target_ax.set_title('Distribution of grip_lost')
target_ax.set_xlabel('Grip Lost (0 = False, 1 = True)')
target_ax.set_ylabel('Count')
plt.show()

# ------------------------- 8. Split the Data ----------------------------

# 80/20 train/test split, stratified on the target so both parts keep
# the same class proportions; the seed makes the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTraining and Testing Set Sizes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

# ------------------------ 9. Feature Scaling ----------------------------

# Learn the scaling statistics from the training split only, so nothing
# about the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------- 10. Handle Class Imbalance -----------------------

# Class counts in the training split before any resampling
print("\nClass Distribution in y_train:")
print(y_train.value_counts())

# Oversample with SMOTE; only the training data is resampled,
# the test split is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Class counts after resampling
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("\nClass Distribution After SMOTE:")
print(resampled_counts)

# ------------------------ 11. Model Training ----------------------------

# Baseline random forest: 100 trees, fixed seed for reproducibility
forest_settings = {'n_estimators': 100, 'random_state': 42}

# Build the classifier and fit it on the SMOTE-resampled training data
rf = RandomForestClassifier(**forest_settings)
rf = rf.fit(X_train_resampled, y_train_resampled)

# ------------------------ 12. Initial Evaluation ------------------------

# Hard predictions plus the probability column at index 1 for the test split
y_pred = rf.predict(X_test_scaled)
test_probabilities = rf.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]

# Per-class precision / recall / F1
print("\nClassification Report (Initial Model):")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix (Initial Model)')
plt.show()

# ROC AUC computed from the probabilities rather than the hard predictions
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC Score (Initial Model): {roc_auc:.2f}")

# -------------------- 13. Hyperparameter Tuning -------------------------

# Local import: the pipeline is only needed for tuning.
from imblearn.pipeline import Pipeline as ImbPipeline

# Why a pipeline: running cross-validation on data that was oversampled
# beforehand lets synthetic points derived from validation-fold samples end up
# in the training folds, which inflates the CV scores. Putting SMOTE inside an
# imblearn Pipeline makes the resampling happen on each training fold only,
# while every validation fold keeps its original, un-resampled samples.
tuning_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Parameter grid for the random forest step ('rf__' routes each entry to it)
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=tuning_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',  # Using F1-score for imbalanced data
    verbose=2
)

# Search on the scaled, NOT yet resampled, training data; the pipeline applies
# SMOTE itself inside every fold and again for the final refit.
# NOTE(review): SMOTE needs enough minority samples in every training fold
# for its nearest-neighbour step - confirm on very small minority classes.
grid_search.fit(X_train_scaled, y_train)

# Print best parameters (keys carry the 'rf__' step prefix)
print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)

# The refit pipeline was trained on the SMOTE-resampled full training split.
# Expose only its fitted forest so later steps (prediction, feature
# importances, joblib export) keep working with a plain RandomForestClassifier.
best_rf = grid_search.best_estimator_.named_steps['rf']

# --------------------- 14. Final Model Evaluation -----------------------

# Hard predictions plus the probability column at index 1 from the tuned model
y_pred_best = best_rf.predict(X_test_scaled)
best_probabilities = best_rf.predict_proba(X_test_scaled)
y_proba_best = best_probabilities[:, 1]

# Per-class precision / recall / F1
print("\nClassification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

# Confusion matrix drawn as an annotated heatmap
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(6, 4))
best_cm_ax = sns.heatmap(conf_matrix_best, annot=True, fmt='d', cmap='Greens')
best_cm_ax.set_xlabel('Predicted')
best_cm_ax.set_ylabel('Actual')
best_cm_ax.set_title('Confusion Matrix (Best Model)')
plt.show()

# ROC AUC computed from the probabilities rather than the hard predictions
roc_auc_best = roc_auc_score(y_test, y_proba_best)
print(f"ROC AUC Score (Best Model): {roc_auc_best:.2f}")

# ----------------------- 15. Feature Importance --------------------------

# Pair each feature name with the importance the tuned forest assigned to it
feature_names = X.columns
importances = best_rf.feature_importances_
importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Most important features first
feat_importances = importance_table.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(12, 8))
importance_ax = sns.barplot(data=feat_importances, x='Importance', y='Feature')
importance_ax.set_title('Feature Importances')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.show()

# ------------------------- 16. Save the Model ----------------------------

# Persist the fitted scaler first, then the tuned model, each to its own file
artifacts = (
    (scaler, 'scaler.joblib'),
    (best_rf, 'random_forest_model.joblib'),
)
for artifact, target_file in artifacts:
    joblib.dump(artifact, target_file)

print("\nModel and Scaler have been saved as 'random_forest_model.joblib' and 'scaler.joblib' respectively.")

# ------------------------ 17. Load and Predict (Optional) ----------------

# Example of loading the saved model and scaler for future predictions
# Uncomment the following lines to use them

# loaded_scaler = joblib.load('scaler.joblib')
# loaded_model = joblib.load('random_forest_model.joblib')

# # Example new data (ensure it has the same preprocessing)
# # Replace 'new_data.csv' with your actual new data file path
# new_df = pd.read_csv('new_data.csv', delimiter='\t')

# # Preprocess new data similarly
# for col in numerical_cols:
#     new_df[col] = new_df[col].astype(str).str.replace(',', '.', regex=False).astype(float)
# new_df['Robot_ProtectiveStop'] = new_df['Robot_ProtectiveStop'].map(bool_mapping)
# new_df['grip_lost'] = new_df['grip_lost'].map(bool_mapping)  # If available

# # Process Timestamp if present
# if 'Timestamp' in new_df.columns:
#     new_df['Timestamp'] = pd.to_datetime(new_df['Timestamp'])
#     new_df['Hour'] = new_df['Timestamp'].dt.hour
#     new_df['Minute'] = new_df['Timestamp'].dt.minute
#     new_df['Second'] = new_df['Timestamp'].dt.second
#     new_df['Microsecond'] = new_df['Timestamp'].dt.microsecond
#     new_df = new_df.sort_values('Timestamp')
#     new_df['Elapsed_Time'] = (new_df['Timestamp'] - new_df['Timestamp'].min()).dt.total_seconds()
#     new_df = new_df.drop('Timestamp', axis=1)

# # Define features (ensure they match the training features)
# new_X = new_df[feature_cols]

# # Scale the new data
# new_X_scaled = loaded_scaler.transform(new_X)

# # Make predictions
# new_predictions = loaded_model.predict(new_X_scaled)
# new_probabilities = loaded_model.predict_proba(new_X_scaled)[:, 1]

# print("\nNew Predictions:")
# print(new_predictions)
# print("\nNew Prediction Probabilities:")
# print(new_probabilities)


FileNotFoundError: [Errno 2] No such file or directory: 'your_data.csv'