<a href="https://colab.research.google.com/github/Kdavis2025/Automating-Compliance-AI-and-Machine-Learning-Approaches-to-Achieviing-CMMC-2.0-Certification/blob/main/Cyber_Security_Awareness_and_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- Upload Widgets ---
def _csv_uploader(label):
    """Return a single-file FileUpload widget that accepts .csv and shows *label*."""
    return widgets.FileUpload(accept='.csv', multiple=False, description=label)


# One uploader per expected input file; the names are referenced by the
# upload handler defined further down in this cell.
historical_uploader = _csv_uploader('Upload historical_metrics.csv')
quiz_uploader = _csv_uploader('Upload quiz_results.csv')
interaction_uploader = _csv_uploader('Upload interaction_logs.csv')

# Render the three upload buttons in the cell output.
display(historical_uploader, quiz_uploader, interaction_uploader)

# --- Load Data Function ---
def load_uploaded_csv(uploader):
    """Parse the first file held by a FileUpload widget into a DataFrame.

    Args:
        uploader: Object whose ``value`` attribute holds the uploaded files.
            Two layouts are accepted:
            a mapping ``{filename: {'content': bytes, ...}}`` (ipywidgets 7.x)
            or a sequence of ``{'name': ..., 'content': buffer, ...}`` dicts
            (ipywidgets 8.x).

    Returns:
        A ``pandas.DataFrame`` read from the first uploaded file, or ``None``
        when nothing has been uploaded yet.
    """
    # Local import keeps this block self-contained; io.BytesIO is the public
    # home of the class previously reached through pd.io.common.
    from io import BytesIO

    uploaded = uploader.value
    if not uploaded:
        return None

    if isinstance(uploaded, dict):
        # ipywidgets 7.x: dict keyed by filename.
        first_file = next(iter(uploaded.values()))
    else:
        # ipywidgets 8.x: tuple of per-file dicts.
        first_file = uploaded[0]

    # BytesIO accepts any bytes-like object, including the memoryview that
    # ipywidgets 8.x stores under 'content'.
    return pd.read_csv(BytesIO(first_file['content']))

# Wait for uploads, then process
# Minimum average awareness score for a user to be labelled "Ready".
READY_THRESHOLD = 75


def _build_user_features(historical_df, quiz_df, interaction_df):
    """Aggregate the three uploaded tables into one feature row per UserID.

    Args:
        historical_df: Table with columns UserID, AwarenessScore,
            PhishingClickRate, ModulesCompleted, AvgTimePerModule.
        quiz_df: Table with columns UserID, IsCorrect, ResponseTimeSec.
        interaction_df: Table with columns UserID, InteractionType,
            HintRequested.

    Returns:
        DataFrame indexed by UserID with the aggregated features plus a
        binary ``ReadyLabel`` column (1 when AvgAwarenessScore >=
        READY_THRESHOLD, else 0).
    """
    # 1. Historical metrics: per-user means and total modules completed.
    user_hist = historical_df.groupby('UserID').agg(
        AvgAwarenessScore=('AwarenessScore', 'mean'),
        AvgClickRate=('PhishingClickRate', 'mean'),
        TotalModules=('ModulesCompleted', 'sum'),
        MeanModuleTime=('AvgTimePerModule', 'mean'),
    )

    # 2. Quiz results: accuracy rate (mean of 0/1) and mean response time.
    quiz = quiz_df.assign(IsCorrect=quiz_df['IsCorrect'].astype(int))
    user_quiz = quiz.groupby('UserID').agg(
        QuizAccuracy=('IsCorrect', 'mean'),
        MeanResponseTime=('ResponseTimeSec', 'mean'),
    )

    # 3. Interaction logs: phishing simulations, hints used, total rows.
    inter = interaction_df.assign(
        PhishingAttempt=(interaction_df['InteractionType'] == 'phishing_sim').astype(int),
        HintUsed=interaction_df['HintRequested'].astype(int),
    )
    user_inter = inter.groupby('UserID').agg(
        TotalPhishingSims=('PhishingAttempt', 'sum'),
        TotalHints=('HintUsed', 'sum'),
        TotalInteractions=('InteractionType', 'count'),
    )

    # Left joins keep every user present in the historical table; users with
    # no quiz or interaction rows get 0 for those features.
    features = (
        user_hist
        .join(user_quiz, how='left')
        .join(user_inter, how='left')
        .fillna(0)
    )
    features['ReadyLabel'] = (features['AvgAwarenessScore'] >= READY_THRESHOLD).astype(int)
    return features


def on_upload_change(change):
    """Run the readiness analysis once all three CSV files are uploaded.

    Registered as an observer on the upload widgets. When every uploader
    holds a file, the cell output is cleared and the handler prints a preview
    of each table, the engineered per-user features, the test-set metrics of
    a RandomForestClassifier, and a comparison of phishing click rates
    between Ready and Not Ready users.

    Args:
        change: Change notification passed by the widget; not used.
    """
    # Nothing to do until every file is available.
    if not (historical_uploader.value and quiz_uploader.value and interaction_uploader.value):
        return

    # Clear the output to display results
    clear_output()

    historical_df = load_uploaded_csv(historical_uploader)
    quiz_df = load_uploaded_csv(quiz_uploader)
    interaction_df = load_uploaded_csv(interaction_uploader)

    # Display head of each for confirmation
    print("Historical Metrics (first 5 rows):")
    display(historical_df.head())

    print("Quiz Results (first 5 rows):")
    display(quiz_df.head())

    print("Interaction Logs (first 5 rows):")
    display(interaction_df.head())

    # --- Feature Engineering per User ---
    features = _build_user_features(historical_df, quiz_df, interaction_df)

    print("Engineered Features per User:")
    display(features)

    # --- Model Training ---
    # ReadyLabel is a direct threshold on AvgAwarenessScore, so that column
    # must not be a predictor: keeping it leaks the target and makes the
    # reported metrics trivially high.
    X = features.drop(columns=['ReadyLabel', 'AvgAwarenessScore'])
    y = features['ReadyLabel']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # --- Evaluation ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)

    print("Model Performance on Test Set:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # --- Readiness Effectiveness Analysis ---
    # Compare average phishing click rate for Ready vs Not Ready
    is_ready = features['ReadyLabel'] == 1
    avg_click_ready = features.loc[is_ready, 'AvgClickRate'].mean()
    avg_click_not_ready = features.loc[~is_ready, 'AvgClickRate'].mean()

    print(f"Average Phishing Click Rate (Ready): {avg_click_ready:.3f}")
    print(f"Average Phishing Click Rate (Not Ready): {avg_click_not_ready:.3f}")

    # State only what the uploaded data supports instead of always claiming
    # that Ready users click less.
    print("\nInterpretation:")
    if pd.isna(avg_click_ready) or pd.isna(avg_click_not_ready):
        # The mean of an empty group is NaN, so no comparison is possible.
        print("One of the groups (Ready / Not Ready) has no users, so phishing click rates cannot be compared.")
    elif avg_click_ready < avg_click_not_ready:
        print(f"Users classified as 'Ready' (Avg Awareness Score ≥ {READY_THRESHOLD}) exhibit lower phishing click rates, indicating improved readiness for CMMC 2.0 compliance.")
    else:
        print(f"Users classified as 'Ready' (Avg Awareness Score ≥ {READY_THRESHOLD}) do not exhibit lower phishing click rates in this data, so the awareness score alone does not indicate improved readiness.")

# Register the same handler on each uploader so the analysis is attempted
# whenever any widget's `value` changes.
for _uploader in (historical_uploader, quiz_uploader, interaction_uploader):
    _uploader.observe(on_upload_change, names='value')