<a href="https://colab.research.google.com/github/Kdavis2025/Automating-Compliance-AI-and-Machine-Learning-Approaches-to-Achieviing-CMMC-2.0-Certification/blob/main/Cyber_Security_Awareness_and_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- Upload Widgets ---
# One single-file CSV upload button per expected input file. The widgets are
# created in the order historical -> quiz -> interaction and shown in that
# same order.
historical_uploader, quiz_uploader, interaction_uploader = (
    widgets.FileUpload(accept='.csv', multiple=False, description=button_label)
    for button_label in (
        'Upload historical_metrics.csv',
        'Upload quiz_results.csv',
        'Upload interaction_logs.csv',
    )
)

display(historical_uploader, quiz_uploader, interaction_uploader)

# --- Load Data Function ---
def load_uploaded_csv(uploader):
    """Parse the first file held by an upload widget into a DataFrame.

    Args:
        uploader: Object whose ``value`` attribute holds the uploaded files.
            Two layouts are accepted: a dict mapping filename to a metadata
            dict (the layout the original code indexed into), or a sequence
            of per-file metadata dicts. In both layouts the metadata dict
            must carry the raw file bytes under ``'content'``.

    Returns:
        A ``pandas.DataFrame`` parsed from the first uploaded file, or
        ``None`` when nothing has been uploaded yet.
    """
    # Local import keeps this function self-contained; the file's import
    # block does not bring in ``io``.
    import io

    uploaded = uploader.value
    if not uploaded:
        return None

    if isinstance(uploaded, dict):
        # Filename-keyed layout: take the first (and only expected) entry.
        first_file = next(iter(uploaded.values()))
    else:
        # Sequence layout: one metadata dict per uploaded file.
        first_file = uploaded[0]

    # BytesIO accepts any bytes-like object (bytes, bytearray, memoryview).
    return pd.read_csv(io.BytesIO(first_file['content']))

# Wait for uploads, then process
def on_upload_change(change):
    """Run the readiness analysis once all three CSV files are uploaded.

    Intended as the ``value`` observer for the three upload widgets. Each
    call checks whether every uploader holds a file; if not, it returns
    without doing anything. Otherwise it clears the cell output, previews the
    three tables, builds per-user features, trains and evaluates a random
    forest, and prints a phishing click-rate comparison.

    Args:
        change: Change notification supplied by the widget; not inspected.
    """
    uploaders = (historical_uploader, quiz_uploader, interaction_uploader)
    if not all(uploader.value for uploader in uploaders):
        return

    # Score at or above which a user is labelled Ready.
    ready_threshold = 75

    # Clear the output so results replace any earlier run.
    clear_output()

    historical_df, quiz_df, interaction_df = (
        load_uploaded_csv(uploader) for uploader in uploaders
    )

    # Show the head of each table so the user can confirm the right files.
    print("Historical Metrics (first 5 rows):")
    display(historical_df.head())

    print("Quiz Results (first 5 rows):")
    display(quiz_df.head())

    print("Interaction Logs (first 5 rows):")
    display(interaction_df.head())

    features = _build_user_features(
        historical_df, quiz_df, interaction_df, ready_threshold=ready_threshold
    )

    print("Engineered Features per User:")
    display(features)

    _train_and_report(features)
    _report_click_rates(features, ready_threshold=ready_threshold)


def _build_user_features(historical_df, quiz_df, interaction_df, ready_threshold=75):
    """Aggregate the three raw tables into one feature row per ``UserID``.

    Args:
        historical_df: Table with ``UserID``, ``AwarenessScore``,
            ``PhishingClickRate``, ``ModulesCompleted``, ``AvgTimePerModule``.
        quiz_df: Table with ``UserID``, ``IsCorrect``, ``ResponseTimeSec``.
        interaction_df: Table with ``UserID``, ``InteractionType``,
            ``HintRequested``.
        ready_threshold: Minimum ``AvgAwarenessScore`` for ``ReadyLabel`` 1.

    Returns:
        DataFrame indexed by ``UserID`` holding the aggregated features plus
        a 0/1 ``ReadyLabel`` column. Users absent from the quiz or
        interaction tables get 0 for those features. The input frames are
        not modified.
    """
    # 1. Historical metrics: mean scores/rates/times and total modules.
    user_hist = historical_df.groupby('UserID').agg(
        AvgAwarenessScore=('AwarenessScore', 'mean'),
        AvgClickRate=('PhishingClickRate', 'mean'),
        TotalModules=('ModulesCompleted', 'sum'),
        MeanModuleTime=('AvgTimePerModule', 'mean'),
    )

    # 2. Quiz results: the mean of a 0/1 correctness flag is the accuracy.
    quiz = quiz_df.assign(IsCorrect=quiz_df['IsCorrect'].astype(int))
    user_quiz = quiz.groupby('UserID').agg(
        QuizAccuracy=('IsCorrect', 'mean'),
        MeanResponseTime=('ResponseTimeSec', 'mean'),
    )

    # 3. Interaction logs: phishing simulations, hint usage, total rows.
    interactions = interaction_df.assign(
        PhishingAttempt=interaction_df['InteractionType'].eq('phishing_sim').astype(int),
        HintUsed=interaction_df['HintRequested'].astype(int),
    )
    user_inter = interactions.groupby('UserID').agg(
        TotalPhishingSims=('PhishingAttempt', 'sum'),
        TotalHints=('HintUsed', 'sum'),
        TotalInteractions=('InteractionType', 'count'),
    )

    # Left joins keep every user present in the historical table.
    features = (
        user_hist
        .join(user_quiz, how='left')
        .join(user_inter, how='left')
        .fillna(0)
    )
    features['ReadyLabel'] = (features['AvgAwarenessScore'] >= ready_threshold).astype(int)
    return features


def _train_and_report(features):
    """Fit a random forest on the features and print hold-out metrics.

    ``AvgAwarenessScore`` is excluded from the model inputs because
    ``ReadyLabel`` is computed directly from it; keeping it would let the
    model read the label off a single column and make every metric
    meaningless. When all users share one label, training is skipped and a
    message is printed instead.

    Args:
        features: Per-user feature table including ``ReadyLabel``.
    """
    X = features.drop(columns=['ReadyLabel', 'AvgAwarenessScore'])
    y = features['ReadyLabel']

    class_counts = y.value_counts()
    if len(class_counts) < 2:
        print("Model Performance on Test Set:")
        print("Skipped: every user has the same readiness label, "
              "so there is nothing for a classifier to learn or evaluate.")
        return

    # Stratify when each class has at least two users so both labels appear
    # in the train and test partitions; otherwise fall back to a plain split.
    stratify = y if class_counts.min() >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=stratify
    )

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)

    print("Model Performance on Test Set:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))


def _report_click_rates(features, ready_threshold=75):
    """Print mean phishing click rates per readiness group and interpret them.

    The interpretation is chosen from the computed means rather than assumed:
    it reports a missing group, a lower rate for Ready users, or no
    improvement, whichever the data shows.

    Args:
        features: Per-user feature table with ``ReadyLabel`` and
            ``AvgClickRate``.
        ready_threshold: Threshold quoted in the interpretation text.
    """
    is_ready = features['ReadyLabel'] == 1
    # The mean of an empty group is NaN, which is handled explicitly below.
    avg_click_ready = features.loc[is_ready, 'AvgClickRate'].mean()
    avg_click_not_ready = features.loc[~is_ready, 'AvgClickRate'].mean()

    print(f"Average Phishing Click Rate (Ready): {avg_click_ready:.3f}")
    print(f"Average Phishing Click Rate (Not Ready): {avg_click_not_ready:.3f}")

    print("\nInterpretation:")
    if pd.isna(avg_click_ready) or pd.isna(avg_click_not_ready):
        print("No comparison is possible: at least one of the 'Ready' / 'Not Ready' "
              "groups contains no users.")
    elif avg_click_ready < avg_click_not_ready:
        print(f"Users classified as 'Ready' (Avg Awareness Score ≥ {ready_threshold}) "
              "exhibit lower phishing click rates, indicating improved readiness "
              "for CMMC 2.0 compliance.")
    else:
        print(f"Users classified as 'Ready' (Avg Awareness Score ≥ {ready_threshold}) "
              "do not exhibit lower phishing click rates in this data, so the "
              "awareness score alone does not demonstrate improved readiness.")

# Register the same callback on each upload widget so that a change to any
# widget's `value` re-runs the "are all three files present?" check.
for _uploader in (historical_uploader, quiz_uploader, interaction_uploader):
    _uploader.observe(on_upload_change, names='value')

Historical Metrics (first 5 rows):


Unnamed: 0,Date,UserID,ModuleID,AwarenessScore,PhishingClickRate,ModulesCompleted,AvgTimePerModule
0,2024-01-07,4,M2,61.4,0.126,1,14.3
1,2024-01-14,12,M4,56.9,0.021,3,19.2
2,2024-01-21,1,M2,54.5,0.136,1,6.0
3,2024-01-28,6,M4,52.7,0.254,5,22.8
4,2024-02-04,7,M6,63.1,0.155,3,11.4


Quiz Results (first 5 rows):


Unnamed: 0,Timestamp,UserID,QuizID,QuestionID,DifficultyLevel,UserAnswer,CorrectAnswer,IsCorrect,ResponseTimeSec
0,2025-01-24 12:00:00,17,Quiz4,Q1,1,B,C,False,28.0
1,2025-01-03 16:00:00,11,Quiz3,Q6,3,B,C,False,13.7
2,2025-02-21 17:00:00,4,Quiz1,Q10,2,D,A,False,33.6
3,2025-03-21 09:00:00,1,Quiz4,Q8,2,D,D,True,24.0
4,2025-05-08 23:00:00,7,Quiz1,Q2,1,B,A,False,22.7


Interaction Logs (first 5 rows):


Unnamed: 0,Timestamp,UserID,ModuleID,InteractionType,ResponseTimeSec,HintRequested
0,2025-04-21 06:00:00,16,M2,complete_module,60.5,False
1,2025-05-08 11:00:00,19,M1,start_module,26.9,True
2,2025-01-04 16:00:00,15,M8,complete_module,6.9,True
3,2025-03-06 12:00:00,8,M5,skip_question,57.6,False
4,2025-05-01 01:00:00,8,M7,phishing_sim,78.8,True


Engineered Features per User:


Unnamed: 0_level_0,AvgAwarenessScore,AvgClickRate,TotalModules,MeanModuleTime,QuizAccuracy,MeanResponseTime,TotalPhishingSims,TotalHints,TotalInteractions,ReadyLabel
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,62.616667,0.1025,18,17.333333,0.444444,22.744444,1,1,5,0
2,63.825,0.1225,29,15.3,0.285714,31.157143,2,4,11,0
3,57.833333,0.139,24,19.1,0.125,31.375,0,2,13,0
4,55.975,0.123,24,18.825,0.2,34.61,1,1,10,0
5,55.366667,0.131667,7,14.266667,0.428571,29.071429,3,2,9,0
6,58.04,0.1524,15,21.12,0.214286,38.321429,1,2,8,0
7,59.244444,0.130667,21,16.922222,0.2,33.29,1,1,8,0
8,61.45,0.1265,13,17.9,0.416667,32.875,3,6,21,0
9,69.266667,0.152,5,19.933333,0.111111,38.544444,0,4,8,0
10,52.083333,0.1665,17,18.233333,0.222222,39.677778,1,1,10,0


Model Performance on Test Set:
Accuracy: 1.00
Precision: 0.00
Recall: 0.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

Average Phishing Click Rate (Ready): nan
Average Phishing Click Rate (Not Ready): 0.138

Interpretation:
Users classified as 'Ready' (Avg Awareness Score ≥ 75) exhibit lower phishing click rates, indicating improved readiness for CMMC 2.0 compliance.
