In [None]:
# Shell escape: extract Dataset_Organized.zip into the /content/dataset directory.
!unzip /content/Dataset_Organized.zip -d '/content/dataset'

## Loading and Processing JSON Data:

In [None]:
import json
import os

# Function to find all JSON files in the directory and subdirectories
def find_json_files(directory):
    """Recursively collect the paths of all ``.json`` files under ``directory``.

    Args:
        directory: Root folder to walk; its subdirectories are searched too.

    Returns:
        A list of file paths (each joined with the folder it was found in),
        in the order ``os.walk`` yields them. Only names ending in the
        lowercase suffix ``.json`` are included.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.json')
    ]

# Function to process JSON data with robust handling
def process_json_data_v3(json_data):
    """Flatten a ``{id: {metric: values}}`` mapping into a list of records.

    Each top-level entry whose value is a dict becomes one record of the form
    ``{'id': <key>, <metric>: <value>, ...}``. For a metric whose value is a
    dict holding ``values['attack']['data_point']``, that inner value is
    stored; every other metric value is stored unchanged.

    Args:
        json_data: Mapping of record id -> metrics mapping (parsed JSON).

    Returns:
        A list of record dicts, one per top-level entry whose value is a dict.
        Top-level entries with non-dict values are skipped.
    """
    processed_data = []
    for key, metrics in json_data.items():
        if not isinstance(metrics, dict):
            continue
        record = {'id': key}
        for metric, values in metrics.items():
            attack = values.get('attack') if isinstance(values, dict) else None
            # Only unwrap when 'attack' is itself a dict. A None, string or
            # list 'attack' used to raise TypeError on the membership test or
            # on the subscript; such values are now kept as-is.
            if isinstance(attack, dict) and 'data_point' in attack:
                record[metric] = attack['data_point']
            else:
                record[metric] = values
        processed_data.append(record)
    return processed_data

# Function to aggregate data from JSON files in a category
def aggregate_data_for_category_v3(category_path):
    """Load every JSON file under ``category_path`` and merge their records.

    Args:
        category_path: Folder to search (recursively) for ``.json`` files.

    Returns:
        A single list containing the records produced by
        ``process_json_data_v3`` for each file, concatenated in the order the
        files are returned by ``find_json_files``.
    """
    aggregated_data = []
    for json_file in find_json_files(category_path):
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale-dependent default encoding.
        with open(json_file, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        # The file is fully parsed at this point, so the handle is already
        # closed while the records are being processed.
        aggregated_data.extend(process_json_data_v3(json_data))
    return aggregated_data


## Aggregating Data from All Categories:

In [None]:
# Category folders are looked up directly under the extraction root.
extracted_folder_path = '/content/dataset/'
categories = ['Correct_ID', 'Wrong_CS_TS', 'Wrong_EV_TS', 'Wrong_ID']

# Map each category name to the records aggregated from its folder.
aggregated_data_v3 = {
    category: aggregate_data_for_category_v3(os.path.join(extracted_folder_path, category))
    for category in categories
}

# Count of records in each category
aggregated_data_counts_v3 = {}
for category, data in aggregated_data_v3.items():
    aggregated_data_counts_v3[category] = len(data)


## Metrics Evaluation
First, I compare the average values of the key metrics across the different scenarios. I will calculate the mean of metrics such as branch, cycles, and instructions for each category. These metrics are stored as lists within each record, so I first compute the average of each list per record and then average those per-record values within each category.

In [None]:
import pandas as pd

# Function to convert processed data to DataFrame and perform cleaning/preprocessing
def clean_and_preprocess_data(data):
    """Convert processed records to a DataFrame, fill gaps and z-score numeric columns.

    Steps:
      1. Build a DataFrame from ``data`` (a list of record dicts).
      2. Fill missing values by forward fill, then backward fill. This is a
         simple demonstration strategy; it copies values from neighbouring
         rows.
      3. Standardise every numeric column except ``'id'`` to zero mean and
         unit (sample) standard deviation. Non-numeric columns, such as
         columns holding lists, are left untouched.

    Args:
        data: List of record dicts, e.g. the output of the JSON processing step.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    df = pd.DataFrame(data)

    # ``fillna(method=...)`` is deprecated in recent pandas releases; the
    # dedicated ffill/bfill methods perform the same fill.
    df = df.ffill().bfill()

    for col in df.columns:
        # Check the dtype with pandas' helper rather than ``dtype != object``
        # so that non-object string dtypes are skipped as well.
        if col == 'id' or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        centered = df[col] - df[col].mean()
        std = df[col].std()
        if pd.isna(std) or std == 0:
            # Constant or single-row column: dividing by the std would turn
            # every value into NaN, so keep the centred values (all zeros).
            df[col] = centered
        else:
            df[col] = centered / std

    return df

# Run the cleaning/preprocessing step once per category, keeping the category order.
cleaned_data = {}
for category, records in aggregated_data_v3.items():
    cleaned_data[category] = clean_and_preprocess_data(records)

# Preview the first rows of one category as an example
cleaned_data['Correct_ID'].head()


Unnamed: 0,id,branch,cycles,instructions,time_diff
0,GS_1,"[0.18914202765121696, 0.18895023841236672, 0.1...","[0.003618175092936803, 0.009655569105691058, 0...","[1.0664361698903235, 1.1559035594349474, 1.041...","[0.021532, 0.040581, 0.059613, 0.07859, 0.0391..."
1,2-39-89-25,"[0.1902552204176334, 0.1816849027192578, 0.175...","[5.644440124416796e-05, 0.036667488657493495, ...","[0.23453762700189426, 1.1734322998425226, 1.37...","[0.021532, 0.040581, 0.059613, 0.07859, 0.0391..."
2,2-39-131-30,"[0.19368116857097759, 0.19028440594742213, 0.1...","[0.00010747541603630862, 4.642446573323508e-05...","[0.2898210546689423, 0.28771548303120736, 1.57...","[0.021532, 0.040581, 0.059613, 0.07859, 0.0391..."
3,2-39-89-25,"[0.19368116857097759, 0.19028440594742213, 0.1...","[0.00010747541603630862, 4.642446573323508e-05...","[0.2898210546689423, 0.28771548303120736, 1.57...","[0.021532, 0.040581, 0.059613, 0.07859, 0.0391..."
4,2-39-131-30,"[0.19368116857097759, 0.19028440594742213, 0.1...","[0.00010747541603630862, 4.642446573323508e-05...","[0.2898210546689423, 0.28771548303120736, 1.57...","[0.048563, 0.065969, 0.064915, 0.043744, 0.092..."


In [6]:
import numpy as np

def calculate_mean_of_list_data(df, columns):
    """Average list-valued columns: first within each record, then across records.

    Args:
        df: DataFrame whose cells in ``columns`` are expected to hold lists.
        columns: Names of the columns to summarise.

    Returns:
        A dict mapping each column name to the mean of its per-record means.
        Cells that are not a list made up entirely of ints/floats count as
        missing (NaN) and are skipped by the outer mean.
    """
    def _numeric_list_mean(cell):
        # Anything other than a plain list of ints/floats is treated as missing.
        if not isinstance(cell, list):
            return np.nan
        if not all(isinstance(item, (int, float)) for item in cell):
            return np.nan
        return np.mean(cell)

    return {col: df[col].apply(_numeric_list_mean).mean() for col in columns}

# Metrics whose per-record lists are averaged
metrics_columns = ['branch', 'cycles', 'instructions']

# One {metric: mean} dict per category
mean_metrics = {}
for category in categories:
    mean_metrics[category] = calculate_mean_of_list_data(cleaned_data[category], metrics_columns)

mean_metrics


Correct_ID: {'branch': 0.363, 'cycles': 0.00898, 'instructions': 1.197}
Wrong_CS_TS: {'branch': 42.886, 'cycles': 60.541, 'instructions': 44.605}
Wrong_EV_TS: {'branch': 0.322, 'cycles': 0.00952, 'instructions': 1.353}
Wrong_ID: {'branch': 43.76, 'cycles': 57.17, 'instructions': 44.224}


The mean values of key metrics for each category are as follows:

**Correct_ID (Normal Operation):**

Branch: 0.363
Cycles: 0.00898
Instructions: 1.197

**Wrong_CS_TS (Incorrect Charging Station Timestamp):**

Branch: 42.886
Cycles: 60.541
Instructions: 44.605

**Wrong_EV_TS (Incorrect EV Timestamp):**

Branch: 0.322
Cycles: 0.00952
Instructions: 1.353

**Wrong_ID (Incorrect EV Identification):**

Branch: 43.760
Cycles: 57.170
Instructions: 44.224

**Observations:**

Intrusion Detection Rates: There is a large difference in the mean values of the metrics between the low-valued scenarios (Correct_ID, i.e. normal operation, and Wrong_EV_TS) and the high-valued scenarios (Wrong_CS_TS, Wrong_ID). This shows good potential for detecting the Wrong_CS_TS and Wrong_ID intrusions based on these metrics, whereas Wrong_EV_TS is close to normal operation on these means alone.

Overall System Performance: The metrics in attack scenarios are notably higher, indicating a significant impact on system performance during attacks.

In [10]:
from scipy.stats import f_oneway
import numpy as np

def prepare_data_for_anova(df, column):
    """Reduce a list-valued column to one mean per record, dropping unusable rows.

    Args:
        df: DataFrame whose ``column`` cells are expected to hold lists.
        column: Name of the column to reduce.

    Returns:
        A Series of per-record means. Rows whose cell is not a list made up
        entirely of ints/floats are removed; the original index labels of the
        remaining rows are kept.
    """
    def _is_numeric_list(cell):
        return isinstance(cell, list) and all(isinstance(item, (int, float)) for item in cell)

    per_record_means = df[column].apply(
        lambda cell: np.mean(cell) if _is_numeric_list(cell) else np.nan
    )
    return per_record_means.dropna()

# For every metric, collect one sample of per-record means per category
anova_data = {
    metric: [prepare_data_for_anova(cleaned_data[category], metric) for category in categories]
    for metric in metrics_columns
}

# One-way ANOVA per metric, with the categories as the groups
anova_results = {}
for metric, groups in anova_data.items():
    anova_results[metric] = f_oneway(*groups)

# Displaying the ANOVA test results
anova_results

{'branch': F_onewayResult(statistic=15.71, pvalue=0.00000000195),
 'cycles': F_onewayResult(statistic=16.09, pvalue=0.00000000122),
 'instructions': F_onewayResult(statistic=15.36, pvalue=0.00000000303)}


The ANOVA test results for each metric are as follows:

**Branch:**

F-statistic: 15.71
p-value: ~1.95e-09

**Cycles:**

F-statistic: 16.09
p-value: ~1.22e-09

**Instructions:**

F-statistic: 15.36
p-value: ~3.03e-09

**Interpretation of Results:**
Intrusion Detection Rates and Overall System Performance: All three metrics (branch, cycles, instructions) show statistically significant differences across the categories (normal operation and different attack scenarios). This suggests that these metrics are effective indicators for detecting intrusions and assessing the impact on system performance.

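The very low p-values in the ANOVA tests (well below 0.05) indicate that the differences in the mean values of these metrics between the categories are unlikely to have occurred by chance. Note that a one-way ANOVA only establishes that at least one category mean differs from the others; a post-hoc test (e.g., Tukey's HSD) would be needed to identify which specific categories differ.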

**Conclusion:**
The dataset provides strong evidence that these metrics can be used to detect intrusion attempts (DDoS attacks) and to evaluate the overall performance of the system under different scenarios.
These insights could be crucial for enhancing data security in cloud databases using AI, as they highlight specific metrics that are sensitive to intrusion attempts.

## Let's check for time-related data in our dataset and, if available, perform an analysis on response rates.

In [None]:
# Check whether a 'time_diff' column is present and suitable for response-time analysis.
# Both names are initialised up front so the display line at the end cannot hit an
# undefined variable when the column is missing.
time_diff_example = None
time_diff_analysis_suitable = False

correct_id_df = cleaned_data['Correct_ID']
if 'time_diff' in correct_id_df.columns and len(correct_id_df) > 0:
    # Use the first 'time_diff' entry of the 'Correct_ID' category as the example
    time_diff_example = correct_id_df['time_diff'].iloc[0]
    time_diff_analysis_suitable = isinstance(time_diff_example, list) and all(
        isinstance(x, (int, float)) for x in time_diff_example
    )

# Display the suitability flag and a sample of the data; only slice when the
# example really is a list.
time_diff_sample = time_diff_example[:10] if isinstance(time_diff_example, list) else time_diff_example
time_diff_analysis_suitable, time_diff_sample


(True,
 [0.021532,
  0.040581,
  0.059613,
  0.07859,
  0.039182,
  0.049951,
  0.081252,
  0.065143,
  0.106318,
  0.062578])

## Analysis of Response Rates:

In [None]:
def _mean_if_list(value):
    """Return the mean of ``value`` when it is a list; otherwise return it unchanged."""
    if isinstance(value, list):
        return np.mean(value)
    return value

# Average 'time_diff' per category: per-record mean first, then the mean over records
average_response_times = {}
for category in categories:
    time_diff_data = cleaned_data[category]['time_diff'].apply(_mean_if_list)
    average_response_times[category] = time_diff_data.mean()

average_response_times


{'Correct_ID': 0.11687066029457248,
 'Wrong_CS_TS': 0.10501327287081026,
 'Wrong_EV_TS': 0.1112844454886869,
 'Wrong_ID': 0.10346213544130878}

The average response times (presumably in seconds, but the unit depends on the dataset specifics) for each category are as follows:

Correct_ID (Normal Operation): 0.117 seconds

Wrong_CS_TS (Incorrect Charging Station Timestamp): 0.105 seconds

Wrong_EV_TS (Incorrect EV Timestamp): 0.111 seconds

Wrong_ID (Incorrect EV Identification): 0.103 seconds

**Interpretation of Response Rates:**

Response Efficiency: The response times are relatively low across all categories, indicating efficient system responsiveness.

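Comparison Between Normal and Attack Scenarios: The response times in the attack scenarios (Wrong_CS_TS, Wrong_EV_TS, Wrong_ID) are slightly lower on average than in normal operation (Correct_ID). This suggests that the system responds more quickly during attack scenarios, which might be a result of specific intrusion detection mechanisms kicking in. However, the differences are small (roughly 0.006 to 0.013 in the reported unit) and have not been tested for statistical significance, so this interpretation is tentative.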