# Statistical Analysis - Cybersecurity Attacks Dataset

## Overview
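This notebook analyses the cybersecurity attacks dataset in four steps: data loading and time parsing, descriptive statistics (summary statistics, skewness, and kurtosis), inferential statistics (a chi-square test of independence and a one-way ANOVA), and an exploratory correlation analysis of the numerical variables.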


In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, mannwhitneyu, normaltest
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot styling and pandas display defaults.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.options.display.max_columns = None  # None = never truncate the column list


## 1. Data Loading and Preparation


In [None]:
# Load and prepare data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv('../../data/Cybersecurity_attacks.csv')

# Remove leading/trailing whitespace from the header names.
df.columns = df.columns.str.strip()

# Discard the column literally named '.', when the file contains one.
df = df.drop(columns=['.'], errors='ignore')

# Parse Time column
if 'Time' in df.columns:
    def parse_time(time_str):
        """Split a raw 'Time' value into integer ``(start, end)`` bounds.

        Args:
            time_str: A single cell of the 'Time' column. Either a
                ``"<start>-<end>"`` string or a single integer-like value.

        Returns:
            tuple: ``(start, end)`` as ints. A value without a ``-`` yields
            the same integer twice. ``(None, None)`` is returned when the
            value is missing or cannot be parsed (non-numeric text, more or
            fewer than two ``-``-separated parts, non-finite floats).
        """
        if pd.isna(time_str):
            return None, None
        try:
            text = str(time_str)
            if '-' in text:
                # Unpacking raises ValueError unless there are exactly two parts.
                start, end = text.split('-')
                return int(start), int(end)
            return int(time_str), int(time_str)
        except (ValueError, TypeError, OverflowError):
            # Only genuine parse failures are treated as "no value"; a bare
            # `except:` would also swallow KeyboardInterrupt / SystemExit and
            # make a long `.apply` impossible to interrupt.
            return None, None
    
    # Run the parser on every row; each element is a (start, end) tuple.
    time_parsed = df['Time'].apply(parse_time)
    df['Time_Start'] = [bounds[0] for bounds in time_parsed]
    df['Time_End'] = [bounds[1] for bounds in time_parsed]

    # Difference between the parsed end and start values.
    df['Time_Duration'] = df['Time_End'].sub(df['Time_Start'])

    # Read the start value as seconds since the epoch; values that cannot be
    # converted are coerced to NaT instead of raising.
    start_timestamps = pd.to_datetime(df['Time_Start'], unit='s', errors='coerce')
    df['Datetime_Start'] = start_timestamps

    # Calendar features taken from the start timestamp.
    start_parts = start_timestamps.dt
    df['Hour'] = start_parts.hour
    df['DayOfWeek'] = start_parts.day_name()
    df['Month'] = start_parts.month

# Report the frame's dimensions and column names after preparation.
dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset Shape: {dataset_shape}")
print(f"Columns: {column_names}")


## 2. Descriptive Statistics


In [None]:
# Descriptive Statistics for Numerical Variables
# Candidate columns; only those actually present in df are analysed.
candidate_cols = ['Source Port', 'Destination Port', 'Time_Duration', 'Hour']
numerical_cols = [name for name in candidate_cols if name in df.columns]

banner = "=" * 60
print(banner)
print("DESCRIPTIVE STATISTICS")
print(banner)

# Per column: the describe() summary, then skewness and kurtosis to 4 decimals.
for col in numerical_cols:
    column_values = df[col]
    print(f"\n{col}:")
    print(column_values.describe())
    print(f"Skewness: {column_values.skew():.4f}")
    print(f"Kurtosis: {column_values.kurtosis():.4f}")


## 3. Hypothesis Testing


In [None]:
# Chi-square test for independence
# Attack category vs. protocol, each limited to its five most frequent values.
required_cols = {'Attack category', 'Protocol'}
if required_cols.issubset(df.columns):
    top_categories = df['Attack category'].value_counts().head(5).index
    top_protocols = df['Protocol'].value_counts().head(5).index

    # NOTE(review): each Series is filtered on its own condition; pd.crosstab
    # is expected to align them on the index so that only rows passing both
    # filters are counted — confirm against the pandas documentation.
    category_subset = df.loc[df['Attack category'].isin(top_categories), 'Attack category']
    protocol_subset = df.loc[df['Protocol'].isin(top_protocols), 'Protocol']
    contingency_table = pd.crosstab(category_subset, protocol_subset)

    test_result = chi2_contingency(contingency_table)
    chi2, p_value, dof, expected = test_result

    # 0.05 is the significance threshold used for the printed conclusion.
    verdict = 'Dependent' if p_value < 0.05 else 'Independent'
    print(f"Chi-square: {chi2:.4f}")
    print(f"p-value: {p_value:.6f}")
    print(f"Conclusion: {verdict}")


In [None]:
# ANOVA test
# One-way ANOVA of 'Destination Port' across the five most frequent attack categories.
required_cols = {'Attack category', 'Destination Port'}
if required_cols.issubset(df.columns):
    top_categories = df['Attack category'].value_counts().head(5).index.tolist()
    filtered_df = df[df['Attack category'].isin(top_categories)]

    # One array of non-null port values per category, in top_categories order.
    category_labels = filtered_df['Attack category']
    port_values = filtered_df['Destination Port']
    groups = [port_values[category_labels == cat].dropna().values
              for cat in top_categories]

    f_stat, p_value = f_oneway(*groups)

    # 0.05 is the significance threshold used for the printed conclusion.
    verdict = 'Means differ' if p_value < 0.05 else 'Means are similar'
    print(f"F-statistic: {f_stat:.4f}")
    print(f"p-value: {p_value:.6f}")
    print(f"Conclusion: {verdict}")


## 4. Correlation Analysis


In [None]:
# Correlation matrix
# Pairwise correlation (pandas' default method) between the columns listed in
# numerical_cols, which the descriptive-statistics cell defines.
correlation_matrix = df[numerical_cols].corr()

# Explicit figure/axes so the heatmap, title and saved file all refer to the
# same figure regardless of any other open pyplot state.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, square=True, ax=ax)
ax.set_title('Correlation Matrix', fontsize=16, fontweight='bold')
fig.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout can run this cell without FileNotFoundError.
output_path = Path('../../visualizations/correlation_matrix.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

print("Correlation Matrix:")
print(correlation_matrix.round(4))
