# 🛡️ Cybersecurity Dataset Overview & Analysis

## 📊 Comprehensive Data Analysis for Data Analyst Internship

This notebook provides a thorough overview and analysis of the CloudWatch Traffic Web Attack dataset:
- **Dataset Profiling** - Shape, structure, and data quality assessment
- **Statistical Analysis** - Descriptive statistics and distributions
- **Data Visualization** - Charts and graphs for better understanding
- **Data Quality Assessment** - Missing values, duplicates, and anomalies
- **Feature Analysis** - Individual column analysis and insights

**Objective:** Understand the cybersecurity dataset in order to identify threats and patterns.

In [None]:
# 📦 Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt). This keeps cell output tidy but also
# hides deprecation notices from pandas / matplotlib / seaborn.
warnings.filterwarnings('ignore')

# Set plotting style for better visualizations
# Apply the 'seaborn-v0_8' matplotlib style sheet, then the seaborn 'husl' palette.
# NOTE(review): the palette is presumably set after the style sheet so the
# style's own colour cycle does not replace it — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Visible confirmation that the setup cell ran to completion.
print('🚀 Libraries imported successfully!')

✅ Dataset Shape: (282, 16)

✅ Column Names:
['bytes_in', 'bytes_out', 'creation_time', 'end_time', 'src_ip', 'src_ip_country_code', 'protocol', 'response.code', 'dst_port', 'dst_ip', 'rule_names', 'observation_name', 'source.meta', 'source.name', 'time', 'detection_types']

✅ First 5 Rows:
   bytes_in  bytes_out         creation_time              end_time  \
0      5602      12990  2024-04-25T23:00:00Z  2024-04-25T23:10:00Z   
1     30912      18186  2024-04-25T23:00:00Z  2024-04-25T23:10:00Z   
2     28506      13468  2024-04-25T23:00:00Z  2024-04-25T23:10:00Z   
3     30546      14278  2024-04-25T23:00:00Z  2024-04-25T23:10:00Z   
4      6526      13892  2024-04-25T23:00:00Z  2024-04-25T23:10:00Z   

            src_ip src_ip_country_code protocol  response.code  dst_port  \
0   147.161.161.82                  AE    HTTPS            200       443   
1     165.225.33.6                  US    HTTPS            200       443   
2  165.225.212.255                  CA    HTTPS            2

## 📂 1. Dataset Loading & Basic Information

In [None]:
# Read the CloudWatch web-attack traffic export into the notebook-wide frame `df`.
print('📂 Loading CloudWatch Traffic Web Attack Dataset...')
df = pd.read_csv("../data/CloudWatch_Traffic_Web_Attack.csv")

# Headline numbers: dimensions plus the deep (object-aware) memory footprint.
for headline in (
    f'✅ Dataset Shape: {df.shape}',
    f'📊 Rows: {df.shape[0]:,} | Columns: {df.shape[1]}',
    f'💾 Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB',
):
    print(headline)

# Numbered (1-based) listing of every column, one per line.
print(f'\n📝 Column Names ({len(df.columns)} total):')
for i, col in enumerate(df.columns, start=1):
    print(f'  {i:2d}. {col}')

In [None]:
# Preview both ends of the frame: print a caption, then render the table richly.
for caption, preview in (
    ('📋 First 5 Rows:', df.head()),
    ('\n📋 Last 5 Rows:', df.tail()),
):
    print(caption)
    display(preview)

## 🔍 2. Data Types & Quality Assessment

In [None]:
# Structural overview: index range, per-column non-null counts and dtypes.
print('🔍 DATASET INFO:')
print('=' * 40)
df.info()

# Tally how many columns share each dtype.
dtype_counts = df.dtypes.value_counts()
print('\n📊 Data Types Summary:')
for dtype, count in dtype_counts.items():
    print(f'  {dtype}: {count} columns')

# Split the column names by kind; later cells iterate over these two lists.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

for column_listing in (
    f'\n📈 Numeric Columns ({len(numeric_cols)}): {numeric_cols}',
    f'🏷️ Categorical Columns ({len(categorical_cols)}): {categorical_cols}',
):
    print(column_listing)

In [None]:
# Missing-value and duplicate-row audit.
print('🔍 MISSING VALUES ANALYSIS:')
print('=' * 35)

# Null tally per column, and the same figure as a percentage of all rows.
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# One row per column, then keep only the columns that actually have gaps.
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing Count': missing_data.values,
    'Missing %': missing_percent.values
}).loc[lambda table: table['Missing Count'] > 0]

if missing_df.empty:
    print('✅ No missing values found!')
else:
    print('Columns with missing values:')
    display(missing_df)

# Rows that are exact copies of an earlier row.
duplicate_count = df.duplicated().sum()
print(f'\n🔄 Duplicate Rows: {duplicate_count:,} ({duplicate_count/len(df)*100:.2f}%)')

## 📈 3. Statistical Analysis

In [None]:
# Summary statistics for every numeric column.
print('📈 DESCRIPTIVE STATISTICS:')
print('=' * 35)

if not numeric_cols:
    print('No numeric columns found')
else:
    # Standard describe() table (count, mean, std, min, quartiles, max).
    stats = df[numeric_cols].describe()
    display(stats)

    # Spread and shape measures that describe() does not report.
    print('\n📊 Additional Statistics:')
    for col in numeric_cols:
        for stat_line in (
            f'\n{col}:',
            f'  • Variance: {df[col].var():.2f}',
            f'  • Skewness: {df[col].skew():.2f}',
            f'  • Kurtosis: {df[col].kurtosis():.2f}',
            f'  • Range: {df[col].max() - df[col].min():.2f}',
            f'  • IQR: {df[col].quantile(0.75) - df[col].quantile(0.25):.2f}',
        ):
            print(stat_line)

In [None]:
# Profile each categorical column: cardinality, mode, and its ten most common values.
print('🏷️ CATEGORICAL DATA ANALYSIS:')
print('=' * 35)

for col in categorical_cols:
    print(
        f'\n📊 {col}:',
        f'  • Unique values: {df[col].nunique()}',
        f'  • Most frequent: {df[col].mode().iloc[0] if not df[col].mode().empty else "N/A"}',
        sep='\n',
    )

    # value_counts() is already sorted by frequency, so the first ten are the top ten.
    top_values = df[col].value_counts().iloc[:10]
    print(f'  • Top values:')
    for val, count in top_values.items():
        # Share of ALL rows (nulls included in the denominator).
        percentage = count / len(df) * 100
        print(f'    - {val}: {count:,} ({percentage:.1f}%)')

## 📊 4. Data Visualizations

In [None]:
# Histogram grid: one panel per numeric column, at most four panels per row.
if numeric_cols:
    n_cols = min(4, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of Axes, so a single ravel()
    # handles the 1x1, 1xN and MxN layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
    axes = axes.ravel()
    fig.suptitle('📊 Distribution of Numeric Features', fontsize=16, fontweight='bold')

    for ax, col in zip(axes, numeric_cols):
        ax.hist(df[col].dropna(), bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'{col} Distribution')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Panels past the last column exist only because the grid is rectangular; hide them.
    for unused_ax in axes[len(numeric_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Box-plot grid for eyeballing outliers: up to three panels per row.
if numeric_cols:
    n_cols = min(3, len(numeric_cols))
    n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

    # Ask for a 2-D Axes array unconditionally, then walk it as a flat list.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
    axes = list(axes.flat)
    fig.suptitle('📦 Box Plots for Outlier Detection', fontsize=16, fontweight='bold')

    for position, ax in enumerate(axes):
        if position >= len(numeric_cols):
            # Surplus panel from rounding the grid up: keep it out of sight.
            ax.set_visible(False)
            continue

        col = numeric_cols[position]
        ax.boxplot(df[col].dropna())
        ax.set_title(f'{col} Box Plot')
        ax.set_ylabel(col)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Bar + pie chart pair for each of the first three categorical columns.
for col in categorical_cols[:3]:
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Cap at the 15 most frequent values so the charts stay legible.
    top_values = df[col].value_counts().head(15)

    # Left panel: absolute counts.
    top_values.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title(f'{col} - Top 15 Values (Bar Chart)')
    bar_ax.set_xlabel(col)
    bar_ax.set_ylabel('Count')
    plt.setp(bar_ax.get_xticklabels(), rotation=45)
    bar_ax.grid(True, alpha=0.3)

    # Right panel: the same values as shares of the displayed total.
    top_values.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
    pie_ax.set_title(f'{col} - Distribution (Pie Chart)')
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## 🔗 5. Correlation Analysis

In [None]:
# Pairwise correlation between numeric columns: heatmap plus a list of strong pairs.
if len(numeric_cols) <= 1:
    print('Need at least 2 numeric columns for correlation analysis')
else:
    print('🔗 CORRELATION ANALYSIS:')
    print('=' * 30)

    corr_matrix = df[numeric_cols].corr()

    # Annotated heatmap, colour scale centred on zero.
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
        cbar_kws={'shrink': .8},
    )
    plt.title('🔗 Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Walk the strict upper triangle (row < column) so each pair appears once and
    # the diagonal is skipped; NaN correlations fail the comparison and drop out.
    upper_rows, upper_cols = np.triu_indices(len(corr_matrix.columns), k=1)
    high_corr_pairs = [
        (corr_matrix.columns[r], corr_matrix.columns[c], corr_matrix.iloc[r, c])
        for r, c in zip(upper_rows, upper_cols)
        if abs(corr_matrix.iloc[r, c]) > 0.7
    ]

    if not high_corr_pairs:
        print('\n✅ No highly correlated features found')
    else:
        print('\n🔴 Highly Correlated Features (|correlation| > 0.7):')
        for feat1, feat2, corr in high_corr_pairs:
            print(f'  • {feat1} ↔ {feat2}: {corr:.3f}')

## ⚠️ 6. Data Quality Issues & Recommendations

In [None]:
# Data quality assessment
# Builds two lists and prints them:
#   issues          - human-readable findings (also read by the summary-report
#                     cell, which derives the quality score from len(issues))
#   recommendations - suggested follow-ups for the findings
print('⚠️ DATA QUALITY ASSESSMENT:')
print('='*35)

issues = []
recommendations = []
total_rows = len(df)

# Check for missing values anywhere in the frame
if df.isnull().sum().sum() > 0:
    issues.append('Missing values detected')
    recommendations.append('Handle missing values through imputation or removal')

# Check for duplicates (counted once and reused in the message)
duplicate_rows = df.duplicated().sum()
if duplicate_rows > 0:
    issues.append(f'{duplicate_rows} duplicate rows found')
    recommendations.append('Remove duplicate rows to avoid bias')

# Check for potential outliers in numeric columns using the 1.5 * IQR fences.
# The flag accumulates over ALL columns: testing the loop variable after the
# loop would only reflect the last column, and would raise NameError when
# there are no numeric columns at all.
outliers_found = False
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)][col]
    if len(outliers) > 0:
        outliers_found = True
        issues.append(f'{len(outliers)} potential outliers in {col}')

# One shared recommendation, however many columns were flagged
if outliers_found:
    recommendations.append('Investigate and handle outliers appropriately')

# Check for high cardinality categorical columns (more than 90% of rows unique).
# An empty frame is skipped so the ratio never divides by zero.
for col in categorical_cols:
    unique_count = df[col].nunique()
    if total_rows > 0 and unique_count / total_rows > 0.9:
        issues.append(f'High cardinality in {col} ({unique_count} unique values)')
        recommendations.append(f'Consider feature engineering for {col}')

# Display issues and recommendations
if issues:
    print('🔴 Issues Found:')
    for i, issue in enumerate(issues, 1):
        print(f'  {i}. {issue}')
    
    print('\n💡 Recommendations:')
    for i, rec in enumerate(recommendations, 1):
        print(f'  {i}. {rec}')
else:
    print('✅ No major data quality issues found!')

## 📋 7. Summary Report

In [None]:
# Final roll-up of the figures gathered by the earlier cells.
print('📋 DATASET SUMMARY REPORT')
print('=' * 40)

# Label -> already formatted value. The quality score starts at 100 and loses
# 10 points per entry in `issues`, floored at 0.
summary_stats = {
    'Total Records': f"{len(df):,}",
    'Total Features': len(df.columns),
    'Numeric Features': len(numeric_cols),
    'Categorical Features': len(categorical_cols),
    'Memory Usage': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    'Missing Values': df.isnull().sum().sum(),
    'Duplicate Rows': df.duplicated().sum(),
    'Data Quality Score': f"{max(0, 100 - len(issues)*10)}/100"
}

for key in summary_stats:
    value = summary_stats[key]
    print(f"• {key}: {value}")

# Headline insights; the whole section is skipped when there are no numeric columns.
if numeric_cols:
    print('\n🎯 Key Insights:')

    # Numeric columns ordered from largest to smallest variance.
    variances = df[numeric_cols].var().sort_values(ascending=False)
    print(f'• Highest variance feature: {variances.index[0]} ({variances.iloc[0]:.2f})')

    # Modal value of the first two categorical columns and how often it occurs.
    for col in categorical_cols[:2]:
        modes = df[col].mode()
        mode_val = 'N/A' if modes.empty else modes.iloc[0]
        mode_count = df[col].eq(mode_val).sum()
        print(f'• Most common {col}: {mode_val} ({mode_count:,} occurrences)')

print('\n✅ Dataset overview analysis completed!')
print('🚀 Ready for advanced analysis and machine learning!')