In [13]:
# Initial setup: loading data and importing libraries
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import seaborn as sns
from prettytable import PrettyTable
from scipy.stats import gaussian_kde

# Read data
# The path is relative: 'Train_data.csv' must be in the notebook's working directory.
df = pd.read_csv('Train_data.csv')

# Beautifying the outputs
# These are global pandas options, so they affect every later display in this kernel.
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Increase the display width
pd.set_option('display.float_format', '{:.2f}'.format)  # Display floats with 2 decimal places

# Get numeric and categorical columns
# Both lists are computed once from the freshly loaded frame; they are not
# refreshed automatically if df's columns or dtypes change afterwards.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

In [15]:
# Small helper function to make tables for the outputs
def maketable(data):
    """Build a two-column PrettyTable from key/value pairs.

    Parameters
    ----------
    data : mapping-like
        Any object whose ``.items()`` yields ``(label, value)`` pairs.

    Returns
    -------
    PrettyTable
        One row per pair, in iteration order. Column headers are not set
        here; callers assign ``field_names`` afterwards.
    """
    result = PrettyTable()
    for pair in data.items():
        label, value = pair
        result.add_row([label, value])
    return result

In [16]:
# Main plotting functions: the PMF, PDF and CDF plotters are near-copies of each other
def plot_pmf(df, column):
    """Draw the empirical probability mass function of one column as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the column whose value frequencies are plotted.

    Notes
    -----
    Probabilities are the relative frequencies returned by
    ``value_counts(normalize=True)``; with pandas' defaults that excludes
    missing values and orders the bars from most to least frequent.
    """
    probabilities = df[column].value_counts(normalize=True)

    fig, ax = plt.subplots(figsize=(15, 4))
    probabilities.plot(kind='bar', ax=ax)
    ax.set_title(f'PMF of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Probability')
    # Tilt the category labels so long names stay readable.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

def plot_pdf(df, column):
    """Draw a kernel density estimate (smoothed PDF) of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the column to estimate the density for.

    Notes
    -----
    ``bw_adjust=0.5`` halves seaborn's default bandwidth, and
    ``warn_singular=False`` silences the warning seaborn would otherwise
    emit for columns with zero variance.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.kdeplot(df[column], bw_adjust=0.5, warn_singular=False, ax=ax)
    ax.set_title(f'PDF of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    fig.tight_layout()
    plt.show()

def plot_cdf(df, column):
    """Draw the empirical cumulative distribution function of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the column to plot.

    Notes
    -----
    Missing values are dropped first. The i-th smallest remaining value is
    plotted at height ``i / n``, so the curve ends at exactly 1.
    """
    values = df[column].dropna().sort_values()
    n_obs = len(values)
    cumulative = np.arange(1, n_obs + 1) / n_obs

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(values, cumulative, linestyle='-', marker='')
    ax.set_title(f'Cumulative Distribution Function (CDF) of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Cumulative Probability')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [17]:
# The scatter plot function + the joint distribution function plotting
def plotscatter(numericalcols, data=None):
    """Scatter-plot two distinct, randomly chosen numeric columns against each other.

    Parameters
    ----------
    numericalcols : sequence of str
        Candidate column names; two different entries are drawn at random.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``plotscatter(cols)`` calls working.

    Returns
    -------
    None
        The figure is shown as a side effect. If fewer than two column names
        are supplied, a message is printed and nothing is plotted.
    """
    if len(numericalcols) < 2:
        print("Skipping scatter plot: at least two numeric columns are required.")
        return

    frame = df if data is None else data

    # random.randint() is inclusive at both ends, so the earlier
    # randint(1, len - 1) draws could never pick index 0 and could return the
    # same index twice (a column plotted against itself). Sampling two indices
    # without replacement covers every column and guarantees a distinct pair,
    # matching how the joint-distribution helper picks its columns.
    first_idx, second_idx = random.sample(range(len(numericalcols)), 2)
    x_field = numericalcols[first_idx]
    y_field = numericalcols[second_idx]

    plt.figure(figsize=(10, 6))
    plt.scatter(frame[x_field], frame[y_field], alpha=0.5, color='blue')
    plt.title(f'Scatter Plot between {x_field} and {y_field}')
    plt.xlabel(x_field)
    plt.ylabel(y_field)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def analyze_and_plot_joint_distribution(df, numericalcols, categoricalcols):
    """Plot joint distributions for randomly chosen column pairs.

    One pair of numeric columns is drawn as a 2-D kernel density plot and
    one pair of categorical columns as a joint-PMF heatmap. Each plot is
    skipped when its list holds fewer than two column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    numericalcols : sequence of str
        Candidate numeric column names.
    categoricalcols : sequence of str
        Candidate categorical column names.
    """

    def plot_joint_pmf(x_field, y_field):
        # Cell counts divided by the grand total give joint probabilities.
        counts = pd.crosstab(df[x_field], df[y_field])
        grand_total = counts.sum().sum()
        probabilities = counts / grand_total

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(probabilities, annot=True, cmap='Blues', ax=ax)
        ax.set_title(f'Joint PMF of {x_field} and {y_field}')
        # crosstab puts y_field on the columns, i.e. on the horizontal axis.
        ax.set_xlabel(y_field)
        ax.set_ylabel(x_field)
        plt.show()

    def plot_joint_pdf(x_field, y_field):
        # NOTE(review): duplicates are dropped before the KDE, so the density
        # is fitted to distinct (x, y) pairs rather than to every row.
        # Confirm this is intended.
        pairs = df[[x_field, y_field]].drop_duplicates().dropna()
        x_varies = pairs[x_field].nunique() >= 2
        y_varies = pairs[y_field].nunique() >= 2
        if not (x_varies and y_varies):
            print(f"Skipping {x_field} and {y_field} due to insufficient unique values.")
            return

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.kdeplot(
            x=pairs[x_field],
            y=pairs[y_field],
            fill=True,
            cmap='Blues',
            thresh=0,
            bw_adjust=0.5,
            ax=ax,
        )
        ax.set_title(f'Joint PDF of {x_field} and {y_field}')
        ax.set_xlabel(x_field)
        ax.set_ylabel(y_field)
        ax.grid(True)
        fig.tight_layout()
        plt.show()

    # Numeric pair first, categorical pair second.
    if len(numericalcols) > 1:
        first_idx, second_idx = random.sample(range(len(numericalcols)), 2)
        plot_joint_pdf(numericalcols[first_idx], numericalcols[second_idx])

    if len(categoricalcols) > 1:
        first_idx, second_idx = random.sample(range(len(categoricalcols)), 2)
        plot_joint_pmf(categoricalcols[first_idx], categoricalcols[second_idx])

In [19]:
# Checking the data and plotting accordingly
def analyze_and_plot_distributions(df, class_column):
    """Plot the marginal distribution of every column, chosen by dtype.

    Object and category columns get a PMF bar chart. Numeric columns get a
    PDF (computed on the rows where that column is not missing) followed by
    a CDF. Columns of any other dtype are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, in column order.
    class_column : str
        Currently unused; kept so existing calls keep working.
    """
    for column in df.columns:
        dtype = df[column].dtype

        if dtype == 'object' or dtype.name == 'category':
            plot_pmf(df, column)
            continue

        if np.issubdtype(dtype, np.number):
            non_missing = df.dropna(subset=[column])
            plot_pdf(non_missing, column)
            # plot_cdf receives the full frame, not the filtered one.
            plot_cdf(df, column)

In [21]:
# General overview of the data
# Listing columns and data types
columns = df.columns
print(columns, "\n")
# Despite the name, data_heads holds the per-column dtypes, not the first rows.
data_heads = df.dtypes
print(data_heads, "\n")

# Check for missing values
# This count is taken BEFORE the inf -> NaN replacement below, so any infinite
# entries are not included in the table printed here.
missing_data = df.isnull().sum()
table = maketable(missing_data)
table.field_names = ["Column", "Missing values"]
print("Number of missing values per column:\n", table, "\n")

# Replace infinite values
# Mutates df in place: code that runs after this line sees NaN where the frame
# held +inf or -inf.
# NOTE(review): if infinities should count as missing in the table above,
# this replacement would need to run before the count. Confirm intent.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class'], dtype='object') 

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
lan

In [26]:
# Restrict the summary to numeric columns using `select_dtypes`
numeric_df = df.select_dtypes(include=[np.number])

# Column-wise maximum
max_values = numeric_df.max()
table1 = maketable(max_values)
table1.field_names = ["Column", "Max values"]

# Column-wise minimum
min_values = numeric_df.min()
table2 = maketable(min_values)
table2.field_names = ["Column", "Min values"]

# Column-wise mean
mean_values = numeric_df.mean()
table3 = maketable(mean_values)
table3.field_names = ["Column", "Mean values"]

# Column-wise variance (pandas' default, i.e. the sample variance with ddof=1)
variance_values = numeric_df.var()
table4 = maketable(variance_values)
table4.field_names = ["Column", "Variance values"]

# Report the four tables in a fixed order
print("Maximum values:\n", table1, "\n")
print("Minimum values:\n", table2, "\n")
print("Mean values:\n", table3, "\n")
print("Variance values:\n", table4, "\n")


Maximum values:
 +-----------------------------+-------------+
|            Column           |  Max values |
+-----------------------------+-------------+
|           duration          |   42862.0   |
|          src_bytes          | 381709090.0 |
|          dst_bytes          |  5151385.0  |
|             land            |     1.0     |
|        wrong_fragment       |     3.0     |
|            urgent           |     1.0     |
|             hot             |     77.0    |
|      num_failed_logins      |     4.0     |
|          logged_in          |     1.0     |
|       num_compromised       |    884.0    |
|          root_shell         |     1.0     |
|         su_attempted        |     2.0     |
|           num_root          |    975.0    |
|      num_file_creations     |     40.0    |
|          num_shells         |     1.0     |
|       num_access_files      |     8.0     |
|      num_outbound_cmds      |     0.0     |
|        is_host_login        |     0.0     |
|        is_guest