Title: Random Missing Value Injection into CSV Dataset

Algorithm Methodology:

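Data Importing: Import the required Python libraries: pandas (CSV loading and saving), numpy (the NaN value), random (column and row sampling), os (building the output file path), and time (measuring how long each step takes).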

Function Definition: Define the add_missing_values function which takes as input a CSV file, number of columns where missing values are to be added, the number of missing values to be added, and a list of column names to be excluded from the operation.

Data Loading: Load the CSV file into a pandas DataFrame.

Initial Missing Values Calculation: Calculate the initial number of missing values in each column by calling .isnull().sum() on the pandas DataFrame, and print the per-column counts.

Column Selection: From the list of all columns in the DataFrame, exclude the columns provided in the exclude_columns list. After that, randomly select a specified number of columns from the DataFrame where missing values are to be added.

Missing Values Addition: Divide the requested number of missing values evenly among the selected columns using integer division. If the division leaves a remainder, the leftover cells are handed out one at a time to the first selected columns, so those columns each receive one extra missing value. For each selected column, randomly select that many row indices and set the cells at those indices to missing (NaN).

Final Missing Values Calculation: Calculate the final number of missing values in the dataset and print the number for each column.

Data Saving: Save the DataFrame with added missing values into a new CSV file. Print the name and location of the new file.

Function Call: Call the add_missing_values function providing the path of the CSV file, number of columns, number of missing cells to be added, and a list of columns to be excluded.

The algorithm provides a way to intentionally introduce missing values into a dataset. This might be useful for testing and comparing different methods of handling missing data in data pre-processing or modeling tasks.






In [1]:
import pandas as pd
import numpy as np
import random
import os
import time

def add_missing_values(csv_file, num_columns, num_cells, exclude_columns):
    """Inject missing values into random columns of a CSV and save a copy.

    The CSV is loaded, ``num_columns`` columns are chosen at random from the
    columns not listed in ``exclude_columns``, and ``num_cells`` cells in
    total are set to NaN. Cells are split evenly across the chosen columns;
    any remainder goes one cell at a time to the first chosen columns.
    Only cells that currently hold a value are eligible, so every injected
    NaN is a new missing value. The result is written next to the input as
    ``new_<original file name>`` and per-step timings are printed.

    Args:
        csv_file: Path of the CSV file to read.
        num_columns: How many columns should receive missing values.
        num_cells: Total number of missing values to add.
        exclude_columns: Column names that must not be altered (may be
            empty or None).

    Returns:
        None. If the request cannot be satisfied (too many columns, a
        non-positive column count, a negative cell count, or a chosen column
        without enough non-missing cells) a message is printed and no output
        file is written.
    """
    # Load the csv file into a pandas DataFrame
    start_time = time.perf_counter()
    df = pd.read_csv(csv_file)
    load_time = time.perf_counter() - start_time

    # Count the initial number of missing values
    initial_missing_values = df.isnull().sum()
    print(f"Initial number of missing values:\n{initial_missing_values}")

    # Columns eligible for alteration: everything not explicitly excluded
    excluded = set(exclude_columns or ())
    candidate_columns = [col for col in df.columns if col not in excluded]

    # Check if the number of columns to be altered exceeds the total columns
    if num_columns > len(candidate_columns):
        print("The number of columns to be altered is more than the total number of columns in the file.")
        return

    # A zero column count would divide by zero below; negative counts make
    # no sense for sampling. Reject both up front.
    if num_columns <= 0 or num_cells < 0:
        print("The number of columns must be at least 1 and the number of cells must not be negative.")
        return

    # Randomly select 'num_columns' columns
    selected_columns = random.sample(candidate_columns, num_columns)

    # Even split, with the remainder spread over the first few columns
    cells_per_column, extra_cells = divmod(num_cells, num_columns)
    quotas = [
        cells_per_column + (1 if i < extra_cells else 0)
        for i in range(num_columns)
    ]

    # Validate every column before touching the data so a failure never
    # leaves the DataFrame partially modified.
    for col, quota in zip(selected_columns, quotas):
        available = int(df[col].notna().sum())
        if quota > available:
            print(
                f"Column '{col}' has only {available} non-missing cells; "
                f"cannot add {quota} missing values."
            )
            return

    step_times = []
    for col, quota in zip(selected_columns, quotas):
        # Sample row positions only among cells that currently hold a value,
        # so the requested number of *new* missing values is actually added.
        start_time = time.perf_counter()
        filled_positions = np.flatnonzero(df[col].notna().to_numpy()).tolist()
        row_positions = random.sample(filled_positions, quota)
        random_sampling_time = time.perf_counter() - start_time

        # Blank the chosen cells through a positional boolean mask. This does
        # not depend on the index labels and lets pandas pick a dtype that
        # can hold NaN instead of upcasting during in-place assignment.
        start_time = time.perf_counter()
        to_blank = np.zeros(len(df), dtype=bool)
        to_blank[np.asarray(row_positions, dtype=int)] = True
        df[col] = df[col].mask(to_blank)
        missing_values_addition_time = time.perf_counter() - start_time

        step_times.append({
            'Column': col,
            'Random Sampling Time': random_sampling_time,
            'Missing Values Addition Time': missing_values_addition_time
        })

    # Count the final number of missing values
    final_missing_values = df.isnull().sum()
    print(f"Final number of missing values:\n{final_missing_values}")

    # Save the DataFrame to a new csv file alongside the original
    start_time = time.perf_counter()
    base_dir, file_name = os.path.split(csv_file)
    name, extension = os.path.splitext(file_name)
    new_file = os.path.join(base_dir, 'new_' + name + extension)
    df.to_csv(new_file, index=False)
    saving_time = time.perf_counter() - start_time

    print(f"Modified CSV saved as '{new_file}'")
    print("Execution times:")
    print(f"- Loading CSV: {load_time:.4f} seconds")
    for step_time in step_times:
        print(f"- Column: {step_time['Column']}")
        print(f"  - Random Sampling Time: {step_time['Random Sampling Time']:.4f} seconds")
        print(f"  - Missing Values Addition Time: {step_time['Missing Values Addition Time']:.4f} seconds")
    print(f"- Saving CSV: {saving_time:.4f} seconds")

# Use the function. Guarded so that importing this file does not read and
# rewrite the dataset as a side effect; the block still runs when the file is
# executed directly as a script.
if __name__ == "__main__":
    # Path of the CSV file to modify
    dataset_file = "D:\\data\\Outlier_dataset\\a part of training and testing set\\UNSW_NB15_training-set.csv"
    # Columns that must never receive missing values. The list may be empty,
    # but the target column should normally be excluded.
    exclude_columns = ['label']
    # Add 100 missing values spread over 3 randomly chosen columns
    add_missing_values(dataset_file, 3, 100, exclude_columns)


Initial number of missing values:
id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
