# Processing notebook for xInfoDump data

In [None]:
import pandas as pd
import os
import re
import glob
import datetime
import shutil

## Settings

- `SOURCE_GLOB`: glob pattern that matches every source CSV file to process.
- `TEST_ID_REGEX`: regular expression with exactly one capture group, matched against each source file path. The capture group must match an integer; that integer identifies the test across the dataset (the `Test ID` column in merged CSVs).
- `OUTPUT_DIR`: directory the dataset is written to.
- `TEST_SCENARIO`: name of the ns-O-RAN/ns-3 scenario file that was tested (recorded in the generated README for traceability).

In [None]:
# Glob pattern selecting the source CSV files.
SOURCE_GLOB = os.path.join("DataSource", "DumpedData*.csv")
# Matched against each source file path; its single capture group is the integer test number.
# Raw string with an escaped dot so that only a literal ".csv" suffix matches.
TEST_ID_REGEX = r"DumpedData_Test([0-9]+)\.csv"
# Root directory of the generated dataset.
OUTPUT_DIR = "Dataset-0"
# Scenario file that produced the data; written to the dataset README.
TEST_SCENARIO = "scratch/scenario-zero.cc"

## Environment preparation

In [None]:
# Source files to process. glob order depends on the filesystem, so sort the
# paths to make the row order of the merged CSVs reproducible between runs.
OG_FILES = sorted(glob.glob(SOURCE_GLOB))
print(f"Loaded {len(OG_FILES)} files")

In [None]:
# Create the "Separated" sub-folder of every pipeline stage; the stage folders
# themselves are created implicitly as parents.
for stage_name in ("Raw", "Split", "Processed", "Time-Processed"):
    separated_dir = os.path.join(OUTPUT_DIR, stage_name, "Separated")
    os.makedirs(separated_dir, exist_ok=True)

In [None]:
# Maps the column names found in the dumped CSVs to the human-readable names
# used in the "Processed" and "Time-Processed" outputs. Each source column
# appears exactly once.
RENAMING_MAP = {
    "pm-Containers.pLMN-Identity": "PLMN ID",
    "list-of-matched-UEs.ueId": "UE ID",
    "cellObjectID": "Cell object ID",
    "timestamp": "Timestamp",
    "list-of-matched-UEs.pmType": "UE Performance Measurement type",
    "pm-Containers.dl-PRBUsage": "DL PRB usage",
    "pm-Containers.ul-PRBUsage": "UL PRB usage",
    "pm-Containers.dl-TotalofAvailablePRBs": "DL total available PRBs",
    "pm-Containers.ul-TotalofAvailablePRBs": "UL total available PRBs",
    "pm-Containers.nRCGI.nRCellIdentity": "NRCI",
    "list-of-matched-UEs.pmVal": "UE Performance Measurement value",
    "pm-Containers.QCI": "QCI",
    "pm-Containers.drbqci": "DRB QCI",
    # "PDCP", spelled the same way as the UL counterpart below.
    "pm-Containers.pDCPBytesDL": "DL PDCP Bytes",
    "pm-Containers.pDCPBytesUL": "UL PDCP Bytes",
    "pm-Containers.interface-type": "Interface type",
    "pm-Containers.numberOfActive-UEs": "Number of active UEs",
    "list-of-matched-UEs.rrcEvent": "RRC Event",
    "list-of-matched-UEs.measResultNeighCells.resultsSSB-Cell.sinr": "Neighbor cells SINR",
    "list-of-matched-UEs.measResultNeighCells.physCellId": "Neighbor cell physical cell ID",
}

# Columns removed from every per-node-type export, in addition to the
# all-NaN columns that the processing cells drop.
EXTRA_DROP = [
    "pm-Containers.type",
    "pm-Containers.nRCGI.pLMN-Identity",
]

## Raw file copy and processing

In [None]:
# Copy every source CSV into Raw/Separated under a normalised name and build
# one merged CSV in which each row is tagged with the test it came from.
raw_frames = []
for filename in OG_FILES:
    id_match = re.search(TEST_ID_REGEX, filename)
    if id_match is None:
        # SOURCE_GLOB may select files that TEST_ID_REGEX does not recognise.
        raise ValueError(f"Cannot extract a test ID from {filename!r} using TEST_ID_REGEX")
    test_id = int(id_match.group(1))
    shutil.copy2(filename, os.path.join(OUTPUT_DIR, "Raw", "Separated", f"DumpedData_Test{test_id}.csv"))
    # The unnamed first column is the index written by the dump; keep it under an explicit name.
    raw_df = pd.read_csv(filename).rename(columns={"Unnamed: 0": "Original index"})
    raw_df["Test ID"] = test_id
    raw_frames.append(raw_df)

if not raw_frames:
    raise FileNotFoundError("OG_FILES is empty: there are no source files to merge")

# Concatenate once instead of re-copying the growing frame on every iteration.
merged_df = pd.concat(raw_frames, ignore_index=True)
merged_df.to_csv(os.path.join(OUTPUT_DIR, "Raw", "DumpedData_Merged.csv"))

## Process DU data only

In [None]:
# Extract the 'oDU' rows of every source file and export them in two flavours:
#   * "Split":     original column names, "Original index" kept.
#   * "Processed": friendly column names (RENAMING_MAP), rows sorted by timestamp.
# Each flavour is written once per test ("Separated", without a "Test ID"
# column) and once as a merged CSV that carries a "Test ID" column.
du_frames = []
raw_du_frames = []
for filename in OG_FILES:
    id_match = re.search(TEST_ID_REGEX, filename)
    if id_match is None:
        # SOURCE_GLOB may select files that TEST_ID_REGEX does not recognise.
        raise ValueError(f"Cannot extract a test ID from {filename!r} using TEST_ID_REGEX")
    test_id = int(id_match.group(1))
    raw_df = pd.read_csv(filename).rename(columns={"Unnamed: 0": "Original index"})
    du_df = raw_df.query("`pm-Containers.type` == 'oDU'").copy().drop(columns=EXTRA_DROP)
    # Drop columns that hold no value at all for this node type.
    du_df = du_df.drop(columns=du_df.columns[du_df.isna().all()].to_list()).reset_index(drop=True)
    du_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "Separated", f"DUData_Test{test_id}.csv"))

    du_raw_df = du_df.copy()
    du_raw_df["Test ID"] = test_id
    raw_du_frames.append(du_raw_df)

    du_df = (
        du_df.drop(columns="Original index")
        .rename(columns=RENAMING_MAP)
        .sort_values("Timestamp")
        .reset_index(drop=True)
    )
    # Written before "Test ID" is added: the per-test file does not need it.
    du_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "Separated", f"DUData_Test{test_id}.csv"))
    du_df["Test ID"] = test_id
    du_frames.append(du_df)

if not du_frames:
    raise FileNotFoundError("OG_FILES is empty: there is no DU data to merge")

# Concatenate once instead of re-copying the growing frames on every iteration.
merged_du_df = pd.concat(du_frames, ignore_index=True)
merged_raw_du_df = pd.concat(raw_du_frames, ignore_index=True)
merged_du_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "DUData_Merged.csv"))
merged_raw_du_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "DUData_Merged.csv"))

## Process CU-UP data only

In [None]:
# Extract the 'oCU-UP' rows of every source file and export them in two flavours:
#   * "Split":     original column names, "Original index" kept.
#   * "Processed": friendly column names (RENAMING_MAP), rows sorted by timestamp.
# Each flavour is written once per test ("Separated", without a "Test ID"
# column) and once as a merged CSV that carries a "Test ID" column.
# All outputs live under OUTPUT_DIR, in the folders the preparation cell creates.
cu_up_frames = []
raw_cu_up_frames = []
for filename in OG_FILES:
    id_match = re.search(TEST_ID_REGEX, filename)
    if id_match is None:
        # SOURCE_GLOB may select files that TEST_ID_REGEX does not recognise.
        raise ValueError(f"Cannot extract a test ID from {filename!r} using TEST_ID_REGEX")
    test_id = int(id_match.group(1))
    raw_df = pd.read_csv(filename).rename(columns={"Unnamed: 0": "Original index"})
    cu_up_df = raw_df.query("`pm-Containers.type` == 'oCU-UP'").copy().drop(columns=EXTRA_DROP)
    # Drop columns that hold no value at all for this node type.
    cu_up_df = cu_up_df.drop(columns=cu_up_df.columns[cu_up_df.isna().all()].to_list()).reset_index(drop=True)
    cu_up_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "Separated", f"CU-UPData_Test{test_id}.csv"))

    cu_up_raw_df = cu_up_df.copy()
    cu_up_raw_df["Test ID"] = test_id
    raw_cu_up_frames.append(cu_up_raw_df)

    cu_up_df = (
        cu_up_df.drop(columns="Original index")
        .rename(columns=RENAMING_MAP)
        .sort_values("Timestamp")
        .reset_index(drop=True)
    )
    # Written before "Test ID" is added: the per-test file does not need it.
    cu_up_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "Separated", f"CU-UPData_Test{test_id}.csv"))
    cu_up_df["Test ID"] = test_id
    cu_up_frames.append(cu_up_df)

if not cu_up_frames:
    raise FileNotFoundError("OG_FILES is empty: there is no CU-UP data to merge")

# Concatenate once instead of re-copying the growing frames on every iteration.
merged_cu_up_df = pd.concat(cu_up_frames, ignore_index=True)
merged_raw_cu_up_df = pd.concat(raw_cu_up_frames, ignore_index=True)
merged_cu_up_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "CU-UPData_Merged.csv"))
merged_raw_cu_up_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "CU-UPData_Merged.csv"))

## Process CU-CP data only

In [None]:
# Extract the 'oCU-CP' rows of every source file and export them in two flavours:
#   * "Split":     original column names, "Original index" kept.
#   * "Processed": friendly column names (RENAMING_MAP), rows sorted by timestamp.
# Each flavour is written once per test ("Separated", without a "Test ID"
# column) and once as a merged CSV that carries a "Test ID" column.
# All outputs live under OUTPUT_DIR, in the folders the preparation cell creates.
cu_cp_frames = []
raw_cu_cp_frames = []
for filename in OG_FILES:
    id_match = re.search(TEST_ID_REGEX, filename)
    if id_match is None:
        # SOURCE_GLOB may select files that TEST_ID_REGEX does not recognise.
        raise ValueError(f"Cannot extract a test ID from {filename!r} using TEST_ID_REGEX")
    test_id = int(id_match.group(1))
    raw_df = pd.read_csv(filename).rename(columns={"Unnamed: 0": "Original index"})
    cu_cp_df = raw_df.query("`pm-Containers.type` == 'oCU-CP'").copy().drop(columns=EXTRA_DROP)
    # Drop columns that hold no value at all for this node type.
    cu_cp_df = cu_cp_df.drop(columns=cu_cp_df.columns[cu_cp_df.isna().all()].to_list()).reset_index(drop=True)
    cu_cp_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "Separated", f"CU-CPData_Test{test_id}.csv"))

    cu_cp_raw_df = cu_cp_df.copy()
    cu_cp_raw_df["Test ID"] = test_id
    raw_cu_cp_frames.append(cu_cp_raw_df)

    cu_cp_df = (
        cu_cp_df.drop(columns="Original index")
        .rename(columns=RENAMING_MAP)
        .sort_values("Timestamp")
        .reset_index(drop=True)
    )
    # Written before "Test ID" is added: the per-test file does not need it.
    cu_cp_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "Separated", f"CU-CPData_Test{test_id}.csv"))
    cu_cp_df["Test ID"] = test_id
    cu_cp_frames.append(cu_cp_df)

if not cu_cp_frames:
    raise FileNotFoundError("OG_FILES is empty: there is no CU-CP data to merge")

# Concatenate once instead of re-copying the growing frames on every iteration.
merged_cu_cp_df = pd.concat(cu_cp_frames, ignore_index=True)
merged_raw_cu_cp_df = pd.concat(raw_cu_cp_frames, ignore_index=True)
merged_cu_cp_df.to_csv(os.path.join(OUTPUT_DIR, "Processed", "CU-CPData_Merged.csv"))
merged_raw_cu_cp_df.to_csv(os.path.join(OUTPUT_DIR, "Split", "CU-CPData_Merged.csv"))

## Time-process all data

In [None]:
# Per-test CSVs written by the "Processed" stage. glob order depends on the
# filesystem, so sort the paths to keep the processing order reproducible.
PROCESSED_SEPARATED_CSVs = sorted(glob.glob(os.path.join(OUTPUT_DIR, "Processed", "Separated", "*.csv")))
# Captures the test number from names such as "DUData_Test3.csv".
# Raw string with an escaped dot so that only a literal ".csv" suffix matches.
REPROCESS_REGEX = r"Test([0-9]+)\.csv"

In [None]:
# Rebase the timestamps of every processed per-test CSV so that each test
# starts at 0, write the result to Time-Processed/Separated, and build one
# merged CSV per node type (sorted by test, then by rebased timestamp).
time_frames = {"CU-CPData": [], "CU-UPData": [], "DUData": []}
for filename in PROCESSED_SEPARATED_CSVs:
    bname = os.path.basename(filename)
    # File names look like "<type>_Test<N>.csv"; the prefix selects the node type.
    type_id = bname.split('_')[0]
    test_id = int(re.findall(REPROCESS_REGEX, filename)[0])
    # "Unnamed: 0" is the index column written by the previous stage.
    data_df = pd.read_csv(filename).drop(columns=["Unnamed: 0"])
    first_ts = data_df["Timestamp"].min()
    data_df["Timestamp"] = data_df["Timestamp"] - first_ts
    # Written before "Test ID" is added: the per-test file does not need it.
    data_df.to_csv(os.path.join(OUTPUT_DIR, "Time-Processed", "Separated", bname))
    data_df["Test ID"] = test_id
    # Files whose prefix is not a known node type are left out of the merged outputs.
    if type_id in time_frames:
        time_frames[type_id].append(data_df)

merged_time_frames = {}
for node_type, frames in time_frames.items():
    if not frames:
        # Nothing to merge for this node type: report it instead of failing on None.
        print(f"No processed {node_type} files found, skipping {node_type}_Merged.csv")
        merged_time_frames[node_type] = None
        continue
    merged_time_frames[node_type] = pd.concat(frames)
    # Merged files go under OUTPUT_DIR, next to the per-test "Separated" folder.
    merged_time_frames[node_type].sort_values(["Test ID", "Timestamp"]).reset_index(drop=True).to_csv(
        os.path.join(OUTPUT_DIR, "Time-Processed", f"{node_type}_Merged.csv")
    )

# Unsorted per-type concatenations (None when a type had no files).
all_time_cu_cp_processed_df = merged_time_frames["CU-CPData"]
all_time_cu_up_processed_df = merged_time_frames["CU-UPData"]
all_time_du_processed_df = merged_time_frames["DUData"]

## Dataset Markdown file generation

In [None]:
# Write a short README.md describing the exported dataset.
now = datetime.datetime.now(datetime.timezone.utc)
# The day is interpolated directly because the "%-d" (no zero padding)
# directive is a platform-specific strftime extension.
pretty_time = now.strftime(f"%A, {now.day} %B %Y, at %H:%M (%Z)")
iso_time = now.isoformat()
template_lines = [
    f"# xInfoDump dataset {OUTPUT_DIR}\n",
    f"Exported on {pretty_time}\n\n",
    "## Technical data\n",
    f"Test scenario: `{TEST_SCENARIO}`\n",
    f"Exportation time: `{iso_time}`\n",
    f"Number of tests: {len(OG_FILES)}\n\n",
    "> Automatically generated by the data processing notebook"
]
# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(OUTPUT_DIR, "README.md"), 'w', encoding="utf-8") as out_md:
    out_md.writelines(template_lines)