In [65]:
import polars as pl
import shutil
from tqdm import tqdm
from pathlib import Path

In [66]:
# Read the three dataset splits (train / test / valid) from their CSV manifests.
df_train, df_test, df_valid = (
    pl.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/valid.csv")
)

In [67]:
# Show the column names of the training split (metadata columns first, then the label columns).
df_train.columns

['Path',
 'Sex',
 'Age',
 'Frontal/Lateral',
 'AP/PA',
 'No Finding',
 'Enlarged Cardiomediastinum',
 'Cardiomegaly',
 'Lung Opacity',
 'Lung Lesion',
 'Edema',
 'Consolidation',
 'Pneumonia',
 'Atelectasis',
 'Pneumothorax',
 'Pleural Effusion',
 'Pleural Other',
 'Fracture',
 'Support Devices']

In [68]:
# Label columns: everything from column index 5 onward. In the train schema shown
# above that is 'No Finding' through 'Support Devices'.
# NOTE(review): taken from df_test — assumes all three splits share the train column
# order; confirm. The hard-coded 5 breaks silently if metadata columns are added.
# NOTE(review): "pathlogies" is a misspelling of "pathologies"; kept because later cells use it.
pathlogies = df_test.columns[5:]

In [69]:
def reduce_df(df, pathlogies, reduce, target="Lung Lesion", seed=42):
    """Shrink ``df`` to a fraction of its rows while keeping every positive ``target`` row.

    The label columns are first cleaned (nulls become 0, -1 becomes 1), then all rows
    whose ``target`` equals 1 are kept and the remainder of the budget is filled with
    a seeded random sample of rows whose ``target`` equals 0.

    Args:
        df: polars DataFrame holding the label columns.
        pathlogies: names of the label columns to clean. ``target`` should be one of
            them; otherwise its nulls are not filled and those rows are dropped by
            the filters below.
        reduce: fraction of the original row count to keep (e.g. ``0.5``).
        target: label column whose positive rows are always retained.
        seed: random seed used when sampling the non-positive rows.

    Returns:
        A polars DataFrame with the cleaned labels: all positive rows followed by the
        sampled non-positive rows. Its height equals ``int(df.height * reduce)``
        unless the sample size had to be clamped (see below).
    """
    target_height = int(df.height * reduce)
    print(f"Reduce the dataset from {df.height} to {target_height}")

    print(df[target].value_counts())
    # Missing labels are treated as negative (0); -1 labels are folded into positive (1).
    # NOTE(review): -1 is presumably the "uncertain" label — confirm this mapping is intended.
    df_clean = df.with_columns(
        pl.col(pathlogies).fill_null(0).replace(-1, 1)
    )
    print(df_clean[target].value_counts())

    df_positive = df_clean.filter(pl.col(target).eq(1))
    df_other = df_clean.filter(pl.col(target).eq(0))

    # Clamp the sample size: it would be negative when the positives alone exceed the
    # budget, and larger than the pool when there are too few non-positive rows.
    # Either case makes ``sample`` raise.
    wanted = target_height - df_positive.height
    n_other = max(0, min(wanted, df_other.height))
    if n_other != wanted:
        print(f"Requested {wanted} non-positive rows, sampling {n_other} instead")
    df_other = df_other.sample(n_other, seed=seed)

    df_res = pl.concat([df_positive, df_other])
    print(f"df reduce actual height: {df_res.height}")
    print(df_res[target].value_counts())

    return df_res

In [70]:
# Halve the training split. reduce_df keeps every positive "Lung Lesion" row, so the
# positive rate rises (6527/152821 ≈ 4.3% before, 6527/76410 ≈ 8.5% after, per the output below).
df_train_reduced = reduce_df(df_train, pathlogies, reduce=0.5)


Reduce the dataset from 152821 to 76410
shape: (4, 2)
┌─────────────┬────────┐
│ Lung Lesion ┆ count  │
│ ---         ┆ ---    │
│ f64         ┆ u32    │
╞═════════════╪════════╡
│ null        ┆ 145673 │
│ 1.0         ┆ 5653   │
│ -1.0        ┆ 874    │
│ 0.0         ┆ 621    │
└─────────────┴────────┘
shape: (2, 2)
┌─────────────┬────────┐
│ Lung Lesion ┆ count  │
│ ---         ┆ ---    │
│ f64         ┆ u32    │
╞═════════════╪════════╡
│ 1.0         ┆ 6527   │
│ 0.0         ┆ 146294 │
└─────────────┴────────┘
df reduce actual height: 76410
shape: (2, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 1.0         ┆ 6527  │
│ 0.0         ┆ 69883 │
└─────────────┴───────┘


In [71]:
# Halve the test split the same way.
# NOTE(review): keeping all positives shifts the test positive rate from
# 805/19103 ≈ 4.2% to 805/9551 ≈ 8.4% — confirm this is acceptable for evaluation.
df_test_reduced = reduce_df(df_test, pathlogies, reduce=0.5)

Reduce the dataset from 19103 to 9551
shape: (4, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 0.0         ┆ 71    │
│ -1.0        ┆ 109   │
│ 1.0         ┆ 696   │
│ null        ┆ 18227 │
└─────────────┴───────┘
shape: (2, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 1.0         ┆ 805   │
│ 0.0         ┆ 18298 │
└─────────────┴───────┘


df reduce actual height: 9551
shape: (2, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 0.0         ┆ 8746  │
│ 1.0         ┆ 805   │
└─────────────┴───────┘


In [72]:
# Halve the validation split the same way.
# NOTE(review): keeping all positives shifts the validation positive rate from
# 816/19103 ≈ 4.3% to 816/9551 ≈ 8.5% — confirm this is acceptable for model selection.
df_valid_reduced = reduce_df(df_valid, pathlogies, reduce=0.5)

Reduce the dataset from 19103 to 9551
shape: (4, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ -1.0        ┆ 125   │
│ 1.0         ┆ 691   │
│ 0.0         ┆ 84    │
│ null        ┆ 18203 │
└─────────────┴───────┘
shape: (2, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 0.0         ┆ 18287 │
│ 1.0         ┆ 816   │
└─────────────┴───────┘
df reduce actual height: 9551
shape: (2, 2)
┌─────────────┬───────┐
│ Lung Lesion ┆ count │
│ ---         ┆ ---   │
│ f64         ┆ u32   │
╞═════════════╪═══════╡
│ 1.0         ┆ 816   │
│ 0.0         ┆ 8735  │
└─────────────┴───────┘


In [73]:
# Destination of each image: the same relative path under "data_reduced/".
# Only the leading "data/" directory is rewritten. The pattern is a regex anchored at
# the start of the string, so a later "data" substring in a path is left alone.
dest_expr = pl.col("Path").str.replace(r"^data/", "data_reduced/").alias("dest")

df_train_reduced = df_train_reduced.with_columns(dest_expr)
df_valid_reduced = df_valid_reduced.with_columns(dest_expr)
df_test_reduced = df_test_reduced.with_columns(dest_expr)

In [74]:
# Spot-check the source -> destination mapping on the first few training rows.
df_train_reduced.select("Path", "dest").head(3)

Path,dest
str,str
"""data/train/patient17164/study1…","""data_reduced/train/patient1716…"
"""data/train/patient36359/study7…","""data_reduced/train/patient3635…"
"""data/train/patient07602/study1…","""data_reduced/train/patient0760…"


In [75]:
def move_files(df):
    """Copy every file listed in ``df`` from its ``Path`` to its ``dest`` location.

    Despite the name, files are copied with ``shutil.copy2`` and the sources are left
    in place. Missing parent directories of each destination are created first.

    Args:
        df: polars DataFrame with string columns ``Path`` (source file) and
            ``dest`` (target file).
    """
    # Iterate over the two path columns only, so each row is a small tuple instead of
    # a dict of every column.
    paths = df.select("Path", "dest")
    # ``iter_rows`` is a generator with no length, so pass the total explicitly to
    # get a percentage and ETA in the progress bar.
    for src, dest in tqdm(paths.iter_rows(), total=paths.height):
        Path(dest).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)

In [76]:
# Copy the images of each reduced split (train, then valid, then test) to their new location.
for split_df in (df_train_reduced, df_valid_reduced, df_test_reduced):
    move_files(split_df)

0it [00:00, ?it/s]

76410it [01:48, 706.01it/s] 
9551it [00:16, 579.43it/s]
9551it [00:14, 672.93it/s]


In [77]:
# Point "Path" at the copied files, restore the original column layout (which drops
# the helper "dest" column) and write one manifest per split.
# NOTE: this cell rebinds the *_reduced frames without "dest", so running it a second
# time fails; re-run from the cell that creates "dest" instead.
original_columns = df_train.columns
path_from_dest = pl.col("dest").alias("Path")

df_train_reduced = df_train_reduced.with_columns(path_from_dest).select(original_columns)
df_valid_reduced = df_valid_reduced.with_columns(path_from_dest).select(original_columns)
df_test_reduced = df_test_reduced.with_columns(path_from_dest).select(original_columns)

df_train_reduced.write_csv("data_reduced/train.csv")
df_valid_reduced.write_csv("data_reduced/valid.csv")
df_test_reduced.write_csv("data_reduced/test.csv")