In [None]:
#| default_exp pipal

# Loading PIPAL

> Loading PIPAL from our private server. The data is used exactly as downloaded from the official webpage.

In [None]:
#| export
import re

import pandas as pd
from pathlib import Path
from natsort import natsorted
from fastcore.foundation import L

In [None]:
#| hide
# Root directory of the PIPAL download; the reference images sit in `Train_Ref` beneath it.
path_root = Path("/media/disk/databases/BBDD_video_image/Image_Quality/PIPAL/")
path_ref = path_root.joinpath("Train_Ref")

In the root folder we can find 6 different folders:

- `Distortion_1`, `Distortion_2`, `Distortion_3`, `Distortion_4`: Contain the distorted images. By the looks of it, the folder number doesn't have any special meaning; it seems to be just a way of splitting the data for easier upload to *Google Drive*.
- `Train_Label`: Contains `.txt` files relating every reference image to its distorted versions, as well as their MOS.
- `Train_Ref`: Contains the reference images.

Following the usual approach, we are going to build a `.csv` file relating each reference image, its distortions and the corresponding MOS. Doing so greatly reduces the possible complications when loading the data, because everything will already be paired. The idea is to fetch all the image paths and then pair them using the `.txt` files:

## Pairing the images

> Let's pair each image with its distortions and their corresponding MOS.

By using `.glob()` we can get all the distortions and order them (this isn't really necessary, but doesn't do any harm).

In [None]:
# Collect every distorted image, reference image and label file under the dataset
# root, in natural sort order. `natsorted` consumes the glob generators directly.
paths_dist = L(natsorted(path_root.glob("Distortion_*/*.bmp")))
paths_ref = L(natsorted(path_root.glob("Train_Ref/*.bmp")))
paths_label = L(natsorted(path_root.glob("Train_Label/*.txt")))

# Guard against a wrong `path_root`: with empty globs the length comparison below
# would pass vacuously (0 == 0) and the failure would only surface later.
assert len(paths_dist) > 0, f"No distorted images found under {path_root}"
assert len(paths_ref) > 0, f"No reference images found under {path_root}"
# There should be one label file per reference image.
assert len(paths_ref) == len(paths_label)

We can load a `.txt` file to see it:

In [None]:
# Preview the first label file. It is read without a header row, so the two
# columns are named explicitly.
df = pd.read_csv(
    paths_label[0],
    names=["Distorted", "MOS"],
    header=None,
)
df.head()

Unnamed: 0,Distorted,MOS
0,A0001_00_00.bmp,1520.0648
1,A0001_00_01.bmp,1437.0798
2,A0001_00_02.bmp,1546.0616
3,A0001_00_03.bmp,1539.5688
4,A0001_00_04.bmp,1411.7958


Actually, as the images are properly named, we can extract the corresponding reference image from the distorted image's file name itself. This suggests that we can just load all the `.txt` files, put them together and then add the `Reference` column by extracting it from the distorted image name:

In [None]:
# Read every label file with the same column layout used for the single-file preview.
dfs_label = paths_label.map(pd.read_csv, header=None, names=["Distorted", "MOS"])

# `all(frame_a == frame_b)` iterates over the column labels of the comparison
# frame (non-empty strings, hence always truthy) and never looks at the values.
# `DataFrame.equals` compares the actual contents.
assert dfs_label[0].equals(df)
assert len(dfs_label) == len(paths_label)

Now that we have all the dataframes in an iterable, we can concatenate them together with `pd.concat`:

In [None]:
# Stack the per-file tables into one long table and check that no row was lost.
df_label = pd.concat(dfs_label)
assert sum(len(frame) for frame in dfs_label) == len(df_label)
print(df_label.shape)
df_label.head()

(23200, 2)


Unnamed: 0,Distorted,MOS
0,A0001_00_00.bmp,1520.0648
1,A0001_00_01.bmp,1437.0798
2,A0001_00_02.bmp,1546.0616
3,A0001_00_03.bmp,1539.5688
4,A0001_00_04.bmp,1411.7958


In [None]:
# The reference file name is the part of the distorted file name before its
# first underscore, plus the `.bmp` extension.
df_label["Reference"] = df_label["Distorted"].map(
    lambda name: name.partition("_")[0] + ".bmp"
)
df_label.head()

Unnamed: 0,Distorted,MOS,Reference
0,A0001_00_00.bmp,1520.0648,A0001.bmp
1,A0001_00_01.bmp,1437.0798,A0001.bmp
2,A0001_00_02.bmp,1546.0616,A0001.bmp
3,A0001_00_03.bmp,1539.5688,A0001.bmp
4,A0001_00_04.bmp,1411.7958,A0001.bmp


As a fancy ending, we can reorder the columns so that we have them like `Reference, Distorted, MOS`:

In [None]:
# Put the columns in reading order: reference image, distorted image, score.
df_label = df_label.reindex(["Reference", "Distorted", "MOS"], axis="columns")
df_label.head()

Unnamed: 0,Reference,Distorted,MOS
0,A0001.bmp,A0001_00_00.bmp,1520.0648
1,A0001.bmp,A0001_00_01.bmp,1437.0798
2,A0001.bmp,A0001_00_02.bmp,1546.0616
3,A0001.bmp,A0001_00_03.bmp,1539.5688
4,A0001.bmp,A0001_00_04.bmp,1411.7958


Keep in mind that the distorted images are distributed across different folders so we have two options:

- Put all of them in the same folder.
- Add the corresponding folder to the `Distorted` column.

The first one would be the easier solution but, in order to make it work with the dataset as it comes when you download it, we can try to prepend the folder to each path. To do so, we can build another dataframe relating the distorted images and their corresponding directory:

In [None]:
def extract_folder_ref(path):
    """Split a distorted-image path into its `Distortion_N` folder and file name.

    `path` may be a string or a path object; it is converted with `str()`.
    Returns a `(folder, filename)` tuple such as
    `("Distortion_1", "A0001_00_00.bmp")`.
    Raises `ValueError` if the path does not end in
    `Distortion_<digits>/A<digits>_<suffix>.bmp`.
    """
    path = str(path)
    # The dot is escaped so that only a literal ".bmp" extension matches, the
    # pattern is anchored at the end of the path, and both "/" and "\" are
    # accepted as the separator between folder and file name.
    match = re.search(r"(Distortion_\d+)[\\/](A\d+_\w+\.bmp)$", path)
    if match is None:
        raise ValueError(f"Not a PIPAL distorted image path: {path!r}")
    return match.groups()

In [None]:
# Map every distorted file name to the `Distortion_N` folder that holds it.
# Top-level loop variables stay in the notebook namespace after the cell runs,
# so they are named `folder`/`filename`: calling one of them `dir` would shadow
# the builtin `dir()` for every later cell.
dist2dir = {"Distorted":[], "Directory":[]}
for folder, filename in paths_dist.map(extract_folder_ref).unique():
    dist2dir["Distorted"].append(filename)
    dist2dir["Directory"].append(folder)

In [None]:
# Turn the two parallel lists into a two-column lookup table.
dist2dir = pd.DataFrame(dist2dir)
dist2dir.head()

Unnamed: 0,Distorted,Directory
0,A0001_00_00.bmp,Distortion_1
1,A0001_00_01.bmp,Distortion_1
2,A0001_00_02.bmp,Distortion_1
3,A0001_00_03.bmp,Distortion_1
4,A0001_00_04.bmp,Distortion_1


In [None]:
# Reminder of the label table before the folder lookup is merged into it.
df_label.head()

Unnamed: 0,Reference,Distorted,MOS
0,A0001.bmp,A0001_00_00.bmp,1520.0648
1,A0001.bmp,A0001_00_01.bmp,1437.0798
2,A0001.bmp,A0001_00_02.bmp,1546.0616
3,A0001.bmp,A0001_00_03.bmp,1539.5688
4,A0001.bmp,A0001_00_04.bmp,1411.7958


In [None]:
# Attach the folder of each distorted image to the label table.
# `validate="many_to_one"` makes pandas raise if a file name appears more than
# once in `dist2dir`, which would otherwise silently duplicate label rows.
df_label_dir = df_label.merge(right=dist2dir, how="left", on="Distorted", validate="many_to_one")

# A left merge keeps unmatched labels with a missing `Directory`; fail loudly
# instead of writing incomplete paths to the CSV.
assert len(df_label_dir) == len(df_label)
assert df_label_dir["Directory"].notna().all(), "Some labelled images were not found on disk"
df_label_dir.head()

Unnamed: 0,Reference,Distorted,MOS,Directory
0,A0001.bmp,A0001_00_00.bmp,1520.0648,Distortion_1
1,A0001.bmp,A0001_00_01.bmp,1437.0798,Distortion_1
2,A0001.bmp,A0001_00_02.bmp,1546.0616,Distortion_1
3,A0001.bmp,A0001_00_03.bmp,1539.5688,Distortion_1
4,A0001.bmp,A0001_00_04.bmp,1411.7958,Distortion_1


Now we are done! Let's save the `.csv` file:

In [None]:
# Show the directory the CSV is about to be written to.
path_root

PosixPath('/media/disk/databases/BBDD_video_image/Image_Quality/PIPAL')

In [None]:
#| notest
# Persist the paired table inside the dataset root. The DataFrame index is
# written as well (pandas default), so the file has a leading unnamed column.
df_label_dir.to_csv(path_root/"image_pairs_mos.csv")