# Data processing

We imagine we have the following folders and files, with the same naming conventions:

```bash
.
├── MATLAB
│   ├── README.md
│   ├── grain.m
│   ├── import_stl.mlx
│   ├── rev.m
├── REV1_600
│   ├── REV1_6003D_model
│   │   ├── Spec-1.STL
│   │   ├── Spec-2.STL
│   │   └── ...
│   ├── REV1_600Slices
│   │   ├── 1pics
│   │   │   ├── Spec-1_Imgs
│   │   │   │   ├── *.png
│   │   │   │   ├── *.png
│   │   │   │   └── ...
│   │   │   ├── Spec-2_Imgs
│   │   │   │   ├── *.png
│   │   │   │   ├── *.png
│   │   │   │   └── ...
│   │   │   ├── ...
│   │   ├── 3pics
│   │   └── ...
│   └── fabrics.txt
├── README.md
└── data_processing.ipynb <- YOU ARE CURRENTLY HERE
```

The idea is to construct a dataframe with every fabrics and only some sliced images (or all).

In [107]:
import numpy as numpy
import pandas as pd
from pathlib import Path
from pprint import pprint
from typing import List
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torchvision.io import read_image
import os
import io

seed = 42

## Construction of the dataframe

The dataframe is constructed as follow:

|  | id | orientation_0_std | ... | aspect_ratio_1_mean | aspect_ratio_1_std | size_mean | size_std | solidity_mean | solidity_std | roundness_mean | roundness_std | volume_fraction_mean | 6_images |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Spec-1 | 0.324161 | ... | 0.7864 | 0.138574 | 0.104945 | 13.688080 | 4.475980 | 0.851367 | 0.097545 | 0.210755 | 0.086931 | [/home/matias/microstructure-reconstruction/MA... |
| 1 | Spec-2 | 0.373141 | ... | 0.847 | 0.138043 | 0.091564 | 6.019655 | 2.113804 | 0.965272 | 0.030363 | 0.297964 | 0.101607 | [/home/matias/microstructure-reconstruction/MA... |



In [20]:
topLevelFolder = Path() / "REV1_600"
path_to_revs = topLevelFolder / "REV1_6003D_model"
path_to_fabrics =  topLevelFolder / "fabrics.txt"
path_to_slices = topLevelFolder / "REV1_600Slices"

In [113]:
nb_images = 3

fabrics_df = pd.read_csv(path_to_fabrics)
path_to_images = [x for x in path_to_slices.glob(f"{nb_images}p*/")]


def associate_rev_id_to_its_images(id: str, path_to_slices: Path, nb_images: int):
    """Associates a rev id, like `Spec-5` to its sliced images, given a number of slices per plane and

    Args:
        id (str): id of the rev id
        path_to_slices (Path): path to the slices images
        nb_images (int): number of slices per plane

    Returns:
        List[Path]: list of path where each path points an image
    """
    return [x for x in path_to_slices.glob(f"{nb_images}p*/{id}_Imgs/*")]


# We create a new column where each cell is a list of image paths
fabrics_df["photos"] = fabrics_df["id"].apply(
    func=associate_rev_id_to_its_images, args=(path_to_slices, nb_images)
)
fabrics_df = fabrics_df[fabrics_df.photos.str.len().gt(0)]
fabrics_df

Unnamed: 0,id,orientation-0_mean,orientation-0_std,orientation-1_mean,orientation-1_std,orientation-2_mean,orientation-2_std,orientation-3_mean,orientation-3_std,orientation-4_mean,...,aspectratio-1_mean,aspectratio-1_std,size_mean,size_std,solidity_mean,solidity_std,roundness_mean,roundness_std,volume_fraction,photos
1,Spec-1,0.324161,0.354717,0.321122,0.023721,0.022712,0.017081,0.307251,0.314178,0.307708,...,0.138574,0.104945,13.688080,4.475980,0.851367,0.097545,0.210755,0.086931,0.386223,[REV1_600/REV1_600Slices/3pics/Spec-1_Imgs/x-y...
2,Spec-10,0.342926,0.329461,0.327613,0.003943,-0.008105,-0.007804,0.309317,0.301727,0.307888,...,0.141167,0.097689,12.133247,3.346814,0.875065,0.051042,0.209093,0.082589,0.337122,[REV1_600/REV1_600Slices/3pics/Spec-10_Imgs/x-...
3,Spec-100,0.343094,0.346566,0.310340,-0.000564,0.023228,-0.008377,0.316436,0.317267,0.300596,...,0.140470,0.097848,13.673221,4.078330,0.865932,0.071402,0.208290,0.084531,0.339295,[REV1_600/REV1_600Slices/3pics/Spec-100_Imgs/x...
4,Spec-101,0.342479,0.330274,0.327246,-0.004556,-0.004594,0.001348,0.319543,0.311533,0.311565,...,0.141739,0.104199,9.809217,2.654313,0.874050,0.054507,0.212802,0.090636,0.160133,[REV1_600/REV1_600Slices/3pics/Spec-101_Imgs/y...
5,Spec-102,0.331517,0.347090,0.321393,-0.003134,0.006752,-0.003562,0.311683,0.312832,0.300369,...,0.137749,0.101959,11.310906,3.005233,0.875842,0.057680,0.215132,0.086989,0.278125,[REV1_600/REV1_600Slices/3pics/Spec-102_Imgs/y...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Spec-186,0.332370,0.320139,0.347491,-0.013794,-0.000316,-0.008002,0.308273,0.309078,0.313318,...,0.136852,0.103046,8.743409,2.387816,0.876205,0.058424,0.212429,0.086477,0.106842,[REV1_600/REV1_600Slices/3pics/Spec-186_Imgs/y...
98,Spec-187,0.347766,0.329514,0.322721,0.004598,-0.004517,0.026167,0.308953,0.303466,0.301224,...,0.142359,0.102347,8.887476,2.441451,0.874403,0.058699,0.209269,0.086716,0.103700,[REV1_600/REV1_600Slices/3pics/Spec-187_Imgs/x...
99,Spec-188,0.331884,0.318141,0.349975,-0.013346,-0.000843,0.007494,0.310789,0.308175,0.318548,...,0.136650,0.100741,10.618896,2.911079,0.877366,0.052128,0.214151,0.085695,0.220278,[REV1_600/REV1_600Slices/3pics/Spec-188_Imgs/x...
100,Spec-189,0.338384,0.325607,0.336010,-0.004439,-0.009759,-0.008986,0.310169,0.309082,0.312984,...,0.140263,0.104427,11.188183,3.004806,0.876578,0.052851,0.216790,0.088561,0.281035,[REV1_600/REV1_600Slices/3pics/Spec-189_Imgs/x...


## Simple CNN network

The original problem is:

    (P) Compute the fabrics from multiple sliced images per rev

We are firstly having an intermediate objective:

    (P') Compute the fabrics from one single sliced images per rev

Of course, the accuracy will be much worse because the fabrics are computed in every direction. However, it is a good try.

In [119]:
class SinglePhotoDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        assert df.photos.dtypes == list

        # df.reset_index(inplace=True)

        nb_images = df.photos.str.len().max()
        assert nb_images == df.photos.str.len().min()
        
        self.labels = pd.concat([df] * nb_images, ignore_index=True)

        self.images = self.labels.apply(
            func=lambda row: str(row["photos"][row.name % nb_images]),
            axis=1,
        )

        self.labels.drop(columns=["id", "photos"], inplace=True)

        self.images = self.images.to_numpy()
        self.labels = self.labels.to_numpy()
    
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        if isinstance(idx, list):
            image = [read_image(path) for path in img_path]
        else:
            image = read_image(img_path)
        label = self.labels[idx, :]
        return image, label

dataset = SinglePhotoDataset(fabrics_df)
print(dataset[[0, 1]])
# train_df, test_df = train_test_split(
#     fabrics_df_single_entries,
#     train_size=0.7,
#     random_state=seed,
#     shuffle=True,
# )

# print(train_df.dtypes)

# mean = train_df.mean()
# std = train_df.std()

# normalized_fabrics_df_single_entries = (fabrics_df_single_entries - mean) / std
# normalized_train_df = (train_df - mean) / std


([tensor([[[  0,   0,   0,  ...,   0,   0,   0],
         [141, 141, 141,  ..., 141, 141, 141],
         [141, 141, 141,  ..., 141, 141, 141],
         ...,
         [100, 100, 100,  ..., 100, 100, 100],
         [182, 182, 182,  ..., 182, 182, 182],
         [182, 182, 182,  ..., 182, 182, 182]]], dtype=torch.uint8), tensor([[[  0,   0,   0,  ...,   0,   0,   0],
         [126, 126, 126,  ..., 126, 126, 126],
         [126, 126, 126,  ..., 126, 126, 126],
         ...,
         [ 83,  83,  83,  ...,  83,  83,  83],
         [164, 164, 164,  ..., 164, 164, 164],
         [164, 164, 164,  ..., 164, 164, 164]]], dtype=torch.uint8)], array([[ 3.24160759e-01,  3.54717353e-01,  3.21121888e-01,
         2.37208704e-02,  2.27120659e-02,  1.70805992e-02,
         3.07251209e-01,  3.14178453e-01,  3.07708210e-01,
         2.52943747e-01,  2.41885428e-01,  2.56148394e-01,
         6.93094089e-01,  4.14168592e-01,  1.38573650e-01,
         1.04945277e-01,  1.36880801e+01,  4.47597959e+00,
       