In [8]:
import pandas as pd

pd.set_option("display.max_rows", 300)
import rapidfuzz
from joblib import Parallel, delayed
from tqdm import tqdm
import re
import awswrangler as wr
import boto3
import copy
import itertools
from pprint import pprint


In [9]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
srcBucketName = "multilabel_df"
labelDf = wr.s3.read_parquet(
    path=f"s3://{srcBucketName}/",
    dataset=True
)
labelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear,CaseID,Circumstances_of_Accident
0,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,12657878,Collision- Head to Rear (Insured Hit TP)
1,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,12657888,Lost control- Overturned
2,1,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657890,
3,1,1,1,0,0,0,1,1,0,1,...,1,0,0,0,0,0,0,0,12657900,Collided into animal
4,1,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657903,Collision- Head to Rear (Insured Hit TP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598121,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10147236,
598122,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10936139,Collision- Head to Rear (TP Hit Insured)
598123,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174259,
598124,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174498,


In [10]:
mappingDf = pd.read_csv("/home/alextay96/Desktop/all_workspace/new_workspace/DLDataPipeline/data/tmp/complete_view_mapping.csv")

In [11]:
mappingDf["lvl_3_desc"] = mappingDf["lvl_3_desc"].str.strip()

In [12]:
view = ["front_view", "rear_view", "front_view_left", "front_view_right", "rear_view_left", "rear_view_right"]
viewToPart = dict()
for v in view:
    partsInView = mappingDf[mappingDf[v] == 1]["lvl_3_desc"].unique().tolist()
    viewToPart[v] = ["vision_" + x.replace(" ", "_") for x in partsInView]
pprint(viewToPart)
    

{'front_view': ['vision_bonnet',
                'vision_bumper_front',
                'vision_engine',
                'vision_front_panel',
                'vision_grille',
                'vision_headlamp_lh',
                'vision_headlamp_rh',
                'vision_misc',
                'vision_non_external',
                'vision_windscreen_front'],
 'front_view_left': ['vision_bonnet',
                     'vision_bumper_front',
                     'vision_door_front_lh',
                     'vision_door_rear_lh',
                     'vision_engine',
                     'vision_fender_front_lh',
                     'vision_front_panel',
                     'vision_headlamp_lh',
                     'vision_non_external',
                     'vision_misc',
                     'vision_wheel',
                     'vision_windscreen_front'],
 'front_view_right': ['vision_bonnet',
                      'vision_bumper_front',
                      'vision_door_front_r

In [13]:
vehicleType= "Saloon - 4 Dr"
localFileMetadata = f"/home/alextay96/Desktop/all_workspace/new_workspace/DLDataPipeline/data/imgs_metadata/{vehicleType}.parquet"

In [14]:
localFilesDf = pd.read_parquet(localFileMetadata)
localFilesDf["StdDocDesc"] = localFilesDf["StdDocDesc"].str.replace(" ", "_").str.lower()
localFilesDf.sort_values(by="CaseID", inplace=True)
localFilesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
1581,10000038,438742010.0,front_view,10000038_438742010.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
2285288,10000038,438742000.0,rear_view_right,10000038_438742000.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
280588,10000038,438742006.0,front_view_left,10000038_438742006.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
1474120,10000038,438742004.0,rear_view,10000038_438742004.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
878715,10000038,438742078.0,front_view_right,10000038_438742078.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
...,...,...,...,...,...
789736,13076951,595070513.0,front_view_left,13076951_595070513.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
1387864,13076951,595070498.0,front_view_right,13076951_595070498.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
2196298,13076951,595070634.0,rear_view_left,13076951_595070634.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
2794423,13076951,595070636.0,rear_view_right,13076951_595070636.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...


In [15]:
viewDfMap = dict()
for v, parts in viewToPart.items():
    tempDf = labelDf[parts + ["CaseID"]]
    tempDf["view"] = v
    viewFilesDf = localFilesDf[localFilesDf["StdDocDesc"] == v]
    viewFilesDf.drop_duplicates(subset=["CaseID"], inplace=True)
    tempDf = tempDf.merge(viewFilesDf[["filename", "CaseID"]], on="CaseID")
    viewDfMap[v] = tempDf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDf["view"] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  viewFilesDf.drop_duplicates(subset=["CaseID"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDf["view"] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [16]:
for view, df in viewDfMap.items():
    imgLabelFilename = f"{vehicleType}_{view}_img_labels.csv"
    wr.s3.to_csv(
        df=df,
    path=f"s3://imgs_labels/{imgLabelFilename}",

    )

In [17]:
for view, df in viewDfMap.items():
    imgLabelFilename = f"{vehicleType}_{view}_img_labels.csv"
    df = wr.s3.read_csv(
    path=f"s3://imgs_labels/{imgLabelFilename}",

    )
    print(df)

      Unnamed: 0  vision_bonnet  vision_bumper_front  vision_engine  \
0              0              1                    1              0   
1              1              1                    1              1   
2              2              1                    1              1   
3              3              1                    1              1   
4              4              1                    1              1   
...          ...            ...                  ...            ...   
5000        5000              0                    0              0   
5001        5001              0                    0              0   
5002        5002              0                    0              0   
5003        5003              0                    0              0   
5004        5004              0                    0              0   

      vision_front_panel  vision_grille  vision_headlamp_lh  \
0                      0              1                   0   
1                    