In [5]:
import pandas as pd

pd.set_option("display.max_rows", 300)
import rapidfuzz
from joblib import Parallel, delayed
from tqdm import tqdm
import re
import awswrangler as wr
import boto3
import copy
import itertools
from pprint import pprint


In [3]:
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"
srcBucketName = "multilabel_df"
labelDf = wr.s3.read_parquet(
    path=f"s3://{srcBucketName}/",
    dataset=True
)
labelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,rear_panel,undercarriage_rear,rear_quarter_rh,roof,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear,CaseID,Circumstances_of_Accident
0,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,12657878,Collision- Head to Rear (Insured Hit TP)
1,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,12657888,Lost control- Overturned
2,1,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657890,
3,1,1,1,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,12657900,Collided into animal
4,1,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657903,Collision- Head to Rear (Insured Hit TP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621787,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,10147236,
621788,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,10936139,Collision- Head to Rear (TP Hit Insured)
621789,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,10174259,
621790,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,10174498,


In [9]:
mappingDf = pd.read_csv("/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/tmp/complete_view_mapping.csv")

In [10]:
mappingDf["lvl_3_desc"] = mappingDf["lvl_3_desc"].str.strip()

In [11]:
view = ["front_view", "rear_view", "front_view_left", "front_view_right", "rear_view_left", "rear_view_right"]
viewToPart = dict()
for v in view:
    partsInView = mappingDf[mappingDf[v] == 1]["lvl_3_desc"].unique().tolist()
    viewToPart[v] = ["vision_" + x.replace(" ", "_") for x in partsInView]
pprint(viewToPart)
    

{'front_view': ['vision_bonnet',
                'vision_bumper_front',
                'vision_engine',
                'vision_front_panel',
                'vision_grille',
                'vision_headlamp_lh',
                'vision_headlamp_rh',
                'vision_misc',
                'vision_roof',
                'vision_undercarriage_front',
                'vision_windscreen_front'],
 'front_view_left': ['vision_bonnet',
                     'vision_bumper_front',
                     'vision_door_front_lh',
                     'vision_engine',
                     'vision_fender_front_lh',
                     'vision_front_panel',
                     'vision_headlamp_lh',
                     'vision_interior',
                     'vision_misc',
                     'vision_roof',
                     'vision_undercarriage_front',
                     'vision_wheel',
                     'vision_windscreen_front'],
 'front_view_right': ['vision_bonnet',
          

In [30]:
localFilesDf = pd.read_parquet("/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs_metadata/Saloon - 4 Dr.parquet")
localFilesDf["StdDocDesc"] = localFilesDf["StdDocDesc"].str.replace(" ", "_").str.lower()
localFilesDf.sort_values(by="CaseID", inplace=True)
localFilesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
911122,10000016,437457777.0,front_view_right,10000016_437457777.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10000...
1754160,10000016,437457779.0,rear_view_left,10000016_437457779.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10000...
289330,10000016,437457776.0,front_view_left,10000016_437457776.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10000...
2375954,10000016,437457778.0,rear_view_right,10000016_437457778.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10000...
1651,10000038,438742010.0,front_view,10000038_438742010.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10000...
...,...,...,...,...,...
1445560,13081272,595315625.0,front_view_right,13081272_595315625.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13081...
823767,13081272,595315619.0,front_view_left,13081272_595315619.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13081...
2910374,13081272,595315672.0,rear_view_right,13081272_595315672.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13081...
1722827,13081272,595315659.0,rear_view,13081272_595315659.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13081...


In [31]:
viewDfMap = dict()
for v, parts in viewToPart.items():
    tempDf = labelDf[parts + ["CaseID"]]
    tempDf["view"] = v
    viewFilesDf = localFilesDf[localFilesDf["StdDocDesc"] == v]
    viewFilesDf.drop_duplicates(subset=["CaseID"], inplace=True)
    tempDf = tempDf.merge(viewFilesDf[["filename", "CaseID"]], on="CaseID")
    viewDfMap[v] = tempDf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDf["view"] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  viewFilesDf.drop_duplicates(subset=["CaseID"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDf["view"] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [32]:
for view, df in viewDfMap.items():
    imgLabelFilename = f"{view}_img_labels.csv"
    wr.s3.to_csv(
        df=df,
    path=f"s3://imgs_labels/{imgLabelFilename}",

    )

In [None]:
for view, df in viewDfMap.items():
    imgLabelFilename = f"{view}_img_labels.csv"
    df = wr.s3.read_csv(
    path=f"s3://imgs_labels/{imgLabelFilename}",

    )
    print(df)