In [6]:

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import os
import random
from joblib import Parallel , delayed


In [7]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"


In [8]:
vehicleType = "Saloon - 4 Dr"


In [9]:
bucket3 = "scope_case"
myFilter =  lambda x : x["Vehicle_Type"] == vehicleType
targetCols = ["CaseID", "Vehicle_Type"]
caseDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,partition_filter=myFilter,dataset=True, columns=targetCols)
caseDf

Unnamed: 0,CaseID,Vehicle_Type
0,10000016,Saloon - 4 Dr
1,10000030,Saloon - 4 Dr
2,10000038,Saloon - 4 Dr
3,10000063,Saloon - 4 Dr
4,10000072,Saloon - 4 Dr
...,...,...
255152,13562973,Saloon - 4 Dr
255153,13563018,Saloon - 4 Dr
255154,13563063,Saloon - 4 Dr
255155,13563071,Saloon - 4 Dr


In [10]:
bucketName = "raw_imgs"
cli = boto3.client(
    "s3",
    **{
        "endpoint_url": "http://192.168.1.4:8333",
    }
)
paginator = cli.get_paginator("list_objects_v2")
operation_parameters = {"Bucket": bucketName}
page_iterator = paginator.paginate(**operation_parameters)
downloadedImgs = []
for page in tqdm(page_iterator):
    pageContent = page["Contents"]
    downloadedCaseId = set([int(x["Key"].split("_")[0]) for x in pageContent])
    downloadedImgs.extend(downloadedCaseId)
print(len(downloadedImgs))

52it [00:50,  1.03it/s]

103778





In [11]:
availableImgsCase = set(downloadedImgs).intersection(caseDf["CaseID"].unique().tolist())
downloadedCaseLength = len(availableImgsCase)

In [12]:
notDownloadedCaseLength = len(caseDf) - downloadedCaseLength

In [13]:
availableRatio = len(availableImgsCase) / len(caseDf)
availableRatio

0.15936854564052721

In [14]:
trackDf = pd.json_normalize([{
    "label" : "available",
    "cases" : downloadedCaseLength
},
                             {
    "label" : "not_available",
    "cases" : notDownloadedCaseLength
}
                             
                             
                             ])

In [15]:
import plotly.express as px

fig = px.pie(data_frame=trackDf, values="cases", labels="label")
fig.show()

In [16]:
bucket3 = "scope_file"
targetCols = ["CaseID", "iDOCID"]
filesDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,dataset=True, columns=targetCols)
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
0,10375210,456204471.0,Front View
1,10839776,479277541.0,Front View
2,10595863,518945875.0,Front View
3,10324518,453575575.0,Front View
4,10810372,477799896.0,Front View
...,...,...,...
2879426,12981985,651301700.0,Rear View Right
2879427,13548901,651304978.0,Rear View Right
2879428,13313567,652815210.0,Rear View Right
2879429,13551639,654542221.0,Rear View Right


In [17]:
filesDf = filesDf[filesDf["CaseID"].isin(availableImgsCase)]
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
9,10000123,437299477.0,Front View
10,10000358,437300189.0,Front View
11,10000449,437304372.0,Front View
14,10000450,437321843.0,Front View
16,10000515,437326556.0,Front View
...,...,...,...
2795329,13074116,595377127.0,Rear View Right
2795657,13067177,595451910.0,Rear View Right
2796900,13079952,595854084.0,Rear View Right
2799008,13068882,596511595.0,Rear View Right


In [18]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
srcBucketName = "multilabel_df"
labelDf = wr.s3.read_parquet(
    path=f"s3://{srcBucketName}/",
    dataset=True
)
balanceLabelDf = labelDf[labelDf["CaseID"].isin(filesDf["CaseID"].unique().tolist())]
balanceLabelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear,CaseID,Circumstances_of_Accident
151,1,1,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12660475,
455,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,12665165,Collided into a tree
567,1,1,1,1,1,0,0,1,0,1,...,1,0,0,0,0,0,0,1,12666515,Collision- Head on collision
682,1,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,12668597,Collision- Head to Rear (Insured Hit TP)
892,1,1,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12671639,Collided into lamp post
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598119,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10138474,
598120,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10143783,
598121,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10147236,
598123,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174259,


In [19]:
all_parts = [x for x in balanceLabelDf.columns if "vision_" in x]
all_parts

['vision_bonnet',
 'vision_bumper_front',
 'vision_grille',
 'vision_headlamp_rh',
 'vision_headlamp_lh',
 'vision_door_front_lh',
 'vision_door_front_rh',
 'vision_engine',
 'vision_bumper_rear',
 'vision_misc',
 'vision_front_panel',
 'vision_non_external',
 'vision_wheel',
 'vision_fender_front_lh',
 'vision_fender_front_rh',
 'vision_rear_quarter_lh',
 'vision_tail_lamp_lh',
 'vision_tail_lamp_rh',
 'vision_windscreen_front',
 'vision_rear_compartment',
 'vision_rear_panel',
 'vision_rear_quarter_rh',
 'vision_door_rear_rh',
 'vision_door_rear_lh']

In [20]:
# partToBalance = [
#     "vision_door_rear_lh",
#     "vision_door_rear_rh", 
#     "vision_rear_quarter_lh", 
#     "vision_rear_quarter_rh",
#     "vision_tail_lamp_lh",
#     "vision_tail_lamp_rh",
#     "vision_windscreen_front",
#     "vision_door_front_lh",
#     "vision_door_front_rh",

#     ]
# caseIdForBalance = []
# balanceSampleCount = 3000
# for part in partToBalance:
#     targetCaseId = balanceLabelDf[balanceLabelDf[part] == 1].head(balanceSampleCount)["CaseID"].unique().tolist()
#     caseIdForBalance.extend(targetCaseId)
# uniqueCaseIdForBalance = set(caseIdForBalance)
# print(len(uniqueCaseIdForBalance))

In [21]:
filesDf["filename"] = filesDf[['CaseID', "iDOCID"]].apply(lambda x : str(int(x["CaseID"])) + "_" + str(int(x["iDOCID"])) + ".JPG", axis=1)
filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



9          10000123_437299477.JPG
10         10000358_437300189.JPG
11         10000449_437304372.JPG
14         10000450_437321843.JPG
16         10000515_437326556.JPG
                    ...          
2795329    13074116_595377127.JPG
2795657    13067177_595451910.JPG
2796900    13079952_595854084.JPG
2799008    13068882_596511595.JPG
2817679    12913298_602508321.JPG
Name: filename, Length: 199412, dtype: object

In [22]:
endpoint = "http://192.168.1.4:8888/buckets/raw_imgs/"
filesDf["url"] = endpoint + filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
totalCase = 10000
numCaseToDownload = totalCase

numCaseToDownload


10000

In [24]:
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
9,10000123,437299477.0,Front View,10000123_437299477.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
10,10000358,437300189.0,Front View,10000358_437300189.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
11,10000449,437304372.0,Front View,10000449_437304372.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
14,10000450,437321843.0,Front View,10000450_437321843.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
16,10000515,437326556.0,Front View,10000515_437326556.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
...,...,...,...,...,...
2795329,13074116,595377127.0,Rear View Right,13074116_595377127.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13074...
2795657,13067177,595451910.0,Rear View Right,13067177_595451910.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13067...
2796900,13079952,595854084.0,Rear View Right,13079952_595854084.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13079...
2799008,13068882,596511595.0,Rear View Right,13068882_596511595.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13068...


In [25]:
targetCaseId = filesDf['CaseID'].unique().tolist()
random.shuffle(targetCaseId)
targetCaseId = targetCaseId[:numCaseToDownload]
# targetCaseId.extend(uniqueCaseIdForBalance)
targetCaseId = list(set(targetCaseId))
if(len(targetCaseId) < totalCase):
    remainder = totalCase - len(targetCaseId)
    caseToAdd = filesDf[~filesDf['CaseID'].isin(targetCaseId)]["CaseID"].tolist()[:remainder]
    targetCaseId.extend(caseToAdd)

In [26]:
downloadFileDf = filesDf[filesDf["CaseID"].isin(targetCaseId)]

In [27]:
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
16,10000515,437326556.0,Front View,10000515_437326556.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
19,10001070,437336634.0,Front View,10001070_437336634.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
20,10001217,437355562.0,Front View,10001217_437355562.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
24,10001500,437374687.0,Front View,10001500_437374687.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
30,10001364,437380493.0,Front View,10001364_437380493.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
...,...,...,...,...,...
2794109,13071067,594976669.0,Rear View Right,13071067_594976669.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13071...
2794399,13072671,595064894.0,Rear View Right,13072671_595064894.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13072...
2794423,13076951,595070636.0,Rear View Right,13076951_595070636.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
2794570,13075282,595114812.0,Rear View Right,13075282_595114812.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13075...


In [28]:
print(len(targetCaseId))

10000


In [29]:
sinkDir = "/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs"
# localFiles = os.listdir(sinkDir)
# downloadFileDf = downloadFileDf[~downloadFileDf["filename"].isin(localFiles)]


In [30]:
def transport_worker(df:pd.DataFrame):
    s = requests.Session()
    for _, i in tqdm(df.iterrows(), desc="files"):
        url = i["url"]
        imgBytes = s.get(url)
        localPath = os.path.join(sinkDir, i["filename"])
        with open(localPath, "wb") as f:
            f.write(imgBytes.content)

In [31]:

os.makedirs(sinkDir, exist_ok=True)
s = requests.Session()
batchTask = []
batchSize = 1000
for i in range(0, len(downloadFileDf), batchSize):
    batchTask.append(downloadFileDf.iloc[i : i + batchSize])
Parallel(n_jobs=10)(delayed(transport_worker)(taskDf) for taskDf in tqdm(batchTask, desc="tasks"))
# for _, i in tqdm(downloadFileDf.iterrows()):
#     url = i["url"]
#     imgBytes = s.get(url)
#     localPath = os.path.join(sinkDir, i["filename"])
#     with open(localPath, "wb") as f:
#         f.write(imgBytes.content)
    


files: 1000it [03:54,  4.26it/s]0:19<00:00, 64.61it/s]
files: 1000it [04:00,  4.16it/s]3:55<06:55, 13.84s/it]
files: 1000it [04:02,  4.13it/s]
files: 1000it [04:02,  4.12it/s]
files: 1000it [04:04,  4.08it/s]
files: 1000it [04:04,  4.08it/s]
files: 1000it [04:08,  4.03it/s]
files: 1000it [04:08,  4.02it/s]
files: 1000it [04:15,  3.92it/s]
files: 1000it [04:16,  3.89it/s]
files: 1000it [03:44,  4.45it/s]
files: 1000it [03:50,  4.34it/s]7:50<06:05, 18.25s/it]
files: 1000it [03:56,  4.22it/s]
files: 1000it [03:51,  4.31it/s]
files: 1000it [03:49,  4.36it/s]
files: 1000it [03:52,  4.30it/s]
files: 1000it [03:54,  4.27it/s]
files: 1000it [03:53,  4.28it/s]
files: 1000it [03:53,  4.28it/s]
files: 1000it [03:53,  4.29it/s]
files: 1000it [03:32,  4.70it/s]
files: 1000it [03:35,  4.64it/s]1:28<03:16, 19.66s/it]
files: 1000it [03:42,  4.49it/s]
files: 1000it [03:30,  4.74it/s]
files: 1000it [03:39,  4.55it/s]
files: 1000it [03:43,  4.48it/s]
files: 1000it [03:33,  4.69it/s]
files: 1000it [03:44,

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [32]:
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
16,10000515,437326556.0,Front View,10000515_437326556.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
19,10001070,437336634.0,Front View,10001070_437336634.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
20,10001217,437355562.0,Front View,10001217_437355562.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
24,10001500,437374687.0,Front View,10001500_437374687.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
30,10001364,437380493.0,Front View,10001364_437380493.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
...,...,...,...,...,...
2794109,13071067,594976669.0,Rear View Right,13071067_594976669.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13071...
2794399,13072671,595064894.0,Rear View Right,13072671_595064894.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13072...
2794423,13076951,595070636.0,Rear View Right,13076951_595070636.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
2794570,13075282,595114812.0,Rear View Right,13075282_595114812.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13075...


In [35]:
downloadFileDf.to_parquet(f"../../data/imgs_metadata/{vehicleType}.parquet")

In [36]:
pd.read_parquet(f"/home/alextay96/Desktop/all_workspace/new_workspace/DLDataPipeline/data/imgs_metadata/{vehicleType}.parquet")

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
16,10000515,437326556.0,Front View,10000515_437326556.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
19,10001070,437336634.0,Front View,10001070_437336634.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
20,10001217,437355562.0,Front View,10001217_437355562.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
24,10001500,437374687.0,Front View,10001500_437374687.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
30,10001364,437380493.0,Front View,10001364_437380493.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
...,...,...,...,...,...
2794109,13071067,594976669.0,Rear View Right,13071067_594976669.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13071...
2794399,13072671,595064894.0,Rear View Right,13072671_595064894.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13072...
2794423,13076951,595070636.0,Rear View Right,13076951_595070636.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13076...
2794570,13075282,595114812.0,Rear View Right,13075282_595114812.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13075...
