In [30]:

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import os
import random
from joblib import Parallel , delayed


In [31]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"


In [32]:
vehicleType = "SUV - 5 Dr"


In [33]:
bucket3 = "scope_case"
myFilter =  lambda x : x["Vehicle_Type"] == vehicleType
targetCols = ["CaseID", "Vehicle_Type"]
caseDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,partition_filter=myFilter,dataset=True, columns=targetCols)
caseDf

Unnamed: 0,CaseID,Vehicle_Type
0,10000037,SUV - 5 Dr
1,10000064,SUV - 5 Dr
2,10000066,SUV - 5 Dr
3,10000117,SUV - 5 Dr
4,10000125,SUV - 5 Dr
...,...,...
63737,13562831,SUV - 5 Dr
63738,13562839,SUV - 5 Dr
63739,13562932,SUV - 5 Dr
63740,13562968,SUV - 5 Dr


In [34]:
bucketName = "raw_imgs"
cli = boto3.client(
    "s3",
    **{
        "endpoint_url": "http://192.168.1.4:8333",
    }
)
paginator = cli.get_paginator("list_objects_v2")
operation_parameters = {"Bucket": bucketName}
page_iterator = paginator.paginate(**operation_parameters)
downloadedImgs = []
for page in tqdm(page_iterator):
    pageContent = page["Contents"]
    downloadedCaseId = set([int(x["Key"].split("_")[0]) for x in pageContent])
    downloadedImgs.extend(downloadedCaseId)
print(len(downloadedImgs))

52it [00:45,  1.15it/s]

103778





In [35]:
availableImgsCase = set(downloadedImgs).intersection(caseDf["CaseID"].unique().tolist())
downloadedCaseLength = len(availableImgsCase)

In [36]:
notDownloadedCaseLength = len(caseDf) - downloadedCaseLength

In [37]:
availableRatio = len(availableImgsCase) / len(caseDf)
availableRatio

0.34625521634087414

In [38]:
trackDf = pd.json_normalize([{
    "label" : "available",
    "cases" : downloadedCaseLength
},
                             {
    "label" : "not_available",
    "cases" : notDownloadedCaseLength
}
                             
                             
                             ])

In [39]:
import plotly.express as px

fig = px.pie(data_frame=trackDf, values="cases", labels="label")
fig.show()

In [40]:
bucket3 = "scope_file"
targetCols = ["CaseID", "iDOCID"]
filesDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,dataset=True, columns=targetCols)
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
0,10375210,456204471.0,Front View
1,10839776,479277541.0,Front View
2,10595863,518945875.0,Front View
3,10324518,453575575.0,Front View
4,10810372,477799896.0,Front View
...,...,...,...
2879426,12981985,651301700.0,Rear View Right
2879427,13548901,651304978.0,Rear View Right
2879428,13313567,652815210.0,Rear View Right
2879429,13551639,654542221.0,Rear View Right


In [41]:
filesDf = filesDf[filesDf["CaseID"].isin(availableImgsCase)]
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
25,10001668,437375090.0,Front View
42,10002172,437396585.0,Front View
86,10002757,437468727.0,Front View
117,10003903,437500662.0,Front View
119,10004377,437506049.0,Front View
...,...,...,...
2879298,13396834,630249507.0,Rear View Right
2879308,13465605,630958730.0,Rear View Right
2879378,13490232,636651487.0,Rear View Right
2879409,13553114,644215059.0,Rear View Right


In [42]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
srcBucketName = "multilabel_df"
labelDf = wr.s3.read_parquet(
    path=f"s3://{srcBucketName}/",
    dataset=True
)
balanceLabelDf = labelDf[labelDf["CaseID"].isin(filesDf["CaseID"].unique().tolist())]
balanceLabelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,Assembly_Type,Vehicle_Still_Driveable,NCB_Stat,Claim_Type,Vehicle_Type,Sum_Insured,Repairer,Repairer_Apprv_Count,Collision_With,Handling_Insurer
15,1,1,1,1,0,0,0,0,0,1,...,3,1,1,OD,SUV - 5 Dr,75000.0,Ngu Chung Beng & Sons Corporation Sdn Bhd. (Ng...,3169,Others,Chubb Insurance Malaysia Berhad
49,1,1,1,1,1,0,1,1,0,0,...,3,0,1,OD,SUV - 5 Dr,39600.0,Syarikat Kong Wah Auto Sdn Bhd (HQ),3681,Private Vehicle,The Pacific Insurance Bhd
55,1,1,1,1,1,0,0,0,0,1,...,3,0,1,OD,SUV - 5 Dr,42000.0,Tan Chong Ekspres Auto Servis Sdn Bhd (KL - JL...,1049,Private Vehicle,Berjaya Sompo Insurance Berhad
96,1,1,1,1,1,1,1,1,1,1,...,2,0,1,OD,SUV - 5 Dr,90000.0,Target Orion Star Sdn Bhd (Target Orion Star K...,365,Others,Etiqa General Takaful Berhad
113,1,1,1,0,1,0,0,1,0,0,...,2,0,1,OD,SUV - 5 Dr,94000.0,Warna Bestari Sdn Bhd (Warna Bestari),2869,Animal,Allianz General Insurance Company (Malaysia) B...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597896,0,0,0,0,0,0,0,1,0,0,...,3,1,0,OD EXW,SUV - 5 Dr,40000.0,Tan Chong Ekspres Auto Servis Sdn Bhd (KLANG),0,,RHB Insurance Berhad
597909,0,0,0,0,0,0,0,1,0,0,...,3,1,0,OD EXW,SUV - 5 Dr,40000.0,Tan Chong Ekspres Auto Servis Sdn Bhd (PJ),0,,RHB Insurance Berhad
598016,0,0,0,0,0,0,0,1,0,0,...,3,1,0,OD EXW,SUV - 5 Dr,40000.0,Tan Chong Ekspres Auto Servis Sdn Bhd (Batu Ca...,0,,RHB Insurance Berhad
598029,0,0,0,0,0,0,0,1,0,0,...,3,1,0,OD EXW,SUV - 5 Dr,40000.0,Tan Chong Ekspres Auto Servis Sdn Bhd (PJ),0,,RHB Insurance Berhad


In [43]:
all_parts = [x for x in balanceLabelDf.columns if "vision_" in x]
all_parts

['vision_bonnet',
 'vision_bumper_front',
 'vision_grille',
 'vision_headlamp_rh',
 'vision_headlamp_lh',
 'vision_door_front_lh',
 'vision_door_front_rh',
 'vision_engine',
 'vision_bumper_rear',
 'vision_misc',
 'vision_front_panel',
 'vision_non_external',
 'vision_wheel',
 'vision_fender_front_lh',
 'vision_fender_front_rh',
 'vision_rear_quarter_lh',
 'vision_tail_lamp_lh',
 'vision_tail_lamp_rh',
 'vision_windscreen_front',
 'vision_rear_compartment',
 'vision_rear_panel',
 'vision_rear_quarter_rh',
 'vision_door_rear_rh',
 'vision_door_rear_lh']

In [44]:
# partToBalance = [
#     "vision_door_rear_lh",
#     "vision_door_rear_rh", 
#     "vision_rear_quarter_lh", 
#     "vision_rear_quarter_rh",
#     "vision_tail_lamp_lh",
#     "vision_tail_lamp_rh",
#     "vision_windscreen_front",
#     "vision_door_front_lh",
#     "vision_door_front_rh",

#     ]
# caseIdForBalance = []
# balanceSampleCount = 3000
# for part in partToBalance:
#     targetCaseId = balanceLabelDf[balanceLabelDf[part] == 1].head(balanceSampleCount)["CaseID"].unique().tolist()
#     caseIdForBalance.extend(targetCaseId)
# uniqueCaseIdForBalance = set(caseIdForBalance)
# print(len(uniqueCaseIdForBalance))

In [45]:
filesDf["filename"] = filesDf[['CaseID', "iDOCID"]].apply(lambda x : str(int(x["CaseID"])) + "_" + str(int(x["iDOCID"])) + ".JPG", axis=1)
filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



25         10001668_437375090.JPG
42         10002172_437396585.JPG
86         10002757_437468727.JPG
117        10003903_437500662.JPG
119        10004377_437506049.JPG
                    ...          
2879298    13396834_630249507.JPG
2879308    13465605_630958730.JPG
2879378    13490232_636651487.JPG
2879409    13553114_644215059.JPG
2879412    13367788_645189473.JPG
Name: filename, Length: 113629, dtype: object

In [46]:
endpoint = "http://192.168.1.4:8888/buckets/raw_imgs/"
filesDf["url"] = endpoint + filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
totalCase = 10000
numCaseToDownload = totalCase

numCaseToDownload


10000

In [48]:
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
25,10001668,437375090.0,Front View,10001668_437375090.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10001...
42,10002172,437396585.0,Front View,10002172_437396585.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10002...
86,10002757,437468727.0,Front View,10002757_437468727.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10002...
117,10003903,437500662.0,Front View,10003903_437500662.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10003...
119,10004377,437506049.0,Front View,10004377_437506049.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
...,...,...,...,...,...
2879298,13396834,630249507.0,Rear View Right,13396834_630249507.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13396...
2879308,13465605,630958730.0,Rear View Right,13465605_630958730.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13465...
2879378,13490232,636651487.0,Rear View Right,13490232_636651487.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13490...
2879409,13553114,644215059.0,Rear View Right,13553114_644215059.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13553...


In [49]:
targetCaseId = filesDf['CaseID'].unique().tolist()
random.shuffle(targetCaseId)
targetCaseId = targetCaseId[:numCaseToDownload]
# targetCaseId.extend(uniqueCaseIdForBalance)
targetCaseId = list(set(targetCaseId))
if(len(targetCaseId) < totalCase):
    remainder = totalCase - len(targetCaseId)
    caseToAdd = filesDf[~filesDf['CaseID'].isin(targetCaseId)]["CaseID"].tolist()[:remainder]
    targetCaseId.extend(caseToAdd)

In [50]:
downloadFileDf = filesDf[filesDf["CaseID"].isin(targetCaseId)]

In [51]:
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
117,10003903,437500662.0,Front View,10003903_437500662.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10003...
119,10004377,437506049.0,Front View,10004377_437506049.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
135,10004779,437517431.0,Front View,10004779_437517431.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
176,10000483,437562613.0,Front View,10000483_437562613.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
372,10008793,437783094.0,Front View,10008793_437783094.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2879095,13552504,623729774.0,Rear View Right,13552504_623729774.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13552...
2879290,13165756,629543810.0,Rear View Right,13165756_629543810.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13165...
2879308,13465605,630958730.0,Rear View Right,13465605_630958730.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13465...
2879378,13490232,636651487.0,Rear View Right,13490232_636651487.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13490...


In [52]:
print(len(targetCaseId))

10000


In [53]:
sinkDir = "/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs"
# localFiles = os.listdir(sinkDir)
# downloadFileDf = downloadFileDf[~downloadFileDf["filename"].isin(localFiles)]


In [54]:
def transport_worker(df:pd.DataFrame):
    s = requests.Session()
    for _, i in tqdm(df.iterrows(), desc="files"):
        url = i["url"]
        imgBytes = s.get(url)
        localPath = os.path.join(sinkDir, i["filename"])
        with open(localPath, "wb") as f:
            f.write(imgBytes.content)

In [55]:

os.makedirs(sinkDir, exist_ok=True)
s = requests.Session()
batchTask = []
batchSize = 1000
for i in range(0, len(downloadFileDf), batchSize):
    batchTask.append(downloadFileDf.iloc[i : i + batchSize])
Parallel(n_jobs=10)(delayed(transport_worker)(taskDf) for taskDf in tqdm(batchTask, desc="tasks"))
# for _, i in tqdm(downloadFileDf.iterrows()):
#     url = i["url"]
#     imgBytes = s.get(url)
#     localPath = os.path.join(sinkDir, i["filename"])
#     with open(localPath, "wb") as f:
#         f.write(imgBytes.content)
    


files: 1000it [02:35,  6.44it/s]0:19<00:00, 56.35it/s]
files: 1000it [02:35,  6.44it/s]2:35<04:53,  9.16s/it]
files: 1000it [02:35,  6.42it/s]
files: 1000it [02:36,  6.38it/s]
files: 1000it [02:36,  6.39it/s]
files: 1000it [02:36,  6.38it/s]
files: 1000it [02:36,  6.38it/s]
files: 1000it [02:37,  6.36it/s]
files: 1000it [02:38,  6.32it/s]
files: 1000it [02:41,  6.19it/s]
files: 1000it [02:28,  6.75it/s]
files: 1000it [02:29,  6.67it/s]5:05<04:19, 11.80s/it]
files: 1000it [02:30,  6.66it/s]
files: 1000it [02:29,  6.69it/s]
files: 1000it [02:30,  6.65it/s]
files: 1000it [02:33,  6.51it/s]
files: 1000it [02:31,  6.61it/s]
files: 1000it [02:32,  6.54it/s]
files: 1000it [02:32,  6.56it/s]
files: 1000it [02:42,  6.14it/s]
files: 1000it [02:30,  6.65it/s]
files: 1000it [02:29,  6.68it/s]7:35<02:37, 13.09s/it]
files: 1000it [02:30,  6.66it/s]
files: 1000it [02:31,  6.62it/s]
files: 1000it [02:34,  6.49it/s]
files: 1000it [02:30,  6.66it/s]
files: 1000it [02:30,  6.66it/s]
files: 1000it [02:30,

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [56]:
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
117,10003903,437500662.0,Front View,10003903_437500662.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10003...
119,10004377,437506049.0,Front View,10004377_437506049.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
135,10004779,437517431.0,Front View,10004779_437517431.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
176,10000483,437562613.0,Front View,10000483_437562613.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
372,10008793,437783094.0,Front View,10008793_437783094.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2879095,13552504,623729774.0,Rear View Right,13552504_623729774.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13552...
2879290,13165756,629543810.0,Rear View Right,13165756_629543810.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13165...
2879308,13465605,630958730.0,Rear View Right,13465605_630958730.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13465...
2879378,13490232,636651487.0,Rear View Right,13490232_636651487.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13490...


In [57]:
downloadFileDf.to_parquet(f"../../data/imgs_metadata/{vehicleType}.parquet")

In [58]:
pd.read_parquet(f"/home/alextay96/Desktop/all_workspace/new_workspace/DLDataPipeline/data/imgs_metadata/{vehicleType}.parquet")

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
117,10003903,437500662.0,Front View,10003903_437500662.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10003...
119,10004377,437506049.0,Front View,10004377_437506049.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
135,10004779,437517431.0,Front View,10004779_437517431.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10004...
176,10000483,437562613.0,Front View,10000483_437562613.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10000...
372,10008793,437783094.0,Front View,10008793_437783094.JPG,http://192.168.1.4:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2879095,13552504,623729774.0,Rear View Right,13552504_623729774.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13552...
2879290,13165756,629543810.0,Rear View Right,13165756_629543810.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13165...
2879308,13465605,630958730.0,Rear View Right,13465605_630958730.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13465...
2879378,13490232,636651487.0,Rear View Right,13490232_636651487.JPG,http://192.168.1.4:8888/buckets/raw_imgs/13490...
