In [1]:

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import os
import random


In [2]:
# Point awswrangler at the S3-compatible endpoint on the local network; the
# wr.s3.read_parquet calls further down go through this setting.
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"


In [3]:
# Vehicle type this run is scoped to: used as the partition filter when loading
# cases and as the file name of the saved image metadata.
vehicleType = "SUV - 5 Dr"


In [4]:
# Load CaseID / Vehicle_Type for the cases whose partition matches vehicleType.
bucket3 = "scope_case"
targetCols = ["CaseID", "Vehicle_Type"]


def myFilter(x):
    """Return True for partitions whose Vehicle_Type equals the selected vehicleType."""
    return x["Vehicle_Type"] == vehicleType


caseDf = wr.s3.read_parquet(
    f"s3://{bucket3}/",
    dataset=True,
    columns=targetCols,
    partition_filter=myFilter,
)
caseDf

Unnamed: 0,CaseID,Vehicle_Type
0,10000004,Hatchback - 5 Dr
1,10000036,Hatchback - 5 Dr
2,10000042,Hatchback - 5 Dr
3,10000050,Hatchback - 5 Dr
4,10000068,Hatchback - 5 Dr
...,...,...
183822,13562877,Hatchback - 5 Dr
183823,13562891,Hatchback - 5 Dr
183824,13562893,Hatchback - 5 Dr
183825,13562976,Hatchback - 5 Dr


In [5]:
# List every object in the image bucket and collect the distinct case IDs that
# have at least one image. Object keys are expected to start with
# "<CaseID>_", so the case ID is the integer before the first underscore.
bucketName = "raw_imgs"
cli = boto3.client("s3", endpoint_url="http://192.168.1.7:8333")
paginator = cli.get_paginator("list_objects_v2")
operation_parameters = {"Bucket": bucketName}
page_iterator = paginator.paginate(**operation_parameters)

# Accumulate into one set across all pages: a case whose images straddle a page
# boundary would otherwise be counted once per page.
uniqueCaseIds = set()
for page in tqdm(page_iterator):
    # "Contents" is absent from the response when a page holds no objects
    # (e.g. an empty bucket), so fall back to an empty list instead of raising.
    pageContent = page.get("Contents", [])
    uniqueCaseIds.update(int(x["Key"].split("_")[0]) for x in pageContent)

# Kept as a list (now duplicate-free and sorted) so later cells that wrap it in
# set(...) keep working unchanged.
downloadedImgs = sorted(uniqueCaseIds)
print(len(downloadedImgs))

44it [01:02,  1.43s/it]

87725





In [6]:
# Case IDs that are both in scope (present in caseDf) and have images in the bucket.
caseIdsInScope = caseDf["CaseID"].unique().tolist()
bucketCaseIds = set(downloadedImgs)
availableImgsCase = {caseId for caseId in caseIdsInScope if caseId in bucketCaseIds}
downloadedCaseLength = len(availableImgsCase)

In [7]:
# In-scope cases that have no image in the bucket. Count distinct CaseIDs rather
# than rows so this is consistent with downloadedCaseLength, which is a count of
# unique IDs; with a row count, any repeated CaseID would inflate the result.
notDownloadedCaseLength = caseDf["CaseID"].nunique() - downloadedCaseLength

In [8]:
# Fraction of in-scope cases that have images available. The denominator is the
# number of distinct CaseIDs (the numerator is a set of unique IDs), and an empty
# case table yields 0.0 instead of a ZeroDivisionError.
totalCases = caseDf["CaseID"].nunique()
availableRatio = len(availableImgsCase) / totalCases if totalCases else 0.0
availableRatio

0.17365240144266075

In [9]:
# Two-row summary table (available vs. not available) that feeds the pie chart.
trackDf = pd.DataFrame(
    {
        "label": ["available", "not_available"],
        "cases": [downloadedCaseLength, notDownloadedCaseLength],
    }
)

In [10]:
# Pie chart of in-scope cases with / without images in the bucket.
# plotly.express is already imported as `px` in the import cell at the top.
#
# `names` is the argument that picks the column used to name each slice.
# `labels` is a dict for renaming columns in legend/hover text, so the column
# name previously passed there was not used to name the slices.
fig = px.pie(
    data_frame=trackDf,
    values="cases",
    names="label",
    title="Image availability by case",
)
fig.show()

In [11]:
# Load the (CaseID, iDOCID) pairs describing every image file record.
bucket3 = "scope_file"
targetCols = ["CaseID", "iDOCID"]
filesDf = wr.s3.read_parquet(
    path=f"s3://{bucket3}/",
    columns=targetCols,
    dataset=True,
)
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
0,10375210,456204471.0,Front View
1,10839776,479277541.0,Front View
2,10595863,518945875.0,Front View
3,10324518,453575575.0,Front View
4,10810372,477799896.0,Front View
...,...,...,...
2997463,12981985,651301700.0,Rear View Right
2997464,13548901,651304978.0,Rear View Right
2997465,13313567,652815210.0,Rear View Right
2997466,13551639,654542221.0,Rear View Right


In [12]:
# Keep only file records belonging to cases that have images in the bucket.
# .copy() makes the result an independent frame: later cells add columns to
# filesDf, and doing that on a boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
filesDf = filesDf[filesDf["CaseID"].isin(availableImgsCase)].copy()
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
2,10595863,518945875.0,Front View
99,10004002,437483424.0,Front View
137,10003125,437513895.0,Front View
251,10004204,437628283.0,Front View
312,10006884,437683694.0,Front View
...,...,...,...
2997154,12702538,624386744.0,Rear View Right
2997184,13554985,624837551.0,Rear View Right
2997416,13469224,637179003.0,Rear View Right
2997443,13177588,643593933.0,Rear View Right


In [13]:
# Build the object file name "<CaseID>_<iDOCID>.JPG" for every record.
# Both IDs are cast to int64 first so float-typed values such as 437483424.0
# render without a trailing ".0". This is done column-wise instead of with a
# row-by-row apply, which is slow on a frame of this size.
caseIdText = filesDf["CaseID"].astype("int64").astype(str)
docIdText = filesDf["iDOCID"].astype("int64").astype(str)

# assign() returns a new frame, so the column is never set on a slice of
# another DataFrame (the cause of SettingWithCopyWarning).
filesDf = filesDf.assign(filename=caseIdText + "_" + docIdText + ".JPG")
filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2          10595863_518945875.JPG
99         10004002_437483424.JPG
137        10003125_437513895.JPG
251        10004204_437628283.JPG
312        10006884_437683694.JPG
                    ...          
2997154    12702538_624386744.JPG
2997184    13554985_624837551.JPG
2997416    13469224_637179003.JPG
2997443    13177588_643593933.JPG
2997447    13444457_644455970.JPG
Name: filename, Length: 152138, dtype: object

In [14]:
# HTTP base URL under which the raw images are served; the download URL of a
# file is this prefix followed by its file name.
endpoint = "http://192.168.1.7:8888/buckets/raw_imgs/"

# assign() builds a new frame instead of setting a column on what may be a
# slice of another DataFrame, which avoids SettingWithCopyWarning.
filesDf = filesDf.assign(url=endpoint + filesDf["filename"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
# Upper bound on how many distinct cases are sampled for download.
numCaseToDownload = 20000


In [16]:
# Pick a random sample of at most numCaseToDownload cases and keep their files.
# The shuffle uses its own seeded generator so a re-run selects the same cases;
# with an unseeded shuffle an interrupted download could not be resumed, because
# the next run would draw a different sample.
sampleSeed = 42
targetCaseId = filesDf["CaseID"].unique().tolist()
random.Random(sampleSeed).shuffle(targetCaseId)
targetCaseId = targetCaseId[:numCaseToDownload]
downloadFileDf = filesDf[filesDf["CaseID"].isin(targetCaseId)]

In [None]:
# Number of cases actually sampled (at most numCaseToDownload).
print(len(targetCaseId))

In [17]:
# Local directory the images are written to.
sinkDir = "/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs"

# Create the directory before listing it: on a machine where it does not exist
# yet, os.listdir would raise FileNotFoundError.
os.makedirs(sinkDir, exist_ok=True)
localFiles = os.listdir(sinkDir)

# Skip files that are already present locally so a re-run only fetches what is missing.
downloadFileDf = downloadFileDf[~downloadFileDf["filename"].isin(localFiles)]


In [18]:

# Download every queued image into sinkDir, one HTTP GET per file.
os.makedirs(sinkDir, exist_ok=True)

# (connect, read) timeouts in seconds, so one stalled request cannot hang the whole run.
requestTimeout = (5, 60)
# (url, error message) for every file that could not be fetched.
failedDownloads = []

# The session is used as a context manager so its pooled connections are closed at the end.
with requests.Session() as s:
    pending = zip(downloadFileDf["url"], downloadFileDf["filename"])
    # total= lets tqdm show a percentage / ETA instead of a bare counter.
    for url, filename in tqdm(pending, total=len(downloadFileDf)):
        try:
            imgBytes = s.get(url, timeout=requestTimeout)
            # Without this check the body of a 404/500 response would be
            # written to disk as if it were a valid .JPG.
            imgBytes.raise_for_status()
        except requests.RequestException as err:
            # Record the failure and continue; nothing is written for this
            # file, so it is still missing locally and a later run retries it.
            failedDownloads.append((url, str(err)))
            continue
        localPath = os.path.join(sinkDir, filename)
        with open(localPath, "wb") as f:
            f.write(imgBytes.content)

print(f"{len(failedDownloads)} downloads failed")
    


95326it [1:15:51, 20.95it/s]


In [19]:
# Preview the file records that were queued for download in this run.
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
312,10006884,437683694.0,Front View,10006884_437683694.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10006...
314,10007786,437684300.0,Front View,10007786_437684300.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10007...
409,10009599,437787494.0,Front View,10009599_437787494.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10009...
457,10010742,437831121.0,Front View,10010742_437831121.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10010...
854,10008478,438159522.0,Front View,10008478_438159522.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2997107,13514130,623705040.0,Rear View Right,13514130_623705040.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13514...
2997133,13559885,624070256.0,Rear View Right,13559885_624070256.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13559...
2997154,12702538,624386744.0,Rear View Right,12702538_624386744.JPG,http://192.168.1.7:8888/buckets/raw_imgs/12702...
2997184,13554985,624837551.0,Rear View Right,13554985_624837551.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13554...


In [20]:
# Persist the metadata (one row per image file) for the selected vehicle type.
metadataPath = f"../../data/imgs_metadata/{vehicleType}.parquet"

# Make sure the target directory exists, otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(metadataPath), exist_ok=True)

# NOTE(review): downloadFileDf was filtered earlier to exclude files already
# present in sinkDir, so on a re-run this overwrites the metadata with only the
# rows fetched in the latest run — confirm that is intended.
downloadFileDf.to_parquet(metadataPath)

In [21]:
# Read the saved metadata back as a sanity check.
# NOTE(review): this uses an absolute path, while the write cell uses the relative
# path "../../data/imgs_metadata/"; they only refer to the same file if the
# notebook's working directory is two levels below DLDataPipeline — confirm.
pd.read_parquet(f"/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs_metadata/{vehicleType}.parquet")

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
312,10006884,437683694.0,Front View,10006884_437683694.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10006...
314,10007786,437684300.0,Front View,10007786_437684300.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10007...
409,10009599,437787494.0,Front View,10009599_437787494.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10009...
457,10010742,437831121.0,Front View,10010742_437831121.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10010...
854,10008478,438159522.0,Front View,10008478_438159522.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2997107,13514130,623705040.0,Rear View Right,13514130_623705040.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13514...
2997133,13559885,624070256.0,Rear View Right,13559885_624070256.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13559...
2997154,12702538,624386744.0,Rear View Right,12702538_624386744.JPG,http://192.168.1.7:8888/buckets/raw_imgs/12702...
2997184,13554985,624837551.0,Rear View Right,13554985_624837551.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13554...
