In [1]:

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import os


In [2]:
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"


In [17]:
vehicleType = "Saloon - 4 Dr"


In [3]:
bucket3 = "scope_case"
myFilter =  lambda x : x["Vehicle_Type"] == vehicleType
targetCols = ["CaseID", "Vehicle_Type"]
caseDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,partition_filter=myFilter,dataset=True, columns=targetCols)
caseDf

Unnamed: 0,CaseID,Vehicle_Type
0,10000008,Saloon - 4 Dr
1,10000016,Saloon - 4 Dr
2,10000030,Saloon - 4 Dr
3,10000038,Saloon - 4 Dr
4,10000063,Saloon - 4 Dr
...,...,...
264668,13562973,Saloon - 4 Dr
264669,13563018,Saloon - 4 Dr
264670,13563063,Saloon - 4 Dr
264671,13563071,Saloon - 4 Dr


In [4]:
bucketName = "raw_imgs"
cli = boto3.client(
    "s3",
    **{
        "endpoint_url": "http://192.168.1.7:8333",
    }
)
paginator = cli.get_paginator("list_objects_v2")
operation_parameters = {"Bucket": bucketName}
page_iterator = paginator.paginate(**operation_parameters)
downloadedImgs = []
for page in tqdm(page_iterator):
    pageContent = page["Contents"]
    downloadedCaseId = set([int(x["Key"].split("_")[0]) for x in pageContent])
    downloadedImgs.extend(downloadedCaseId)
print(len(downloadedImgs))

36it [00:36,  1.00s/it]

71598





In [5]:
availableImgsCase = set(downloadedImgs).intersection(caseDf["CaseID"].unique().tolist())
downloadedCaseLength = len(availableImgsCase)

In [6]:
notDownloadedCaseLength = len(caseDf) - downloadedCaseLength

In [7]:
availableRatio = len(availableImgsCase) / len(caseDf)
availableRatio

0.1551688309725586

In [8]:
trackDf = pd.json_normalize([{
    "label" : "available",
    "cases" : downloadedCaseLength
},
                             {
    "label" : "not_available",
    "cases" : notDownloadedCaseLength
}
                             
                             
                             ])

In [9]:
import plotly.express as px

fig = px.pie(data_frame=trackDf, values="cases", labels="label")
fig.show()

In [10]:
bucket3 = "scope_file"
targetCols = ["CaseID", "iDOCID"]
filesDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,dataset=True, columns=targetCols)
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
0,10375210,456204471.0,Front View
1,10839776,479277541.0,Front View
2,10595863,518945875.0,Front View
3,10324518,453575575.0,Front View
4,10810372,477799896.0,Front View
...,...,...,...
2997463,12981985,651301700.0,Rear View Right
2997464,13548901,651304978.0,Rear View Right
2997465,13313567,652815210.0,Rear View Right
2997466,13551639,654542221.0,Rear View Right


In [11]:
filesDf = filesDf[filesDf["CaseID"].isin(availableImgsCase)]
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
6,10000008,437283237.0,Front View
10,10000123,437299477.0,Front View
11,10000358,437300189.0,Front View
12,10000449,437304372.0,Front View
15,10000450,437321843.0,Front View
...,...,...,...
2910541,13074116,595377127.0,Rear View Right
2910882,13067177,595451910.0,Rear View Right
2912183,13079952,595854084.0,Rear View Right
2914371,13068882,596511595.0,Rear View Right


In [12]:
filesDf["filename"] = filesDf[['CaseID', "iDOCID"]].apply(lambda x : str(int(x["CaseID"])) + "_" + str(int(x["iDOCID"])) + ".JPG", axis=1)
filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



6          10000008_437283237.JPG
10         10000123_437299477.JPG
11         10000358_437300189.JPG
12         10000449_437304372.JPG
15         10000450_437321843.JPG
                    ...          
2910541    13074116_595377127.JPG
2910882    13067177_595451910.JPG
2912183    13079952_595854084.JPG
2914371    13068882_596511595.JPG
2933771    12913298_602508321.JPG
Name: filename, Length: 201451, dtype: object

In [13]:
endpoint = "http://192.168.1.7:8888/buckets/raw_imgs/"
filesDf["url"] = endpoint + filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
import random
len(filesDf['CaseID'].unique())
numCaseToDownload = 20000
targetCaseId = filesDf['CaseID'].unique().tolist()
random.shuffle(targetCaseId)
targetCaseId = targetCaseId[:numCaseToDownload]
downloadFileDf = filesDf[filesDf["CaseID"].isin(targetCaseId)]

In [15]:


sinkDir = "/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs"
os.makedirs(sinkDir, exist_ok=True)
s = requests.Session()

for _, i in tqdm(downloadFileDf.iterrows()):
    url = i["url"]
    imgBytes = s.get(url)
    localPath = os.path.join(sinkDir, i["filename"])
    with open(localPath, "wb") as f:
        f.write(imgBytes.content)
    


98121it [38:54, 42.03it/s]


In [18]:
downloadFileDf.to_parquet(f"../../data/imgs_metadata/{vehicleType}.parquet")