In [22]:

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import os
import random


In [23]:
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"


In [24]:
vehicleType = "SUV - 5 Dr"


In [25]:
bucket3 = "scope_case"
myFilter =  lambda x : x["Vehicle_Type"] == vehicleType
targetCols = ["CaseID", "Vehicle_Type"]
caseDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,partition_filter=myFilter,dataset=True, columns=targetCols)
caseDf

Unnamed: 0,CaseID,Vehicle_Type
0,10000037,SUV - 5 Dr
1,10000064,SUV - 5 Dr
2,10000066,SUV - 5 Dr
3,10000117,SUV - 5 Dr
4,10000125,SUV - 5 Dr
...,...,...
67507,13562831,SUV - 5 Dr
67508,13562839,SUV - 5 Dr
67509,13562932,SUV - 5 Dr
67510,13562968,SUV - 5 Dr


In [26]:
bucketName = "raw_imgs"
cli = boto3.client(
    "s3",
    **{
        "endpoint_url": "http://192.168.1.7:8333",
    }
)
paginator = cli.get_paginator("list_objects_v2")
operation_parameters = {"Bucket": bucketName}
page_iterator = paginator.paginate(**operation_parameters)
downloadedImgs = []
for page in tqdm(page_iterator):
    pageContent = page["Contents"]
    downloadedCaseId = set([int(x["Key"].split("_")[0]) for x in pageContent])
    downloadedImgs.extend(downloadedCaseId)
print(len(downloadedImgs))

48it [00:56,  1.17s/it]

95293





In [27]:
availableImgsCase = set(downloadedImgs).intersection(caseDf["CaseID"].unique().tolist())
downloadedCaseLength = len(availableImgsCase)

In [28]:
notDownloadedCaseLength = len(caseDf) - downloadedCaseLength

In [29]:
availableRatio = len(availableImgsCase) / len(caseDf)
availableRatio

0.2175316980684915

In [30]:
trackDf = pd.json_normalize([{
    "label" : "available",
    "cases" : downloadedCaseLength
},
                             {
    "label" : "not_available",
    "cases" : notDownloadedCaseLength
}
                             
                             
                             ])

In [31]:
import plotly.express as px

fig = px.pie(data_frame=trackDf, values="cases", labels="label")
fig.show()

In [32]:
bucket3 = "scope_file"
targetCols = ["CaseID", "iDOCID"]
filesDf = wr.s3.read_parquet(f"s3://{bucket3}/" ,dataset=True, columns=targetCols)
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
0,10375210,456204471.0,Front View
1,10839776,479277541.0,Front View
2,10595863,518945875.0,Front View
3,10324518,453575575.0,Front View
4,10810372,477799896.0,Front View
...,...,...,...
2997463,12981985,651301700.0,Rear View Right
2997464,13548901,651304978.0,Rear View Right
2997465,13313567,652815210.0,Rear View Right
2997466,13551639,654542221.0,Rear View Right


In [33]:
filesDf = filesDf[filesDf["CaseID"].isin(availableImgsCase)]
filesDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc
26,10001668,437375090.0,Front View
92,10002757,437468727.0,Front View
128,10004377,437506049.0,Front View
131,10004317,437508407.0,Front View
219,10006100,437605556.0,Front View
...,...,...,...
2997261,13532795,627137277.0,Rear View Right
2997263,13549423,627156658.0,Rear View Right
2997319,13165756,629543810.0,Rear View Right
2997327,13396834,630249507.0,Rear View Right


In [34]:
filesDf["filename"] = filesDf[['CaseID', "iDOCID"]].apply(lambda x : str(int(x["CaseID"])) + "_" + str(int(x["iDOCID"])) + ".JPG", axis=1)
filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



26         10001668_437375090.JPG
92         10002757_437468727.JPG
128        10004377_437506049.JPG
131        10004317_437508407.JPG
219        10006100_437605556.JPG
                    ...          
2997261    13532795_627137277.JPG
2997263    13549423_627156658.JPG
2997319    13165756_629543810.JPG
2997327    13396834_630249507.JPG
2997448    13367788_645189473.JPG
Name: filename, Length: 76714, dtype: object

In [35]:
endpoint = "http://192.168.1.7:8888/buckets/raw_imgs/"
filesDf["url"] = endpoint + filesDf["filename"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
numCaseToDownload = 20000


In [37]:
len(filesDf['CaseID'].unique())
targetCaseId = filesDf['CaseID'].unique().tolist()
random.shuffle(targetCaseId)
targetCaseId = targetCaseId[:numCaseToDownload]
downloadFileDf = filesDf[filesDf["CaseID"].isin(targetCaseId)]

In [38]:
print(len(targetCaseId))

14686


In [39]:
sinkDir = "/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs"
localFiles = os.listdir(sinkDir)
downloadFileDf = downloadFileDf[~downloadFileDf["filename"].isin(localFiles)]


In [40]:

os.makedirs(sinkDir, exist_ok=True)
s = requests.Session()

for _, i in tqdm(downloadFileDf.iterrows()):
    url = i["url"]
    imgBytes = s.get(url)
    localPath = os.path.join(sinkDir, i["filename"])
    with open(localPath, "wb") as f:
        f.write(imgBytes.content)
    


4295it [02:10, 32.96it/s]


KeyboardInterrupt: 

In [None]:
downloadFileDf

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
312,10006884,437683694.0,Front View,10006884_437683694.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10006...
314,10007786,437684300.0,Front View,10007786_437684300.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10007...
409,10009599,437787494.0,Front View,10009599_437787494.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10009...
457,10010742,437831121.0,Front View,10010742_437831121.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10010...
854,10008478,438159522.0,Front View,10008478_438159522.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2997107,13514130,623705040.0,Rear View Right,13514130_623705040.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13514...
2997133,13559885,624070256.0,Rear View Right,13559885_624070256.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13559...
2997154,12702538,624386744.0,Rear View Right,12702538_624386744.JPG,http://192.168.1.7:8888/buckets/raw_imgs/12702...
2997184,13554985,624837551.0,Rear View Right,13554985_624837551.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13554...


In [None]:
downloadFileDf.to_parquet(f"../../data/imgs_metadata/{vehicleType}.parquet")

In [None]:
pd.read_parquet(f"/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/imgs_metadata/{vehicleType}.parquet")

Unnamed: 0,CaseID,iDOCID,StdDocDesc,filename,url
312,10006884,437683694.0,Front View,10006884_437683694.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10006...
314,10007786,437684300.0,Front View,10007786_437684300.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10007...
409,10009599,437787494.0,Front View,10009599_437787494.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10009...
457,10010742,437831121.0,Front View,10010742_437831121.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10010...
854,10008478,438159522.0,Front View,10008478_438159522.JPG,http://192.168.1.7:8888/buckets/raw_imgs/10008...
...,...,...,...,...,...
2997107,13514130,623705040.0,Rear View Right,13514130_623705040.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13514...
2997133,13559885,624070256.0,Rear View Right,13559885_624070256.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13559...
2997154,12702538,624386744.0,Rear View Right,12702538_624386744.JPG,http://192.168.1.7:8888/buckets/raw_imgs/12702...
2997184,13554985,624837551.0,Rear View Right,13554985_624837551.JPG,http://192.168.1.7:8888/buckets/raw_imgs/13554...
