# Labeling of data
using the Stanford_cars_type dataset & 60k images data set

## Imports

In [1]:
from os import listdir
import traceback
import pandas as pd 
import requests
import json
from datetime import datetime

## Import and overview of stanford_cars_type data

In [4]:
cars_type_path = "../../data/meta/stanford_cars_type.csv"
cars_type_df = pd.read_csv(cars_type_path, index_col=0)
cars_type_df

Unnamed: 0,file_name,car_code,car_name,brand,car_type,new_filename
0,00001.jpg,14,Audi TTS Coupe 2012,Audi,Coupe,00001_Audi TTS Coupe 2012.jpg
1,00002.jpg,3,Acura TL Sedan 2012,Acura,Sedan,00002_Acura TL Sedan 2012.jpg
2,00003.jpg,91,Dodge Dakota Club Cab 2007,Dodge,Cab,00003_Dodge Dakota Club Cab 2007.jpg
3,00004.jpg,134,Hyundai Sonata Hybrid Sedan 2012,Hyundai,Sedan,00004_Hyundai Sonata Hybrid Sedan 2012.jpg
4,00005.jpg,106,Ford F-450 Super Duty Crew Cab 2012,Ford,Cab,00005_Ford F-450 Super Duty Crew Cab 2012.jpg
...,...,...,...,...,...,...
8139,08140.jpg,78,Chrysler Town and Country Minivan 2012,Chrysler,Minivan,08140_Chrysler Town and Country Minivan 2012.jpg
8140,08141.jpg,196,smart fortwo Convertible 2012,smart,Convertible,08141_smart fortwo Convertible 2012.jpg
8141,08142.jpg,163,Mercedes-Benz SL-Class Coupe 2009,Mercedes-Benz,Coupe,08142_Mercedes-Benz SL-Class Coupe 2009.jpg
8142,08143.jpg,112,Ford GT Coupe 2006,Ford,Coupe,08143_Ford GT Coupe 2006.jpg


In [5]:
# drop unnessecary columns 
cars_type_cleaned_cols = cars_type_df.drop(["file_name", "car_code", "brand"], axis="columns")
cars_type_cleaned_cols

Unnamed: 0,car_name,car_type,new_filename
0,Audi TTS Coupe 2012,Coupe,00001_Audi TTS Coupe 2012.jpg
1,Acura TL Sedan 2012,Sedan,00002_Acura TL Sedan 2012.jpg
2,Dodge Dakota Club Cab 2007,Cab,00003_Dodge Dakota Club Cab 2007.jpg
3,Hyundai Sonata Hybrid Sedan 2012,Sedan,00004_Hyundai Sonata Hybrid Sedan 2012.jpg
4,Ford F-450 Super Duty Crew Cab 2012,Cab,00005_Ford F-450 Super Duty Crew Cab 2012.jpg
...,...,...,...
8139,Chrysler Town and Country Minivan 2012,Minivan,08140_Chrysler Town and Country Minivan 2012.jpg
8140,smart fortwo Convertible 2012,Convertible,08141_smart fortwo Convertible 2012.jpg
8141,Mercedes-Benz SL-Class Coupe 2009,Coupe,08142_Mercedes-Benz SL-Class Coupe 2009.jpg
8142,Ford GT Coupe 2006,Coupe,08143_Ford GT Coupe 2006.jpg


In [6]:
# count values of labels 
cars_type_cleaned_cols["car_type"].value_counts()

Sedan          1907
SUV            1437
Coupe          1088
Convertible    1036
Cab             719
Other           609
Hatchback       554
Wagon           291
Van             253
Minivan         250
Name: car_type, dtype: int64

## Overview of the 60k car images data

In [2]:
# define dataframe for data
img_60k_path = "../../data/images_2/"
img_60k_names = listdir(img_60k_path)

car_names = []
car_types = []
file_names = []

for i in img_60k_names: 
    split = i.split("_")
    car_names.append(f"{split[0]} {split[1]} {split[-2]} {split[2]}")
    car_types.append(split[-2])
    file_names.append(i)

origin_struct = {"car_name": car_names,
                 "car_type": car_types, 
                 "file_name": file_names}

img_60k_df_origin = pd.DataFrame(origin_struct)
img_60k_df_origin

Unnamed: 0,car_name,car_type,file_name
0,Acura ILX 4dr 2013,4dr,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...
1,Acura ILX 4dr 2013,4dr,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...
2,Acura ILX 4dr 2013,4dr,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...
3,Acura ILX 4dr 2013,4dr,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...
4,Acura ILX 4dr 2013,4dr,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...
...,...,...,...
64462,Volvo XC90 SUV 2020,SUV,Volvo_XC90_2020_50_19_250_20_4_79_69_194_18_AW...
64463,Volvo XC90 SUV 2020,SUV,Volvo_XC90_2020_50_19_250_20_4_79_69_194_18_AW...
64464,Volvo XC90 SUV 2020,SUV,Volvo_XC90_2020_50_19_250_20_4_79_69_194_18_AW...
64465,Volvo XC90 SUV 2020,SUV,Volvo_XC90_2020_50_19_250_20_4_79_69_194_18_AW...


In [3]:
img_60k_df_origin_path = "../../data/meta/img_60k_origin.csv"
img_60k_df_origin.to_csv(img_60k_df_origin_path)

In [9]:
# count values of labels in the origin dataset 
img_60k_df_origin["car_type"].value_counts()

4dr              21144
SUV              20297
Convertible       7547
2dr               6596
Pickup            6173
Van               1371
nan                733
Station Wagon      395
3dr                211
Name: car_type, dtype: int64

### Because there are many records in [4dr, 2dr, 3dr, Pickup] which are very general we will try to get more information about these records by querying an api 

In [12]:
# obj = list of responaw objects / dicts / json
def are_all_equal(obj: list) -> bool: 
    condition = obj[0]["model_body"]
    for i in obj:
        if i["model_body"] != condition: 
            return False
    return True

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}

#storing new car_type in list 
new_body = []

#storing unclear responses in list
unclear_response = []

# if there are any errors while calling api, log them in list - see except block
excep_log = []

totals = len(img_60k_names)
done = 0

for name in img_60k_names: 
    split = name.split("_")

    # define the api call payload
    payload = {
        "make": split[0],
        "model": split[1],
        "year": split[2]
    }

    # just calling api for the mentioned labels
    request_conditions = ["4dr", "2dr", "3dr", "Pickup"]

    if split[-2] in request_conditions:
        try: 
            # calling the api with the created payload and http-header
            response = requests.get("https://www.carqueryapi.com/api/0.3/?callback=?&cmd=getTrims&", params=payload, headers=headers)
            res_text = response.text 
            # convert the json into a python dict
            converted_text = "".join(res_text.split('(', 1)[1].rsplit(')', 1)[0])
            response_dict = json.loads(converted_text)

            # check if the given label of the api can be clearly identfied- 
            # if it's not, safe it declare it as unclear response 
            if not are_all_equal(response_dict["Trims"]):
                unclear_response.append(name)
            else: 
                # save the response to its list. 
                # if the response is not empty 
                if len(response_dict["Trims"]) > 0:
                    new_body.append((name, response_dict["Trims"][0]["model_body"]))
                    done += 1
                    print(f"{done} of {totals} are successfully requested", end="\r")

                else:
                    unclear_response.append((name, "empty"))

        # if any critical error occures, log it to the log list 
        except Exception as e: 
                unclear_response.append((name, "traceback"))
                excep_log.append({"error": {"file": name, 
                                            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                            "exception": e, 
                                            "response": response_dict, 
                                            "response_length": len(response_dict), 
                                            "traceback": traceback.format_exc()
                                            }})
                continue

21312 of 64467 are successfully requested

## define and create dataframe from succesfully requested data 


In [23]:
names = [i[0] for i in new_body]
body = [i[1] for i in new_body]

struct = {"file_name": names, 
          "car_type": body}

df = pd.DataFrame(struct)
df


Unnamed: 0,file_name,car_type
0,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...,Sedan
1,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...,Sedan
2,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...,Sedan
3,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...,Sedan
4,Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD...,Sedan
...,...,...
21307,Volvo_V60_2020_45_18_250_20_4_72_56_187_23_FWD...,Small Station Wagons
21308,Volvo_V60_2020_45_18_250_20_4_72_56_187_23_FWD...,Small Station Wagons
21309,Volvo_V60_2020_45_18_250_20_4_72_56_187_23_FWD...,Small Station Wagons
21310,Volvo_V60_2020_45_18_250_20_4_72_56_187_23_FWD...,Small Station Wagons


##  print the counts of new labels 


In [22]:
df["car_type"].value_counts()

Midsize Cars                    4651
Sedan                           3183
Compact Cars                    2625
Pickup                          1651
Standard Pickup Trucks          1642
Large Cars                      1446
Subcompact Cars                 1436
Hatchback                       1346
Small Pickup Trucks              805
Two Seaters                      676
Coupe                            574
Small Station Wagons             536
Crossover                        413
Mini Compact Cars                200
SUV                               69
Small Sport Utility Vehicles      43
Not Available                     16
Name: car_type, dtype: int64

In [24]:
# save the new dataframe 
saving_path = "../../data/meta/new_labels.csv"
df.to_csv(saving_path, index=False)

## overview of api call failures 


In [26]:
len(unclear_response)
print(unclear_response)

[('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_aWh.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_Axe.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_CEp.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_FKp.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_kgn.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_kls.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_RbN.jpg', 'traceback'), ('Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_wnI.jpg', 'traceback'), ('Alfa Romeo_Giulia_2017_39_17_280_20_4_73_56_182_23_AWD_5_4_4dr_AjN.jpg', 'traceback'), ('Alfa Romeo_Giulia_2017_39_17_280_20_4_73_56_182_23_AWD_5_4_4dr_BYW.jpg', 'traceback'), ('Alfa Romeo_Giulia_2017_39_17_280_20_4_73_56_182_23_AWD_5_4_4dr_Bzl.jpg', 'traceback'), ('Alfa Romeo_Giulia_2017_39_17_280_20_4_73_56_182_23

In [27]:
excep_log
# according to the logs, all tracebacks had an empty response. Probably not in api available

[{'error': {'file': 'Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_aWh.jpg',
   'timestamp': '2023-05-28 11:10:07',
   'exception': IndexError('list index out of range'),
   'response': {'Trims': []},
   'response_length': 1,
   'traceback': 'Traceback (most recent call last):\n  File "C:\\Users\\Chris\\AppData\\Local\\Temp\\ipykernel_8668\\1707946236.py", line 41, in <module>\n    if not are_all_equal(response_dict["Trims"]):\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "C:\\Users\\Chris\\AppData\\Local\\Temp\\ipykernel_8668\\1707946236.py", line 3, in are_all_equal\n    condition = obj[0]["model_body"]\n                ~~~^^^\nIndexError: list index out of range\n'}},
 {'error': {'file': 'Alfa Romeo_4C_2017_55_17_230_17_4_73_46_157_24_RWD_2_2_2dr_Axe.jpg',
   'timestamp': '2023-05-28 11:10:07',
   'exception': IndexError('list index out of range'),
   'response': {'Trims': []},
   'response_length': 1,
   'traceback': 'Traceback (most recent call last):\n  F

## save failures to txt log file 


In [31]:
path_fails = "./failures.txt"
with open(path_fails, 'w') as fp:
    for item in unclear_response:
        item_str = " ".join(item)
        fp.write(item_str + "\n")
    print('Done')


Done
