# Labeling of data
using the Stanford_cars_type dataset & 60k images data set

## Imports

In [1]:
from os import listdir
import traceback
import pandas as pd 
import requests
import json
from datetime import datetime
from scipy.io import loadmat
import pickle

## Import and overview of stanford_cars_type data

### Train data 

In [2]:

cars_type_path = "../../data/meta/stanford_cars_type.csv"
cars_type_df = pd.read_csv(cars_type_path, index_col=0)
cars_type_df

Unnamed: 0,file_name,car_code,car_name,brand,car_type,new_filename
0,00001.jpg,14,Audi TTS Coupe 2012,Audi,Coupe,00001_Audi TTS Coupe 2012.jpg
1,00002.jpg,3,Acura TL Sedan 2012,Acura,Sedan,00002_Acura TL Sedan 2012.jpg
2,00003.jpg,91,Dodge Dakota Club Cab 2007,Dodge,Cab,00003_Dodge Dakota Club Cab 2007.jpg
3,00004.jpg,134,Hyundai Sonata Hybrid Sedan 2012,Hyundai,Sedan,00004_Hyundai Sonata Hybrid Sedan 2012.jpg
4,00005.jpg,106,Ford F-450 Super Duty Crew Cab 2012,Ford,Cab,00005_Ford F-450 Super Duty Crew Cab 2012.jpg
...,...,...,...,...,...,...
8139,08140.jpg,78,Chrysler Town and Country Minivan 2012,Chrysler,Minivan,08140_Chrysler Town and Country Minivan 2012.jpg
8140,08141.jpg,196,smart fortwo Convertible 2012,smart,Convertible,08141_smart fortwo Convertible 2012.jpg
8141,08142.jpg,163,Mercedes-Benz SL-Class Coupe 2009,Mercedes-Benz,Coupe,08142_Mercedes-Benz SL-Class Coupe 2009.jpg
8142,08143.jpg,112,Ford GT Coupe 2006,Ford,Coupe,08143_Ford GT Coupe 2006.jpg


In [3]:
# drop unnessecary columns 
cars_type_cleaned_cols = cars_type_df.drop(["file_name", "car_code", "brand"], axis="columns")
cars_type_cleaned_cols

Unnamed: 0,car_name,car_type,new_filename
0,Audi TTS Coupe 2012,Coupe,00001_Audi TTS Coupe 2012.jpg
1,Acura TL Sedan 2012,Sedan,00002_Acura TL Sedan 2012.jpg
2,Dodge Dakota Club Cab 2007,Cab,00003_Dodge Dakota Club Cab 2007.jpg
3,Hyundai Sonata Hybrid Sedan 2012,Sedan,00004_Hyundai Sonata Hybrid Sedan 2012.jpg
4,Ford F-450 Super Duty Crew Cab 2012,Cab,00005_Ford F-450 Super Duty Crew Cab 2012.jpg
...,...,...,...
8139,Chrysler Town and Country Minivan 2012,Minivan,08140_Chrysler Town and Country Minivan 2012.jpg
8140,smart fortwo Convertible 2012,Convertible,08141_smart fortwo Convertible 2012.jpg
8141,Mercedes-Benz SL-Class Coupe 2009,Coupe,08142_Mercedes-Benz SL-Class Coupe 2009.jpg
8142,Ford GT Coupe 2006,Coupe,08143_Ford GT Coupe 2006.jpg


In [4]:
# count values of labels 
cars_type_cleaned_cols["car_type"].value_counts()

Sedan          1907
SUV            1437
Coupe          1088
Convertible    1036
Cab             719
Other           609
Hatchback       554
Wagon           291
Van             253
Minivan         250
Name: car_type, dtype: int64

### test data

we'll extracting the test data of the original stanford dataset

In [5]:
stanford_annos_path = "../../data/meta/cars_annos.mat"
raw_annos = loadmat(stanford_annos_path)
raw_annos

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Feb 28 19:34:55 2015',
 '__version__': '1.0',
 '__globals__': [],
 'annotations': array([[(array(['car_ims/000001.jpg'], dtype='<U18'), array([[112]], dtype=uint8), array([[7]], dtype=uint8), array([[853]], dtype=uint16), array([[717]], dtype=uint16), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
         (array(['car_ims/000002.jpg'], dtype='<U18'), array([[48]], dtype=uint8), array([[24]], dtype=uint8), array([[441]], dtype=uint16), array([[202]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
         (array(['car_ims/000003.jpg'], dtype='<U18'), array([[7]], dtype=uint8), array([[4]], dtype=uint8), array([[277]], dtype=uint16), array([[180]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
         ...,
         (array(['car_ims/016183.jpg'], dtype='<U18'), array([[25]], dtype=uint8), array([[32]], dtype=uint8), array([[587]], dtype=uint16), array([[359]], dt

In [6]:
# just store the annotations key
annos = raw_annos["annotations"]
annos 


array([[(array(['car_ims/000001.jpg'], dtype='<U18'), array([[112]], dtype=uint8), array([[7]], dtype=uint8), array([[853]], dtype=uint16), array([[717]], dtype=uint16), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        (array(['car_ims/000002.jpg'], dtype='<U18'), array([[48]], dtype=uint8), array([[24]], dtype=uint8), array([[441]], dtype=uint16), array([[202]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        (array(['car_ims/000003.jpg'], dtype='<U18'), array([[7]], dtype=uint8), array([[4]], dtype=uint8), array([[277]], dtype=uint16), array([[180]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        ...,
        (array(['car_ims/016183.jpg'], dtype='<U18'), array([[25]], dtype=uint8), array([[32]], dtype=uint8), array([[587]], dtype=uint16), array([[359]], dtype=uint16), array([[196]], dtype=uint8), array([[1]], dtype=uint8)),
        (array(['car_ims/016184.jpg'], dtype='<U18'), array([[56]], dtype=uint8), array([

#### because the structure of the data we can simply use list comprehension to extraxt the dats 

In [7]:
names = [i[0][0] for i in annos[0]] # relative_in_path
classes = [i[5][0][0] for i in annos[0]]
is_test =  [i[6][0][0] for i in annos[0]]

df_struct = {"name": names, 
             "class_id": classes, 
             "is_test": is_test}

whole_stanford_df = pd.DataFrame(df_struct)
whole_stanford_df

Unnamed: 0,name,class_id,is_test
0,car_ims/000001.jpg,1,0
1,car_ims/000002.jpg,1,0
2,car_ims/000003.jpg,1,0
3,car_ims/000004.jpg,1,0
4,car_ims/000005.jpg,1,0
...,...,...,...
16180,car_ims/016181.jpg,196,1
16181,car_ims/016182.jpg,196,1
16182,car_ims/016183.jpg,196,1
16183,car_ims/016184.jpg,196,1


#### filtering by test data


In [8]:
stanford_test_df = whole_stanford_df[whole_stanford_df["is_test"] == 1]
stanford_test_df

Unnamed: 0,name,class_id,is_test
45,car_ims/000046.jpg,1,1
46,car_ims/000047.jpg,1,1
47,car_ims/000048.jpg,1,1
48,car_ims/000049.jpg,1,1
49,car_ims/000050.jpg,1,1
...,...,...,...
16180,car_ims/016181.jpg,196,1
16181,car_ims/016182.jpg,196,1
16182,car_ims/016183.jpg,196,1
16183,car_ims/016184.jpg,196,1


#### load the car names for each class - meta data of stanford


In [9]:
test = "../../data/meta/cars_meta.mat"
stanford_meta_struct = loadmat(test)
stanford_meta_struct

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Dec 14 14:13:07 2013',
 '__version__': '1.0',
 '__globals__': [],
 'class_names': array([[array(['AM General Hummer SUV 2000'], dtype='<U26'),
         array(['Acura RL Sedan 2012'], dtype='<U19'),
         array(['Acura TL Sedan 2012'], dtype='<U19'),
         array(['Acura TL Type-S 2008'], dtype='<U20'),
         array(['Acura TSX Sedan 2012'], dtype='<U20'),
         array(['Acura Integra Type R 2001'], dtype='<U25'),
         array(['Acura ZDX Hatchback 2012'], dtype='<U24'),
         array(['Aston Martin V8 Vantage Convertible 2012'], dtype='<U40'),
         array(['Aston Martin V8 Vantage Coupe 2012'], dtype='<U34'),
         array(['Aston Martin Virage Convertible 2012'], dtype='<U36'),
         array(['Aston Martin Virage Coupe 2012'], dtype='<U30'),
         array(['Audi RS 4 Convertible 2008'], dtype='<U26'),
         array(['Audi A5 Coupe 2012'], dtype='<U18'),
         array(['Audi TTS Coupe 2012'], d

#### extract class names from meta


In [10]:
class_names = stanford_meta_struct["class_names"]
classes = [i[0] for i in class_names[0]]
classes

['AM General Hummer SUV 2000',
 'Acura RL Sedan 2012',
 'Acura TL Sedan 2012',
 'Acura TL Type-S 2008',
 'Acura TSX Sedan 2012',
 'Acura Integra Type R 2001',
 'Acura ZDX Hatchback 2012',
 'Aston Martin V8 Vantage Convertible 2012',
 'Aston Martin V8 Vantage Coupe 2012',
 'Aston Martin Virage Convertible 2012',
 'Aston Martin Virage Coupe 2012',
 'Audi RS 4 Convertible 2008',
 'Audi A5 Coupe 2012',
 'Audi TTS Coupe 2012',
 'Audi R8 Coupe 2012',
 'Audi V8 Sedan 1994',
 'Audi 100 Sedan 1994',
 'Audi 100 Wagon 1994',
 'Audi TT Hatchback 2011',
 'Audi S6 Sedan 2011',
 'Audi S5 Convertible 2012',
 'Audi S5 Coupe 2012',
 'Audi S4 Sedan 2012',
 'Audi S4 Sedan 2007',
 'Audi TT RS Coupe 2012',
 'BMW ActiveHybrid 5 Sedan 2012',
 'BMW 1 Series Convertible 2012',
 'BMW 1 Series Coupe 2012',
 'BMW 3 Series Sedan 2012',
 'BMW 3 Series Wagon 2012',
 'BMW 6 Series Convertible 2007',
 'BMW X5 SUV 2007',
 'BMW X6 SUV 2012',
 'BMW M3 Coupe 2012',
 'BMW M5 Sedan 2010',
 'BMW M6 Convertible 2010',
 'BMW X3

#### create meta dataframe for class names 

In [11]:
stanford_meta_df_struct = {"car_name": classes}
stanford_meta_df = pd.DataFrame(stanford_meta_df_struct)
stanford_meta_df

Unnamed: 0,car_name
0,AM General Hummer SUV 2000
1,Acura RL Sedan 2012
2,Acura TL Sedan 2012
3,Acura TL Type-S 2008
4,Acura TSX Sedan 2012
...,...
191,Volkswagen Beetle Hatchback 2012
192,Volvo C30 Hatchback 2012
193,Volvo 240 Sedan 1993
194,Volvo XC90 SUV 2007


#### restructre meta_df for mapping


In [12]:
stanford_meta_df.index = range(1, 197)
stanford_meta_df

Unnamed: 0,car_name
1,AM General Hummer SUV 2000
2,Acura RL Sedan 2012
3,Acura TL Sedan 2012
4,Acura TL Type-S 2008
5,Acura TSX Sedan 2012
...,...
192,Volkswagen Beetle Hatchback 2012
193,Volvo C30 Hatchback 2012
194,Volvo 240 Sedan 1993
195,Volvo XC90 SUV 2007


#### map test data with meta to get banes

In [13]:
stanford_test_df["car_name"] = stanford_test_df["class_id"].map(stanford_meta_df["car_name"])
stanford_test_df.drop("class_id", axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stanford_test_df["car_name"] = stanford_test_df["class_id"].map(stanford_meta_df["car_name"])


Unnamed: 0,name,is_test,car_name
45,car_ims/000046.jpg,1,AM General Hummer SUV 2000
46,car_ims/000047.jpg,1,AM General Hummer SUV 2000
47,car_ims/000048.jpg,1,AM General Hummer SUV 2000
48,car_ims/000049.jpg,1,AM General Hummer SUV 2000
49,car_ims/000050.jpg,1,AM General Hummer SUV 2000
...,...,...,...
16180,car_ims/016181.jpg,1,smart fortwo Convertible 2012
16181,car_ims/016182.jpg,1,smart fortwo Convertible 2012
16182,car_ims/016183.jpg,1,smart fortwo Convertible 2012
16183,car_ims/016184.jpg,1,smart fortwo Convertible 2012


#### Using api to get labels

In [14]:
# store the class names 
# obj = list of responaw objects / dicts / json
def are_all_equal(obj: list) -> bool: 
    condition = obj[0]["model_body"]
    for i in obj:
        if i["model_body"] != condition: 
            return False
    return True

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}

#storing new car_type in list 
new_body = []

#storing unclear responses in list
unclear_response = []

# if there are any errors while calling api, log them in list - see except block
excep_log = []

totals = len(classes)
done = 0

for name in classes: 
    split = name.split(" ")

    # define the api call payload
    payload = {
        "make": split[0],
        "model": split[1],
        "year": split[-1]
    }

    # just calling api for the mentioned labels
    # request_conditions = ["4dr", "2dr", "3dr", "Pickup"]
    try: 
        # calling the api with the created payload and http-header
        response = requests.get("https://www.carqueryapi.com/api/0.3/?callback=?&cmd=getTrims&", params=payload, headers=headers)
        res_text = response.text 
        # convert the json into a python dict
        converted_text = "".join(res_text.split('(', 1)[1].rsplit(')', 1)[0])
        response_dict = json.loads(converted_text)

        # check if the given label of the api can be clearly identfied- 
        # if it's not, safe it declare it as unclear response 
        if not are_all_equal(response_dict["Trims"]):
            unclear_response.append(name)
        else: 
            # save the response to its list. 
            # if the response is not empty 
            if len(response_dict["Trims"]) > 0:
                new_body.append((name, response_dict["Trims"][0]["model_body"]))
                done += 1
                print(f"{done} of {totals} are successfully requested", end="\r")

            else:
                unclear_response.append((name, "empty"))

    # if any critical error occures, log it to the log list 
    except Exception as e: 
            unclear_response.append((name, "traceback"))
            excep_log.append({"error": {"file": name, 
                                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                        "exception": e, 
                                        "response": response_dict, 
                                        "response_length": len(response_dict), 
                                        "traceback": traceback.format_exc()
                                        }})
            continue

118 of 196 are successfully requested

In [15]:
new_body

[('Acura RL Sedan 2012', 'Sedan'),
 ('Acura TL Sedan 2012', 'Sedan'),
 ('Acura TL Type-S 2008', 'Sedan'),
 ('Acura ZDX Hatchback 2012', 'Crossover'),
 ('Audi V8 Sedan 1994', 'Sedan'),
 ('Audi 100 Sedan 1994', None),
 ('Audi 100 Wagon 1994', None),
 ('Audi S6 Sedan 2011', 'Sedan'),
 ('BMW X5 SUV 2007', 'SUV'),
 ('BMW X6 SUV 2012', 'SUV'),
 ('BMW M5 Sedan 2010', 'Sedan'),
 ('BMW M6 Convertible 2010', 'Coupe'),
 ('BMW X3 SUV 2012', 'SUV'),
 ('BMW Z4 Convertible 2012', 'Convertible'),
 ('Bentley Arnage Sedan 2009', 'Sedan'),
 ('Bentley Mulsanne Sedan 2011', 'Sedan'),
 ('Bentley Continental GT Coupe 2007', 'Coupe'),
 ('Bentley Continental Flying Spur Sedan 2007', 'Coupe'),
 ('Bugatti Veyron 16.4 Convertible 2009', 'Convertible'),
 ('Bugatti Veyron 16.4 Coupe 2009', 'Convertible'),
 ('Buick Regal GS 2012', 'Sedan'),
 ('Buick Rainier SUV 2007', 'SUV'),
 ('Buick Verano Sedan 2012', 'Sedan'),
 ('Buick Enclave SUV 2012', 'SUV'),
 ('Cadillac SRX SUV 2012', 'SUV'),
 ('Cadillac Escalade EXT Crew Ca

#### overview of failures 

In [16]:
len(excep_log)

42

In [17]:
excep_log

[{'error': {'file': 'AM General Hummer SUV 2000',
   'timestamp': '2023-06-01 18:44:52',
   'exception': IndexError('list index out of range'),
   'response': {'Trims': []},
   'response_length': 1,
   'traceback': 'Traceback (most recent call last):\n  File "C:\\Users\\Chris\\AppData\\Local\\Temp\\ipykernel_19868\\122236227.py", line 46, in <module>\n    if not are_all_equal(response_dict["Trims"]):\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "C:\\Users\\Chris\\AppData\\Local\\Temp\\ipykernel_19868\\122236227.py", line 4, in are_all_equal\n    condition = obj[0]["model_body"]\n                ~~~^^^\nIndexError: list index out of range\n'}},
 {'error': {'file': 'Aston Martin V8 Vantage Convertible 2012',
   'timestamp': '2023-06-01 18:44:55',
   'exception': IndexError('list index out of range'),
   'response': {'Trims': []},
   'response_length': 1,
   'traceback': 'Traceback (most recent call last):\n  File "C:\\Users\\Chris\\AppData\\Local\\Temp\\ipykernel_19868\\1222

In [18]:
# list car names of failures 
test_failure_names = []
for err in excep_log: 
    item = err["error"]
    test_failure_names.append((item["file"], ""))

test_failure_names
    

[('AM General Hummer SUV 2000', ''),
 ('Aston Martin V8 Vantage Convertible 2012', ''),
 ('Aston Martin V8 Vantage Coupe 2012', ''),
 ('Aston Martin Virage Convertible 2012', ''),
 ('Aston Martin Virage Coupe 2012', ''),
 ('Audi RS 4 Convertible 2008', ''),
 ('Audi TTS Coupe 2012', ''),
 ('BMW ActiveHybrid 5 Sedan 2012', ''),
 ('BMW 1 Series Convertible 2012', ''),
 ('BMW 1 Series Coupe 2012', ''),
 ('BMW 3 Series Sedan 2012', ''),
 ('BMW 3 Series Wagon 2012', ''),
 ('BMW 6 Series Convertible 2007', ''),
 ('Bentley Continental Supersports Conv. Convertible 2012', ''),
 ('Bentley Continental GT Coupe 2012', ''),
 ('Chevrolet Monte Carlo Coupe 2007', ''),
 ('Chrysler Town and Country Minivan 2012', ''),
 ('Chrysler PT Cruiser Convertible 2008', ''),
 ('Ferrari 458 Italia Convertible 2012', ''),
 ('Ferrari 458 Italia Coupe 2012', ''),
 ('Ford E-Series Wagon Van 2012', ''),
 ('Geo Metro Convertible 1993', ''),
 ('HUMMER H3T Crew Cab 2010', ''),
 ('Hyundai Santa Fe SUV 2012', ''),
 ('Infini

#### extract body type of failures

In [19]:
failure_test_bodies = []
for name in test_failure_names: 
    split = name[0].split(" ")
    body = split[-2]
    failure_test_bodies.append((name[0], body))

failure_test_bodies

[('AM General Hummer SUV 2000', 'SUV'),
 ('Aston Martin V8 Vantage Convertible 2012', 'Convertible'),
 ('Aston Martin V8 Vantage Coupe 2012', 'Coupe'),
 ('Aston Martin Virage Convertible 2012', 'Convertible'),
 ('Aston Martin Virage Coupe 2012', 'Coupe'),
 ('Audi RS 4 Convertible 2008', 'Convertible'),
 ('Audi TTS Coupe 2012', 'Coupe'),
 ('BMW ActiveHybrid 5 Sedan 2012', 'Sedan'),
 ('BMW 1 Series Convertible 2012', 'Convertible'),
 ('BMW 1 Series Coupe 2012', 'Coupe'),
 ('BMW 3 Series Sedan 2012', 'Sedan'),
 ('BMW 3 Series Wagon 2012', 'Wagon'),
 ('BMW 6 Series Convertible 2007', 'Convertible'),
 ('Bentley Continental Supersports Conv. Convertible 2012', 'Convertible'),
 ('Bentley Continental GT Coupe 2012', 'Coupe'),
 ('Chevrolet Monte Carlo Coupe 2007', 'Coupe'),
 ('Chrysler Town and Country Minivan 2012', 'Minivan'),
 ('Chrysler PT Cruiser Convertible 2008', 'Convertible'),
 ('Ferrari 458 Italia Convertible 2012', 'Convertible'),
 ('Ferrari 458 Italia Coupe 2012', 'Coupe'),
 ('Ford 

In [20]:
class_labels = failure_test_bodies + new_body
class_labels

[('AM General Hummer SUV 2000', 'SUV'),
 ('Aston Martin V8 Vantage Convertible 2012', 'Convertible'),
 ('Aston Martin V8 Vantage Coupe 2012', 'Coupe'),
 ('Aston Martin Virage Convertible 2012', 'Convertible'),
 ('Aston Martin Virage Coupe 2012', 'Coupe'),
 ('Audi RS 4 Convertible 2008', 'Convertible'),
 ('Audi TTS Coupe 2012', 'Coupe'),
 ('BMW ActiveHybrid 5 Sedan 2012', 'Sedan'),
 ('BMW 1 Series Convertible 2012', 'Convertible'),
 ('BMW 1 Series Coupe 2012', 'Coupe'),
 ('BMW 3 Series Sedan 2012', 'Sedan'),
 ('BMW 3 Series Wagon 2012', 'Wagon'),
 ('BMW 6 Series Convertible 2007', 'Convertible'),
 ('Bentley Continental Supersports Conv. Convertible 2012', 'Convertible'),
 ('Bentley Continental GT Coupe 2012', 'Coupe'),
 ('Chevrolet Monte Carlo Coupe 2007', 'Coupe'),
 ('Chrysler Town and Country Minivan 2012', 'Minivan'),
 ('Chrysler PT Cruiser Convertible 2008', 'Convertible'),
 ('Ferrari 458 Italia Convertible 2012', 'Convertible'),
 ('Ferrari 458 Italia Coupe 2012', 'Coupe'),
 ('Ford 

In [21]:
# unclear responses without traceback error
filtered_unclear_response = []
not_availables = []
na_unclears = []
condition = cars_type_cleaned_cols["car_type"].unique()
for name in unclear_response: 
    if type(name) is tuple: 
        continue
    else: 
        split = name.split(" ")
        body = split[-2]
        if body not in condition: 
            na_unclears.append(name)
        else: 
            not_availables.append((name, body))

not_availables

[('Acura TSX Sedan 2012', 'Sedan'),
 ('Audi A5 Coupe 2012', 'Coupe'),
 ('Audi R8 Coupe 2012', 'Coupe'),
 ('Audi TT Hatchback 2011', 'Hatchback'),
 ('Audi S5 Convertible 2012', 'Convertible'),
 ('Audi S5 Coupe 2012', 'Coupe'),
 ('Audi S4 Sedan 2012', 'Sedan'),
 ('Audi S4 Sedan 2007', 'Sedan'),
 ('Audi TT RS Coupe 2012', 'Coupe'),
 ('BMW M3 Coupe 2012', 'Coupe'),
 ('Cadillac CTS-V Sedan 2012', 'Sedan'),
 ('Chevrolet Corvette Convertible 2012', 'Convertible'),
 ('Chevrolet Camaro Convertible 2012', 'Convertible'),
 ('Chevrolet Sonic Sedan 2012', 'Sedan'),
 ('Chrysler Sebring Convertible 2010', 'Convertible'),
 ('Chrysler Crossfire Convertible 2008', 'Convertible'),
 ('FIAT 500 Convertible 2012', 'Convertible'),
 ('Ford Mustang Convertible 2007', 'Convertible'),
 ('Ford Focus Sedan 2007', 'Sedan'),
 ('Ford Fiesta Sedan 2012', 'Sedan'),
 ('Honda Accord Coupe 2012', 'Coupe'),
 ('Honda Accord Sedan 2012', 'Sedan'),
 ('Hyundai Accent Sedan 2012', 'Sedan'),
 ('Lamborghini Diablo Coupe 2001', 'C

In [22]:
class_labels.extend(not_availables)
len(class_labels)

190

In [23]:
# overview of unclear not availables 
na_unclears

['Acura Integra Type R 2001',
 'Chevrolet Corvette ZR1 2012',
 'Chevrolet Corvette Ron Fellows Edition Z06 2007',
 'Chevrolet Cobalt SS 2010',
 'Chrysler 300 SRT-8 2010',
 'FIAT 500 Abarth 2012']

In [24]:
# labeling via internet research 
unclears_labeled = [("FIAT 500 Abarth 2012", "Compact car"),
                   ("Acura Integra Type R 2001", "Compact car"), 
                   ("Chevrolet Corvette ZR1 2012", "Coupe"),
                   ("Chevrolet Corvette Ron Fellows Edition Z06 2007", "Coupe"), 
                   ("Chevrolet Cobalt SS 2010", "Coupe"), 
                   ("Chrysler 300 SRT-8 2010", "Sedan")]

class_labels.extend(unclears_labeled)

In [25]:
len(class_labels) # all classes are labeled now

196

In [26]:
# convert the class labels to dictornary for better mapping 
class_labels_dict = {key: value for key, value in class_labels}
class_labels_dict


{'AM General Hummer SUV 2000': 'SUV',
 'Aston Martin V8 Vantage Convertible 2012': 'Convertible',
 'Aston Martin V8 Vantage Coupe 2012': 'Coupe',
 'Aston Martin Virage Convertible 2012': 'Convertible',
 'Aston Martin Virage Coupe 2012': 'Coupe',
 'Audi RS 4 Convertible 2008': 'Convertible',
 'Audi TTS Coupe 2012': 'Coupe',
 'BMW ActiveHybrid 5 Sedan 2012': 'Sedan',
 'BMW 1 Series Convertible 2012': 'Convertible',
 'BMW 1 Series Coupe 2012': 'Coupe',
 'BMW 3 Series Sedan 2012': 'Sedan',
 'BMW 3 Series Wagon 2012': 'Wagon',
 'BMW 6 Series Convertible 2007': 'Convertible',
 'Bentley Continental Supersports Conv. Convertible 2012': 'Convertible',
 'Bentley Continental GT Coupe 2012': 'Coupe',
 'Chevrolet Monte Carlo Coupe 2007': 'Coupe',
 'Chrysler Town and Country Minivan 2012': 'Minivan',
 'Chrysler PT Cruiser Convertible 2008': 'Convertible',
 'Ferrari 458 Italia Convertible 2012': 'Convertible',
 'Ferrari 458 Italia Coupe 2012': 'Coupe',
 'Ford E-Series Wagon Van 2012': 'Van',
 'Geo Me

In [27]:
stanford_test_df.reset_index(drop=True, inplace=True)
stanford_test_df.drop("class_id", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stanford_test_df.drop("class_id", axis=1, inplace=True)


In [28]:
stanford_test_df

Unnamed: 0,name,is_test,car_name
0,car_ims/000046.jpg,1,AM General Hummer SUV 2000
1,car_ims/000047.jpg,1,AM General Hummer SUV 2000
2,car_ims/000048.jpg,1,AM General Hummer SUV 2000
3,car_ims/000049.jpg,1,AM General Hummer SUV 2000
4,car_ims/000050.jpg,1,AM General Hummer SUV 2000
...,...,...,...
8036,car_ims/016181.jpg,1,smart fortwo Convertible 2012
8037,car_ims/016182.jpg,1,smart fortwo Convertible 2012
8038,car_ims/016183.jpg,1,smart fortwo Convertible 2012
8039,car_ims/016184.jpg,1,smart fortwo Convertible 2012


#### mapping of the labels to test df

In [29]:
stanford_test_df["car_type"] = "not set"
stanford_test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stanford_test_df["car_type"] = "not set"


Unnamed: 0,name,is_test,car_name,car_type
0,car_ims/000046.jpg,1,AM General Hummer SUV 2000,not set
1,car_ims/000047.jpg,1,AM General Hummer SUV 2000,not set
2,car_ims/000048.jpg,1,AM General Hummer SUV 2000,not set
3,car_ims/000049.jpg,1,AM General Hummer SUV 2000,not set
4,car_ims/000050.jpg,1,AM General Hummer SUV 2000,not set
...,...,...,...,...
8036,car_ims/016181.jpg,1,smart fortwo Convertible 2012,not set
8037,car_ims/016182.jpg,1,smart fortwo Convertible 2012,not set
8038,car_ims/016183.jpg,1,smart fortwo Convertible 2012,not set
8039,car_ims/016184.jpg,1,smart fortwo Convertible 2012,not set


In [30]:
for index, data in stanford_test_df.iterrows():
    stanford_test_df.iloc[index, 3] = class_labels_dict[data["car_name"]]
    
stanford_test_df = stanford_test_df[["name", "car_name", "car_type", "is_test"]]
stanford_test_df

Unnamed: 0,name,car_name,car_type,is_test
0,car_ims/000046.jpg,AM General Hummer SUV 2000,SUV,1
1,car_ims/000047.jpg,AM General Hummer SUV 2000,SUV,1
2,car_ims/000048.jpg,AM General Hummer SUV 2000,SUV,1
3,car_ims/000049.jpg,AM General Hummer SUV 2000,SUV,1
4,car_ims/000050.jpg,AM General Hummer SUV 2000,SUV,1
...,...,...,...,...
8036,car_ims/016181.jpg,smart fortwo Convertible 2012,Convertible,1
8037,car_ims/016182.jpg,smart fortwo Convertible 2012,Convertible,1
8038,car_ims/016183.jpg,smart fortwo Convertible 2012,Convertible,1
8039,car_ims/016184.jpg,smart fortwo Convertible 2012,Convertible,1


#### prepare and label train dataset

In [31]:
cars_type_cleaned_cols["is_test"] = 0
cars_type_cleaned_cols.rename(columns={"new_filename": "name"}, inplace=True)
cars_type_cleaned_cols = cars_type_cleaned_cols[["name", "car_name", "car_type", "is_test"]]
cars_type_cleaned_cols

Unnamed: 0,name,car_name,car_type,is_test
0,00001_Audi TTS Coupe 2012.jpg,Audi TTS Coupe 2012,Coupe,0
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Cab,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Cab,0
...,...,...,...,...
8139,08140_Chrysler Town and Country Minivan 2012.jpg,Chrysler Town and Country Minivan 2012,Minivan,0
8140,08141_smart fortwo Convertible 2012.jpg,smart fortwo Convertible 2012,Convertible,0
8141,08142_Mercedes-Benz SL-Class Coupe 2009.jpg,Mercedes-Benz SL-Class Coupe 2009,Coupe,0
8142,08143_Ford GT Coupe 2006.jpg,Ford GT Coupe 2006,Coupe,0


In [32]:
for index, data in cars_type_cleaned_cols.iterrows():
    cars_type_cleaned_cols.iloc[index, 2] = class_labels_dict[data["car_name"]]
cars_type_cleaned_cols.isna().sum()


name          0
car_name      0
car_type    161
is_test       0
dtype: int64

In [33]:
nones = cars_type_cleaned_cols[cars_type_cleaned_cols["car_type"].isna()]
noes = nones["car_name"].unique()
noes

array(['Audi 100 Sedan 1994', 'Audi 100 Wagon 1994',
       'Nissan NV Passenger Van 2012', 'Suzuki Aerio Sedan 2007'],
      dtype=object)

In [34]:

for index, data in cars_type_cleaned_cols.iterrows(): 
    if data["car_name"] in noes: 
        none_body = data["car_name"].split(" ")[-2]
        cars_type_cleaned_cols.iloc[index, 2] = none_body
cars_type_cleaned_cols.isna().sum()


name        0
car_name    0
car_type    0
is_test     0
dtype: int64

In [35]:
cars_type_cleaned_cols["car_type"].unique()

array(['Coupe', 'Sedan', 'Pickup', 'Convertible', 'Crossover', 'SUV',
       'Minivan', 'Hatchback', 'Van', 'Cab', 'Wagon', 'Compact car',
       'IPL', 'Station Wagon'], dtype=object)

In [36]:
cars_type_cleaned_cols[cars_type_cleaned_cols["car_type"] == "IPL"]["car_name"].unique()

array(['Infiniti G Coupe IPL 2012'], dtype=object)

In [37]:
for index, data in cars_type_cleaned_cols.iterrows(): 
    if data["car_name"] == "Infiniti G Coupe IPL 2012" : 
        none_body = data["car_name"].split(" ")[-2]
        cars_type_cleaned_cols.iloc[index, 2] = "Coupe"
cars_type_cleaned_cols["car_type"].unique()

array(['Coupe', 'Sedan', 'Pickup', 'Convertible', 'Crossover', 'SUV',
       'Minivan', 'Hatchback', 'Van', 'Cab', 'Wagon', 'Compact car',
       'Station Wagon'], dtype=object)

In [38]:
cars_type_cleaned_cols

Unnamed: 0,name,car_name,car_type,is_test
0,00001_Audi TTS Coupe 2012.jpg,Audi TTS Coupe 2012,Coupe,0
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Pickup,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Pickup,0
...,...,...,...,...
8139,08140_Chrysler Town and Country Minivan 2012.jpg,Chrysler Town and Country Minivan 2012,Minivan,0
8140,08141_smart fortwo Convertible 2012.jpg,smart fortwo Convertible 2012,Convertible,0
8141,08142_Mercedes-Benz SL-Class Coupe 2009.jpg,Mercedes-Benz SL-Class Coupe 2009,Coupe,0
8142,08143_Ford GT Coupe 2006.jpg,Ford GT Coupe 2006,Coupe,0


In [39]:
stanford_test_df["car_name"].unique()


array(['AM General Hummer SUV 2000', 'Acura RL Sedan 2012',
       'Acura TL Sedan 2012', 'Acura TL Type-S 2008',
       'Acura TSX Sedan 2012', 'Acura Integra Type R 2001',
       'Acura ZDX Hatchback 2012',
       'Aston Martin V8 Vantage Convertible 2012',
       'Aston Martin V8 Vantage Coupe 2012',
       'Aston Martin Virage Convertible 2012',
       'Aston Martin Virage Coupe 2012', 'Audi RS 4 Convertible 2008',
       'Audi A5 Coupe 2012', 'Audi TTS Coupe 2012', 'Audi R8 Coupe 2012',
       'Audi V8 Sedan 1994', 'Audi 100 Sedan 1994', 'Audi 100 Wagon 1994',
       'Audi TT Hatchback 2011', 'Audi S6 Sedan 2011',
       'Audi S5 Convertible 2012', 'Audi S5 Coupe 2012',
       'Audi S4 Sedan 2012', 'Audi S4 Sedan 2007',
       'Audi TT RS Coupe 2012', 'BMW ActiveHybrid 5 Sedan 2012',
       'BMW 1 Series Convertible 2012', 'BMW 1 Series Coupe 2012',
       'BMW 3 Series Sedan 2012', 'BMW 3 Series Wagon 2012',
       'BMW 6 Series Convertible 2007', 'BMW X5 SUV 2007',
       'BMW X

#### Because the actual names of the test images doesn't fit to the namings of the dataframe and because we couldn't find a structure how they are probably were renamed, we've decided to scrape images from google corresponding to the classnames

#### in google-image-screaper we scraped for the classes and then wrote the "image_checking" script to delete unnessecary file formats etc. we've then saved the created struct into a pickle file which we can easily import and read here

In [137]:
pickle_file_path = "../Google-Image-Scraper/picture_struct.pickle"

with open(pickle_file_path, 'rb') as file:
    loaded_struct = pickle.load(file)
loaded_struct

{'Acura Integra Type R 2001': ['AcuraIntegraTypeR20010.jpeg',
  'AcuraIntegraTypeR20011.jpeg',
  'AcuraIntegraTypeR200110.jpeg',
  'AcuraIntegraTypeR2001100.jpeg',
  'AcuraIntegraTypeR2001101.jpeg',
  'AcuraIntegraTypeR2001102.jpeg',
  'AcuraIntegraTypeR2001103.jpeg',
  'AcuraIntegraTypeR2001105.jpeg',
  'AcuraIntegraTypeR2001107.jpeg',
  'AcuraIntegraTypeR2001108.jpeg',
  'AcuraIntegraTypeR2001110.jpeg',
  'AcuraIntegraTypeR2001111.jpeg',
  'AcuraIntegraTypeR2001112.jpeg',
  'AcuraIntegraTypeR2001113.jpeg',
  'AcuraIntegraTypeR2001114.jpeg',
  'AcuraIntegraTypeR200112.jpeg',
  'AcuraIntegraTypeR200113.jpeg',
  'AcuraIntegraTypeR200114.jpeg',
  'AcuraIntegraTypeR200115.jpeg',
  'AcuraIntegraTypeR200116.png',
  'AcuraIntegraTypeR200118.jpeg',
  'AcuraIntegraTypeR200119.jpeg',
  'AcuraIntegraTypeR20012.jpeg',
  'AcuraIntegraTypeR200120.jpeg',
  'AcuraIntegraTypeR200121.jpeg',
  'AcuraIntegraTypeR200122.jpeg',
  'AcuraIntegraTypeR200123.jpeg',
  'AcuraIntegraTypeR200124.jpeg',
  'AcuraInt

#### mapping of pickle struct to test df 

In [141]:
test = stanford_test_df.copy()
car_names = test["car_name"].unique()
counters = {key:{"current_index": 0, "count": 0} for key in car_names}
cname_counts = test["car_name"].value_counts()

err = []
for index, data in test.iterrows():
    cname = data["car_name"]
    if data["car_name"] in loaded_struct:
            test.iloc[index, 0] = loaded_struct[cname][counters[cname]["current_index"]]
            counters[cname]["current_index"] += 1
            counters[cname]["count"] += 1
    else: 
          if cname not in err: 
               err.append(cname)
test

Unnamed: 0,name,car_name,car_type,is_test
0,AMGeneralHummerSUV20000.jpeg,AM General Hummer SUV 2000,SUV,1
1,AMGeneralHummerSUV20001.jpeg,AM General Hummer SUV 2000,SUV,1
2,AMGeneralHummerSUV200010.jpeg,AM General Hummer SUV 2000,SUV,1
3,AMGeneralHummerSUV2000100.jpeg,AM General Hummer SUV 2000,SUV,1
4,AMGeneralHummerSUV2000102.jpeg,AM General Hummer SUV 2000,SUV,1
...,...,...,...,...
8036,smartfortwoConvertible201233.png,smart fortwo Convertible 2012,Convertible,1
8037,smartfortwoConvertible201235.jpeg,smart fortwo Convertible 2012,Convertible,1
8038,smartfortwoConvertible201236.jpeg,smart fortwo Convertible 2012,Convertible,1
8039,smartfortwoConvertible201237.jpeg,smart fortwo Convertible 2012,Convertible,1


In [139]:
err

['Ram C/V Cargo Van Minivan 2012']

In [None]:
loaded_struct['Ram C/V Cargo Van Minivan 2012'] = loaded_struct.pop('Ram CV Cargo Van Minivan 2012')

In [129]:
# checking if rename was successfull 
c = 0
for index, data in test.iterrows(): 
    if data["name"] != stanford_test_df.iloc[index, 0]: c+=1
if c == len(test):
    print("all names were renamed")
else: 
    print("something went wrong")

all names were renamed


#### create new rows for the rest of scraped images 

In [130]:
new_rows = []
for key in loaded_struct: 
    while counters[key]["count"] <= len(loaded_struct[key])-1: 
        car_type = class_labels_dict[key]
        name = loaded_struct[key][counters[key]["current_index"]]
        new_rows.append({"name":name, "car_name": key, "car_type": car_type, "is_test": 1 })
        counters[key]["count"] += 1
        counters[key]["current_index"] += 1
len(new_rows)


10032

#### append new rows to test

In [131]:
new_rows

[{'name': 'AcuraIntegraTypeR20015.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200150.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200151.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200152.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200156.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200157.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200158.jpeg',
  'car_name': 'Acura Integra Type R 2001',
  'car_type': 'Compact car',
  'is_test': 1},
 {'name': 'AcuraIntegraTypeR200159.jpeg',
  'car_name': 'Acura Integra Type R 2001',

In [142]:
test = test.append(new_rows, ignore_index=True)

  test = test.append(new_rows, ignore_index=True)


Unnamed: 0,name,car_name,car_type,is_test
0,AMGeneralHummerSUV20000.jpeg,AM General Hummer SUV 2000,SUV,1
1,AMGeneralHummerSUV20001.jpeg,AM General Hummer SUV 2000,SUV,1
2,AMGeneralHummerSUV200010.jpeg,AM General Hummer SUV 2000,SUV,1
3,AMGeneralHummerSUV2000100.jpeg,AM General Hummer SUV 2000,SUV,1
4,AMGeneralHummerSUV2000102.jpeg,AM General Hummer SUV 2000,SUV,1
...,...,...,...,...
18068,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
18069,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
18070,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
18071,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1


In [143]:
final_df = cars_type_cleaned_cols.append(test, ignore_index=True)

  final_df = cars_type_cleaned_cols.append(test, ignore_index=True)


In [144]:
final_df

Unnamed: 0,name,car_name,car_type,is_test
0,00001_Audi TTS Coupe 2012.jpg,Audi TTS Coupe 2012,Coupe,0
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Pickup,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Pickup,0
...,...,...,...,...
26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1


In [None]:
final_df_path = "../../data/meta/final_dataset.csv"
final_df.to_csv(final_df_path, index=False)

In [3]:
test_import = pd.read_csv("../../data/meta/final_dataset.csv")
test_import

Unnamed: 0,name,car_name,car_type,is_test
0,00001_Audi TTS Coupe 2012.jpg,Audi TTS Coupe 2012,Coupe,0
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Pickup,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Pickup,0
...,...,...,...,...
26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1


In [8]:
test_import["car_type"].value_counts()

Sedan            6844
SUV              4489
Coupe            4372
Convertible      3218
Pickup           2010
Hatchback        1752
Van               830
Minivan           825
Crossover         800
Compact car       243
Wagon             166
Station Wagon     137
Cab               118
IPL                80
Name: car_type, dtype: int64

In [7]:
test_import["car_type"].unique()

array(['Coupe', 'Sedan', 'Pickup', 'Convertible', 'Crossover', 'SUV',
       'Minivan', 'Hatchback', 'Van', 'Cab', 'Wagon', 'Compact car',
       'Station Wagon', nan, 'IPL'], dtype=object)

In [10]:
nans = test_import[test_import["car_type"].isna()]
nans

Unnamed: 0,name,car_name,car_type,is_test
8790,Audi100Sedan19940.jpeg,Audi 100 Sedan 1994,,1
8791,Audi100Sedan199410.jpeg,Audi 100 Sedan 1994,,1
8792,Audi100Sedan1994100.jpeg,Audi 100 Sedan 1994,,1
8793,Audi100Sedan1994101.jpeg,Audi 100 Sedan 1994,,1
8794,Audi100Sedan1994103.jpeg,Audi 100 Sedan 1994,,1
...,...,...,...,...
25492,SuzukiAerioSedan200795.jpeg,Suzuki Aerio Sedan 2007,,1
25493,SuzukiAerioSedan200796.jpeg,Suzuki Aerio Sedan 2007,,1
25494,SuzukiAerioSedan200797.jpeg,Suzuki Aerio Sedan 2007,,1
25495,SuzukiAerioSedan200798.jpeg,Suzuki Aerio Sedan 2007,,1


In [13]:
nan_uniques = nans["car_name"].unique()
nan_uniques

array(['Audi 100 Sedan 1994', 'Audi 100 Wagon 1994',
       'Nissan NV Passenger Van 2012', 'Suzuki Aerio Sedan 2007'],
      dtype=object)

In [14]:
nan_replacements = {}

for i in nan_uniques: 
    split = i.split(" ")
    body = split[-2]
    struct = {i:body}
    nan_replacements.update(struct)
nan_replacements

{'Audi 100 Sedan 1994': 'Sedan',
 'Audi 100 Wagon 1994': 'Wagon',
 'Nissan NV Passenger Van 2012': 'Van',
 'Suzuki Aerio Sedan 2007': 'Sedan'}

In [15]:
for index, data in test_import.iterrows(): 
    if data["car_name"] in nan_replacements: 
        test_import.iloc[index, 2] = nan_replacements[data["car_name"]]

test_import["car_type"].unique() # no NaNs anymore 

array(['Coupe', 'Sedan', 'Pickup', 'Convertible', 'Crossover', 'SUV',
       'Minivan', 'Hatchback', 'Van', 'Cab', 'Wagon', 'Compact car',
       'Station Wagon', 'IPL'], dtype=object)

In [17]:
test_import[test_import["car_type"] == "IPL"]["car_name"].unique()

array(['Infiniti G Coupe IPL 2012'], dtype=object)

In [18]:
for index, data in test_import.iterrows(): 
    if data["car_type"] == "IPL":
        test_import.iloc[index, 2] = "Coupe"

test_import["car_type"].unique() # delete IPL's 
        

array(['Coupe', 'Sedan', 'Pickup', 'Convertible', 'Crossover', 'SUV',
       'Minivan', 'Hatchback', 'Van', 'Cab', 'Wagon', 'Compact car',
       'Station Wagon'], dtype=object)

In [20]:
test_import.to_csv("../../data/meta/final_dataset_raw.csv", index=False)