# Data labeling
Merge the collected data into one dataset.

## Imports 

In [1]:
import pandas as pd 
import numpy as np
from os import listdir, remove, path
import plotly.express as px 
from scipy.io import loadmat
import requests
import json
from datetime import datetime
import traceback

### Plot labels of final dataset

In [7]:
# Raw metadata CSV; the path is relative to the notebook's working directory.
final_dataset_path = "../../data/meta/final_dataset_raw.csv"

# Read the table into memory and show it as the cell output.
final_df = pd.read_csv(filepath_or_buffer=final_dataset_path)
final_df

Unnamed: 0.1,Unnamed: 0,name,car_name,car_type,is_test
0,1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
1,2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Pickup,0
2,3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
3,4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Pickup,0
4,5,00006_Geo Metro Convertible 1993.jpg,Geo Metro Convertible 1993,Convertible,0
...,...,...,...,...,...
26119,26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26120,26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26121,26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26122,26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1


In [8]:
# Number of rows per car type.
car_types_final = final_df["car_type"].value_counts()

# Vertical bar chart: one bar per label, bar height = count.
fig = px.bar(x=car_types_final.index, y=car_types_final.values)

# Bars are drawn in black.
fig.update_traces(marker_color='black')

# Dark theme, titles and white tick labels applied in a single layout call.
tick_style = dict(color='white', size=13)
fig.update_layout(
    plot_bgcolor='darkgrey',
    paper_bgcolor='black',
    title="Counts of labels",
    title_font=dict(color='white'),
    xaxis_title="Label",
    yaxis_title="Count",
    xaxis=dict(tickfont=tick_style),
    yaxis=dict(tickfont=tick_style),
)
fig.show()

### Because coupes, convertibles, and cabs span a wide range of car sizes, we drop them

In [4]:
# Body styles that are removed from the dataset.
cuts = ["Coupe", "Convertible", "Cab"]

# Keep every row whose car_type is not in `cuts`. The explicit .copy() makes
# new_labels_df an independent frame, so the in-place edits performed on it
# further down do not operate on a slice of final_df (which would raise
# SettingWithCopyWarning and leave it unclear which frame is modified).
new_labels_df = final_df[~final_df["car_type"].isin(cuts)].copy()
new_labels_df

Unnamed: 0,name,car_name,car_type,is_test
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Sedan,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Pickup,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Sedan,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Pickup,0
6,00007_Dodge Journey SUV 2012.jpg,Dodge Journey SUV 2012,Crossover,0
...,...,...,...,...
26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1
26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Minivan,1


### Summarize labels into small, midsize, and large

In [5]:
# Start with every car type present in the data mapped to None, so a type that
# is not assigned a size class below stays visible as None in the mapping.
new_labels_struct = dict.fromkeys(new_labels_df["car_type"].unique())

# Assign a size class (Small / Midsize / Large) to each known car type.
new_labels_struct.update({
    "Sedan": "Midsize",
    "SUV": "Large",
    "Pickup": "Large",
    "Hatchback": "Small",
    "Van": "Large",
    "Minivan": "Large",
    "Crossover": "Midsize",
    "Compact car": "Small",
    "Wagon": "Midsize",
    "Station Wagon": "Midsize",
})


In [6]:
# Show the finished mapping; a value of None marks a car type that was not assigned a size class.
new_labels_struct

{'Sedan': 'Midsize',
 'Pickup': 'Large',
 'Crossover': 'Midsize',
 'SUV': 'Large',
 'Minivan': 'Large',
 'Hatchback': 'Small',
 'Van': 'Large',
 'Wagon': 'Midsize',
 'Compact car': 'Small',
 'Station Wagon': 'Midsize'}

In [7]:
# Translate each car type into its size class in one vectorized pass instead
# of a row-by-row iterrows()/.at loop.
size_class = new_labels_df["car_type"].map(new_labels_struct)

# A label has no size class when it is missing from the mapping or still maps
# to None. A plain `key in dict` test cannot detect the latter, because every
# car type found in the data was pre-seeded as a key.
unmapped = size_class.isna()
if unmapped.any():
    # Name the offending labels once; the affected rows keep their current label.
    print(
        "error: no size class for",
        sorted(new_labels_df.loc[unmapped, "car_type"].astype(str).unique()),
    )

# Build a new frame rather than writing into a filtered slice cell by cell.
new_labels_df = new_labels_df.assign(
    car_type=size_class.where(~unmapped, new_labels_df["car_type"])
)

In [8]:
# Inspect the frame after relabeling: car_type now holds the size class.
new_labels_df

Unnamed: 0,name,car_name,car_type,is_test
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Midsize,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Large,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Midsize,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Large,0
6,00007_Dodge Journey SUV 2012.jpg,Dodge Journey SUV 2012,Midsize,0
...,...,...,...,...
26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1


In [9]:
# Rows per size class, smallest count first.
unis = new_labels_df["car_type"].value_counts().sort_values(ascending=True)

# Horizontal bar chart: counts run along the x axis, labels along the y axis.
fig = px.bar(x=unis.values, y=unis.index, orientation="h")

# Customize the background color and titles. Because the chart is horizontal,
# the x axis carries the counts and the y axis carries the labels.
fig.update_layout(
    plot_bgcolor='darkgrey',
    paper_bgcolor='black',
    title="Counts of labels",
    title_font=dict(color='white'),
    xaxis_title="Count",
    yaxis_title="Label",
)
# Customize the color of bars
fig.update_traces(marker_color='black')

# Customize the color of tick labels
fig.update_layout(
    xaxis=dict(tickfont=dict(color='white', size=13)),
    yaxis=dict(tickfont=dict(color='white', size=13))
)
fig.show()

In [17]:
# The pictures whose file name starts with "BMWX6SUV2012" are not correct, so
# their rows are removed. A boolean mask avoids dropping rows in place while
# iterating over the same frame; rows with a missing name are kept (na=False).
is_incorrect_image = new_labels_df["name"].str.startswith("BMWX6SUV2012", na=False)
new_labels_df = new_labels_df[~is_incorrect_image]

# Persist the labeled dataset.
# NOTE(review): the index is written as an extra unnamed column, which shows up
# as "Unnamed: 0" when the file is read back — confirm whether downstream
# readers rely on it before switching to index=False.
new_labels_df.to_csv("../../data/meta/final_dataset_labeled.csv")