In [71]:
import pandas as pd
import numpy as np
# Source tables, read relative to the notebook's working directory.
# price_df    -> filtered below on Maker / Genmodel / Entry_price
# sales_df    -> filtered below on Maker / Genmodel; the remaining columns are summed as sales
# trim_df     -> loaded here but not referenced by any cell visible in this notebook
# ad_table_df -> filtered below on Maker / Genmodel / Average_mpg / Engine_power / Top_speed
price_df = pd.read_csv("dataset/Price_table.csv")
sales_df = pd.read_csv("dataset/Sales_table.csv")
trim_df = pd.read_csv("dataset/Trim_table.csv")
ad_table_df = pd.read_csv("dataset/Ad_table (extra).csv")

# brand name -> {model name -> list of five scores}; filled in further down this cell
car_brand_directory = {}
# sales-table Genmodel -> five-slot list; slot 0 receives the sales total below
data = {}

def normalize_values(data):
    """Min-max scale every position of the value lists in ``data`` to [0, 1].

    ``data`` maps a key to a sequence of numbers. Position ``i`` of every
    sequence is treated as one column, and each column is rescaled on its
    own with ``(value - min) / (max - min)``. A column whose values are all
    equal is mapped to 0. The result is a new dict with the same keys in the
    same order, each holding a list of rescaled numbers; ``data`` itself is
    left unmodified.
    """
    rows = list(data.values())

    scaled_columns = []
    for column in zip(*rows):
        lowest, highest = min(column), max(column)
        if highest != lowest:
            spread = highest - lowest
            scaled_columns.append(
                [(value - lowest) / spread for value in column])
        else:
            # Constant column: nothing to spread out, so every entry scores 0.
            scaled_columns.append([0] * len(column))

    # Turn the columns back into one row per key.
    scaled_rows = zip(*scaled_columns)
    return {key: list(row) for key, row in zip(data, scaled_rows)}


def find_best_car(car_data):
    """Return the key of ``car_data`` whose value list has the highest mean.

    ``car_data`` maps a key to a non-empty sequence of numbers (this notebook
    stores five scores per model). If several keys share the highest mean,
    the one that comes first in the dict is returned.
    """
    def mean_score(model):
        scores = car_data[model]
        return sum(scores) / len(scores)

    return max(car_data, key=mean_score)

# ---- Audi: build a five-slot score list per model ---------------------------
# Slot layout of every score list:
#   0: total sales    (higher is better)
#   1: average price  (lower is better  -> inverted after normalisation)
#   2: average mpg    (higher is better)
#   3: engine power   (lower is better, cost saving -> inverted after normalisation)
#   4: top speed      (higher is better)

# Price average for each series (slot 1); rows whose Entry_price is 0 are ignored.
car_price = price_df.loc[price_df['Maker'] == 'Audi', ["Genmodel", "Entry_price"]]
car_price = car_price[car_price['Entry_price'] != 0]
price_average = car_price.groupby('Genmodel')['Entry_price'].mean()
price_average = price_average.to_dict()

# Average_mpg (slot 2), Engine power (slot 3), Top speed (slot 4)
car_spec = ad_table_df.loc[ad_table_df['Maker']
                           == 'Audi', ['Genmodel', 'Average_mpg', 'Engine_power', 'Top_speed']]

# Drop adverts with any missing spec, then strip the unit suffixes so the
# columns can be averaged as floats.
car_spec = car_spec.dropna()
car_spec['Average_mpg'] = car_spec['Average_mpg'].str.replace(
    ' mpg', '').astype(float)
car_spec['Top_speed'] = car_spec['Top_speed'].str.replace(
    ' mph', '').astype(float)
car_spec_average = car_spec.groupby('Genmodel')[['Average_mpg', 'Engine_power', 'Top_speed']].mean()
# {column name -> {Genmodel -> mean}}
car_spec_average = car_spec_average.to_dict()

# Sales data (slot 0). The sales table spells the maker in upper case.
car_sales = sales_df.loc[sales_df['Maker'] == 'AUDI']
car_gen_model_names = car_sales['Genmodel'].drop_duplicates().to_list()
# Every column after the first three is summed as a sales count.
# Computed once: it does not depend on the model being processed.
sales_columns = car_sales.columns[3:]

for genmodel in car_gen_model_names:
    specific_genmodel_df = car_sales[car_sales['Genmodel'] == genmodel]
    # Add up every row that carries this Genmodel. Taking only the first
    # row's total would silently drop sales if a Genmodel appears on more
    # than one row.
    sales_total = specific_genmodel_df[sales_columns].sum(axis=1).sum()
    # Slots 1-4 start at 0; a 0 left in place later marks the model as incomplete.
    data[genmodel] = [sales_total, 0, 0, 0, 0]
# at this stage we found the sales total (popularity) of each car gen model

# Re-key by the last whitespace-separated token of the sales-table name so it
# can be looked up in the price / ad-table averages.
# NOTE(review): assumes that token equals the Genmodel used by those tables;
# two names ending in the same token overwrite each other here.
new_data = {key.split()[-1]: value for key, value in data.items()}
gen_model_names = list(new_data.keys())
for genmodel in gen_model_names:
    if genmodel in price_average and genmodel in car_spec_average['Average_mpg']:
        new_data[genmodel][1] = price_average[genmodel]
        new_data[genmodel][2] = car_spec_average['Average_mpg'][genmodel]
        new_data[genmodel][3] = car_spec_average['Engine_power'][genmodel]
        new_data[genmodel][4] = car_spec_average['Top_speed'][genmodel]

# Keep only models for which all five slots are non-zero.
car_brand_directory['Audi'] = {
    k: v for k, v in new_data.items() if all(value != 0 for value in v)}

# Min-max scale each slot across the remaining models.
car_brand_directory['Audi'] = normalize_values(car_brand_directory['Audi'])

# Reverse the price and engine-power scales so that a lower raw value scores higher.
for genmodel in car_brand_directory['Audi']:
    scores = car_brand_directory['Audi'][genmodel]
    scores[1] = 1 - scores[1]
    scores[3] = 1 - scores[3]

# Report: best model, per-slot mean across all models, and the best model's scores.
best_car = find_best_car(car_brand_directory['Audi'])
print(best_car)
average_performance = [sum(x) / len(x)
                       for x in zip(*car_brand_directory['Audi'].values())]
print(average_performance)
print(car_brand_directory['Audi'][best_car])

A3
[0.2638460663996146, 0.7602518942906032, 0.5999564285499019, 0.6986379069544106, 0.3601381411806658]
[1.0, 0.9546968947456641, 0.8341141188581681, 0.8711492184305467, 0.2257207709841396]


In [72]:
# VOLVO sales, 2016 - 2020
# Rows whose Maker is VOLVO, restricted to the 2016-2020 year columns.
volvo_sales = sales_df.loc[sales_df['Maker'].eq('VOLVO'),
                           ["2016", "2017", "2018", "2019", "2020"]]

# Column-wise sum: one total per year, as a NumPy array.
volvo_sales_sum = volvo_sales.sum(axis=0).values

volvo_sales_sum

array([42914, 42273, 47659, 53773, 44328])

In [73]:
# FERRARI sales, 2016 - 2020
# NOTE(review): X is not read by any cell visible in this notebook; it is
# kept only because it is global state that other code might rely on.
X = sales_df[['Maker', "2016", "2017", "2018", "2019", "2020"]].values

# Rows whose Maker is FERRARI, restricted to the 2016-2020 year columns.
ferrari_sales = sales_df.loc[sales_df['Maker'].eq('FERRARI'),
                             ["2016", "2017", "2018", "2019", "2020"]]

# Column-wise sum: one total per year, as a NumPy array.
ferrari_sales_sum = ferrari_sales.sum(axis=0).values

np.array(ferrari_sales_sum)

array([472, 485, 488, 550, 468])

In [74]:
# TOYOTA sales, 2016 - 2020
# NOTE(review): X is not read by any cell visible in this notebook; it is
# kept only because it is global state that other code might rely on.
X = sales_df[['Maker', "2016", "2017", "2018", "2019", "2020"]].values

# Rows whose Maker is TOYOTA, restricted to the 2016-2020 year columns.
toyota_sales = sales_df.loc[sales_df['Maker'].eq('TOYOTA'),
                            ["2016", "2017", "2018", "2019", "2020"]]

# Column-wise sum: one total per year, as a NumPy array.
toyota_sales_sum = toyota_sales.sum(axis=0).values

toyota_sales_sum

array([89822, 91954, 95267, 99904, 90020])

In [75]:
import pandas as pd

# Year columns of the sales table: "2001" ... "2020".
years = [str(year) for year in range(2001, 2021)]

# One row per maker, one column per year, computed in a single pass.
# sort=False keeps the makers in order of first appearance. With the default
# sort=True the totals come back in sorted maker order, which need not match
# a brand list built with drop_duplicates(), so brand i could be paired with
# another brand's totals.
sales_by_maker = sales_df.groupby('Maker', sort=False)[years].sum()

# 'Brand' and every year list are taken from the same grouped frame, so
# position i always refers to the same maker.
sales = {'Brand': sales_by_maker.index.to_list()}
for year in years:
    sales[year] = sales_by_maker[year].to_list()

# Number of distinct brands (a count, despite the name).
total_brand_sum = len(sales['Brand'])

In [76]:
# Per-brand totals for every sales column, plus a ranking of brands by overall sales.
data = {'Total_Sales': []}

sales_df = pd.read_csv("dataset/Sales_table.csv")
# Everything except the three identifier columns.
sales_data_df = sales_df.drop(columns=['Maker', 'Genmodel', 'Genmodel_ID'])

# Every distinct maker, in order of first appearance.
car_brand_names = sales_df['Maker'].drop_duplicates().to_list()
for brand in car_brand_names:
    brand_rows = sales_data_df.loc[sales_df['Maker'] == brand]
    # Column totals for this brand, stored in the reverse of the file's column order.
    data[brand] = brand_rows.sum(axis=0).values[::-1]
    data['Total_Sales'].append((brand, sum(data[brand])))

print(data['FORD'])

# Sort ascending by total, flip to descending, and keep the first 20 entries.
ascending_totals = sorted(data['Total_Sales'], key=lambda n: n[1])
data['Total_Sales'] = list(reversed(ascending_totals))[:20]
data["Top_20_Brand"] = [name for name, _ in data['Total_Sales']]

[ 28528  52391  82486 108559 134923 174725 217008 225189 256458 237525
 233066 251763 283204 300241 311575 296259 260372 238331 225680 147165]


In [77]:
import pandas
import numpy as np
sales_forecast_df = pandas.read_csv("prediction_analysis/Projected_Sales_data.csv")

# Build a display order for the forecast columns: the three leading columns,
# then columns 23-28 reversed, then columns 3-22 in their file order.
columns = list(sales_forecast_df.columns)
df_segment0 = columns[0:3]
df_segment1 = columns[3:23]
df_segment2 = columns[23:29]

# Stored under its own name: this is a plain list of column labels, and
# binding it to `sales_df` would replace the sales DataFrame that the other
# cells filter and group, breaking them on re-run.
# NOTE(review): the order is only displayed here; sales_forecast_df itself is
# not reindexed.
reordered_columns = df_segment0 + df_segment2[::-1] + df_segment1
reordered_columns

['Maker',
 'Genmodel',
 'Genmodel_ID',
 '2026',
 '2025',
 '2024',
 '2023',
 '2022',
 '2021',
 '2020',
 '2019',
 '2018',
 '2017',
 '2016',
 '2015',
 '2014',
 '2013',
 '2012',
 '2011',
 '2010',
 '2009',
 '2008',
 '2007',
 '2006',
 '2005',
 '2004',
 '2003',
 '2002',
 '2001']

In [78]:
import pandas as pd
import numpy as np
# Source tables, read relative to the notebook's working directory.
# Re-reading sales_df here also restores it as a DataFrame after the previous
# cell bound that name to a list of column labels.
# trim_df is loaded but not referenced by any cell visible in this notebook.
price_df = pd.read_csv("dataset/Price_table.csv")
sales_df = pd.read_csv("dataset/Sales_table.csv")
trim_df = pd.read_csv("dataset/Trim_table.csv")
ad_table_df = pd.read_csv("dataset/Ad_table (extra).csv")

# brand name -> {model name -> list of five scores}; filled in further down this cell
car_brand_directory = {}
# sales-table Genmodel -> five-slot list; slot 0 receives the sales total below
data = {}


def normalize_values(data):
    """Min-max scale every position of the value lists in ``data`` to [0, 1].

    ``data`` maps a key to a sequence of numbers. Position ``i`` of every
    sequence is treated as one column, and each column is rescaled on its
    own with ``(value - min) / (max - min)``. A column whose values are all
    equal is mapped to 0. The result is a new dict with the same keys in the
    same order, each holding a list of rescaled numbers; ``data`` itself is
    left unmodified.
    """
    rows = list(data.values())

    scaled_columns = []
    for column in zip(*rows):
        lowest, highest = min(column), max(column)
        if highest != lowest:
            spread = highest - lowest
            scaled_columns.append(
                [(value - lowest) / spread for value in column])
        else:
            # Constant column: nothing to spread out, so every entry scores 0.
            scaled_columns.append([0] * len(column))

    # Turn the columns back into one row per key.
    scaled_rows = zip(*scaled_columns)
    return {key: list(row) for key, row in zip(data, scaled_rows)}


def find_best_car(car_data):
    """Return the key of ``car_data`` whose value list has the highest mean.

    ``car_data`` maps a key to a non-empty sequence of numbers (this notebook
    stores five scores per model). If several keys share the highest mean,
    the one that comes first in the dict is returned.
    """
    def mean_score(model):
        scores = car_data[model]
        return sum(scores) / len(scores)

    return max(car_data, key=mean_score)


# ---- Mercedes: build a five-slot score list per model -----------------------
# Slot layout of every score list:
#   0: total sales    (higher is better)
#   1: average price  (lower is better  -> inverted after normalisation)
#   2: average mpg    (higher is better)
#   3: engine power   (lower is better, cost saving -> inverted after normalisation)
#   4: top speed      (higher is better)

# Price average for each series (slot 1); rows whose Entry_price is 0 are ignored.
car_price = price_df.loc[price_df['Maker']
                         == 'Mercedes-Benz', ["Genmodel", "Entry_price"]]
car_price = car_price[car_price['Entry_price'] != 0]
price_average = car_price.groupby('Genmodel')['Entry_price'].mean()
price_average = price_average.to_dict()

# Average_mpg (slot 2), Engine power (slot 3), Top speed (slot 4)
car_spec = ad_table_df.loc[ad_table_df['Maker']
                           == 'Mercedes-Benz', ['Genmodel', 'Average_mpg', 'Engine_power', 'Top_speed']]

# Drop adverts with any missing spec, then strip the unit suffixes so the
# columns can be averaged as floats.
car_spec = car_spec.dropna()
car_spec['Average_mpg'] = car_spec['Average_mpg'].str.replace(
    ' mpg', '').astype(float)
car_spec['Top_speed'] = car_spec['Top_speed'].str.replace(
    ' mph', '').astype(float)
car_spec_average = car_spec.groupby(
    'Genmodel')[['Average_mpg', 'Engine_power', 'Top_speed']].mean()
# {column name -> {Genmodel -> mean}}
car_spec_average = car_spec_average.to_dict()

# Sales data (slot 0). The sales table names this maker 'MERCEDES', whereas
# the price and ad tables use 'Mercedes-Benz'.
car_sales = sales_df.loc[sales_df['Maker'] == 'MERCEDES']
car_gen_model_names = car_sales['Genmodel'].drop_duplicates().to_list()
# Every column after the first three is summed as a sales count.
# Computed once: it does not depend on the model being processed.
sales_columns = car_sales.columns[3:]

for genmodel in car_gen_model_names:
    specific_genmodel_df = car_sales[car_sales['Genmodel'] == genmodel]
    # Add up every row that carries this Genmodel. Taking only the first
    # row's total would silently drop sales if a Genmodel appears on more
    # than one row.
    sales_total = specific_genmodel_df[sales_columns].sum(axis=1).sum()
    # Slots 1-4 start at 0; a 0 left in place later marks the model as incomplete.
    data[genmodel] = [sales_total, 0, 0, 0, 0]
# at this stage we found the sales total (popularity) of each car gen model

# Re-key by the last whitespace-separated token of the sales-table name so it
# can be looked up in the price / ad-table averages.
# NOTE(review): assumes that token equals the Genmodel used by those tables;
# two names ending in the same token overwrite each other here.
new_data = {key.split()[-1]: value for key, value in data.items()}
gen_model_names = list(new_data.keys())
for genmodel in gen_model_names:
    if genmodel in price_average and genmodel in car_spec_average['Average_mpg']:
        new_data[genmodel][1] = price_average[genmodel]
        new_data[genmodel][2] = car_spec_average['Average_mpg'][genmodel]
        new_data[genmodel][3] = car_spec_average['Engine_power'][genmodel]
        new_data[genmodel][4] = car_spec_average['Top_speed'][genmodel]

# Keep only models for which all five slots are non-zero.
car_brand_directory['Mercedes'] = {
    k: v for k, v in new_data.items() if all(value != 0 for value in v)}

# Min-max scale each slot across the remaining models.
car_brand_directory['Mercedes'] = normalize_values(
    car_brand_directory['Mercedes'])

# Reverse the price and engine-power scales so that a lower raw value scores higher.
for genmodel in car_brand_directory['Mercedes']:
    scores = car_brand_directory['Mercedes'][genmodel]
    scores[1] = 1 - scores[1]
    scores[3] = 1 - scores[3]

# print(find_best_car(car_brand_directory['Mercedes']))

In [5]:
import pandas as pd
import numpy as np
# Load the projected price table and keep only the rows whose Maker is 'Audi'.
projected_price_df = pd.read_csv("Prediction Analysis/Projected_Price_data.csv")
is_audi = projected_price_df['Maker'] == 'Audi'
projected_price = projected_price_df.loc[is_audi]
projected_price

Unnamed: 0,Maker,Genmodel,Genmodel_ID,2022,2023,2024,2025,2026
28,Audi,A1,7_1,16587.681748,17226.838694,17898.867016,18602.890038,19338.653479
29,Audi,A2,7_2,22475.475866,23373.402676,24307.574225,25279.342004,26290.141051
30,Audi,A3,7_3,22224.31175,23149.850225,24101.440438,25083.415397,26099.294153
31,Audi,A4,7_5,29585.567523,30781.958319,32022.314318,33309.561257,34646.391764
32,Audi,A5,7_9,33503.908123,34884.406357,36308.021948,37780.110361,39305.15205
33,Audi,A6,7_11,35635.226019,36969.447174,38384.393541,39875.087219,41438.813263
34,Audi,A7,7_15,50752.092657,52637.471279,54641.676572,56756.438143,58977.061825
35,Audi,A8,7_16,69357.759926,72136.465114,75025.000072,78028.153523,81150.787077
36,Audi,Q2,7_18,23232.159611,24089.848175,25003.323628,25968.373641,26982.550535
37,Audi,Q3,7_19,28333.471804,29357.997509,30456.148193,31621.075669,32848.595779
