In [None]:
import pandas as pd
import numpy as np

# define the features and their possible values
features = {
    "Model Type": ["Sedan", "SUV", "Coupe", "Hatchback", "Convertible", "Wagon", "Minivan", "Truck"],
    "Fuel Type": ["Gasoline", "Diesel", "Electric", "Hybrid"],
    "Transmission Type": ["Manual", "Automatic", "CVT"],
    "Drive Type": ["FWD", "RWD", "AWD"],
    "Safety Rating": ["1 star", "2 stars", "3 stars", "4 stars", "5 stars"],
    "Interior Material": ["Cloth", "Leather", "Synthetic"],
    "Infotainment System": ["Basic", "Advanced", "Premium", "None"],
    "Country of Manufacture": ["C1", "C2", "C3", "C4", "C5"],
    "Warranty Length": ["3 years", "5 years", "7 years", "10 years"],
    "Number of Doors": ["2", "4", "5"],
    "Number of Seats": ["2", "4", "5", "7"],
    "Air Conditioning": ["Yes", "No"],
    "Navigation System": ["None", "Basic", "Advanced"],
    "Tire Type": ["All-Season", "Summer", "Winter"],
    "Sunroof": ["Yes", "No"],
    "Sound System": ["Standard", "Premium", "High-end", "None"],
    "Cruise Control": ["Yes", "No"],
    "Bluetooth Connectivity": ["Yes", "No"],
}

# define the number of samples
num_samples = 3000000

# Seed NumPy's global RNG before the first draw so that every later
# np.random.* call in this notebook is reproducible on a fresh kernel.
# Change RANDOM_SEED to generate a different (but still repeatable) dataset.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# store the generated data (column name -> array of num_samples values)
data = {}

# Sampling weights, aligned position-by-position with features['Model Type'].
# SUV dominates (0.45), followed by Coupe (0.23) and Sedan (0.22); the
# remaining five types share the last 0.10.
model_type_probs = [0.22, 0.45, 0.23, 0.02, 0.01, 0.01, 0.01, 0.05]
data['Model Type'] = np.random.choice(features['Model Type'], num_samples, p=model_type_probs)

# Boolean mask over the samples: True where the sampled model type is 'SUV'
is_suv = np.isin(data['Model Type'], ['SUV'])

# function to generate values with the required probability distribution
def generate_feature_with_high_probability(high_prob_values, low_prob_values, high_prob, size):
    """
    Draw `size` values from a two-pool mixture.

    For each output position a uniform number in [0, 1) is compared with
    `high_prob`. When it is smaller, the value comes from a uniform draw over
    `high_prob_values`; otherwise it comes from a uniform draw over
    `low_prob_values`.

    Note that a value present in both pools can be produced by either branch,
    so its overall frequency is then larger than `high_prob`.

    Parameters:
        high_prob_values: sequence of values for the favoured pool.
        low_prob_values: sequence of values for the fallback pool.
        high_prob: probability of taking a position from the favoured pool.
        size: number of values to generate.

    Returns:
        A NumPy array of length `size`.
    """
    # The three draws happen in this fixed order: selection mask, favoured
    # pool, fallback pool. Both pools are sampled for every position and the
    # mask decides which sample is kept.
    take_from_high_pool = np.random.rand(size) < high_prob
    high_pool_draws = np.random.choice(high_prob_values, size)
    low_pool_draws = np.random.choice(low_prob_values, size)
    return np.where(take_from_high_pool, high_pool_draws, low_pool_draws)

# assign specific features for SUVs
#
# Each entry is (feature name, value favoured for SUVs, probability of forcing
# that value). A favoured value of None means the feature is drawn uniformly
# for every model type. The order of this list fixes both the order of the
# random draws and the column order of the resulting dataframe.
#
# For SUVs the fallback pool is the feature's full value list, which also
# contains the favoured value, so its overall share among SUVs is
# p + (1 - p) / len(features[name]) rather than exactly p.
suv_feature_bias = [
    ('Safety Rating', '4 stars', 0.7),
    ('Fuel Type', 'Diesel', 0.8),
    ('Interior Material', 'Synthetic', 0.9),
    ('Infotainment System', 'Premium', 0.9),
    ('Country of Manufacture', 'C5', 0.3),
    ('Warranty Length', '10 years', 0.9),
    ('Number of Doors', '5', 0.95),
    ('Number of Seats', '5', 0.95),
    ('Air Conditioning', 'Yes', 0.95),
    ('Navigation System', 'Advanced', 0.95),
    ('Tire Type', 'All-Season', 0.95),
    ('Sunroof', None, None),  # uniform distribution
    ('Sound System', 'Premium', 0.95),
    ('Cruise Control', 'Yes', 0.95),
    ('Bluetooth Connectivity', 'Yes', 0.95),
    ('Transmission Type', 'Manual', 0.7),
    ('Drive Type', 'RWD', 0.95),
]

for feature_name, suv_value, suv_prob in suv_feature_bias:
    feature_values = features[feature_name]

    if suv_value is None:
        # No SUV-specific bias: every sample is drawn uniformly.
        data[feature_name] = np.random.choice(feature_values, num_samples)
        continue

    # Catch typos in the table above before generating millions of rows.
    if suv_value not in feature_values:
        raise ValueError(
            f"{suv_value!r} is not a valid value for feature {feature_name!r}"
        )

    # Biased draw first, uniform draw second; the SUV mask then picks, per
    # sample, which of the two candidate values is kept.
    suv_draws = generate_feature_with_high_probability(
        [suv_value], feature_values, suv_prob, num_samples
    )
    other_draws = np.random.choice(feature_values, num_samples)
    data[feature_name] = np.where(is_suv, suv_draws, other_draws)

# convert the dictionary to a dataframe
df = pd.DataFrame(data)

# the label is a score-based evaluation defined as follows
def score_based_evaluation(row):
    """
    Turn one car record into a label: 'Excellent', 'Good', 'Average' or 'Poor'.

    `row` is indexed by column name and must provide 'Country of Manufacture'
    and 'Model Type'; the regular scoring path also reads 'Safety Rating',
    'Fuel Type' and 'Warranty Length'.

    Scoring rules:
      * SUVs manufactured in 'C2' ignore their attributes: one uniform draw
        sets the score to 13 (draw < 0.6), 0 (draw < 0.9) or 2 (otherwise).
      * Any other car is scored normally when the noise-gate draw is above
        0.05: safety points + fuel points + warranty points + a random bonus
        taken from [1, 3, 4].
      * Any other car whose noise-gate draw is 0.05 or below keeps a score
        of 0.

    Score to label: >= 13 'Excellent', >= 8 'Good', >= 2 'Average', else 'Poor'.
    """
    # The noise-gate draw is consumed for every row, including C2 SUVs.
    gate_open = np.random.rand() > 0.05
    is_c2_suv = row['Country of Manufacture'] == 'C2' and row['Model Type'] == 'SUV'

    if is_c2_suv:
        # Attribute-independent outcome for the special sub-population.
        draw = np.random.rand()
        if draw < 0.6:
            total = 13
        elif draw < 0.9:
            total = 0
        else:
            total = 2
    elif gate_open:
        safety_points = {'5 stars': 5, '4 stars': 4, '3 stars': 3, '2 stars': 2, '1 star': 1}
        fuel_points = {'Electric': 4, 'Hybrid': 3, 'Gasoline': 1, 'Diesel': 1}
        warranty_points = {'10 years': 5, '7 years': 4, '5 years': 3, '3 years': 2}

        total = safety_points[row['Safety Rating']]
        total += fuel_points[row['Fuel Type']]
        total += warranty_points[row['Warranty Length']]
        # random bonus on top of the attribute-based points
        total += np.random.choice([1,3,4])
    else:
        # Gate closed: the record keeps the initial score of zero.
        total = 0

    # Walk the thresholds from highest to lowest; the first one reached wins.
    for minimum, label in ((13, 'Excellent'), (8, 'Good'), (2, 'Average')):
        if total >= minimum:
            return label
    return 'Poor'

# Label every record: score_based_evaluation is invoked once per row
# (axis=1) and the returned labels become the 'Car Evaluation' column.
car_evaluation = df.apply(score_based_evaluation, axis=1)
df['Car Evaluation'] = car_evaluation

# Print a short preview (first rows) of the labelled dataset
preview = df.head()
print(preview)

In [None]:
import os

# Derive every path first: the project root is taken to be two directory
# levels above the current working directory, and the CSV goes into its
# 'data' sub-folder.
current_path = os.getcwd()
two_levels_up = os.path.join(current_path, '..', '..')
project_root = os.path.abspath(two_levels_up)
data_folder = os.path.join(project_root, 'data')
data_path = os.path.join(data_folder, 'car_data_synthetic.csv')

# Side effects: create the folder if it is missing, then write the dataset
# without the dataframe index.
os.makedirs(data_folder, exist_ok=True)
df.to_csv(data_path, index=False)