## Generate Dummy Data

In [1]:

import pandas as pd
import numpy as np
import random

def generate_car_data(num_rows=1000, seed=42):
    """Generate a synthetic car-listing DataFrame for demos and testing.

    Every column is sampled independently, so the values are not meant to be
    realistic in combination (e.g. any 'Model' can appear with any 'Make').

    Parameters
    ----------
    num_rows : int, default 1000
        Number of rows to generate.
    seed : int or None, default 42
        Seed passed to ``np.random.seed`` before sampling, so repeated calls
        with the same arguments return identical data. Pass ``None`` to skip
        reseeding and draw from NumPy's current global random state instead.

    Returns
    -------
    pandas.DataFrame
        ``num_rows`` rows with the columns 'Car ID', 'Make', 'Model', 'Year',
        'Engine Size (L)', 'Fuel Type', 'Mileage (km)', 'Price', 'Color' and
        'Is Imported'.

    Notes
    -----
    When ``seed`` is not ``None`` this reseeds NumPy's *global* random state,
    which also affects later ``np.random`` calls made outside this function.
    """
    # Seed for reproducibility (skipped when the caller passes seed=None).
    if seed is not None:
        np.random.seed(seed)

    # Sample data
    makes = ['Toyota', 'Honda', 'Ford', 'Tesla', 'Chevrolet']
    models = ['Model A', 'Model B', 'Model C', 'Model D', 'Model E']
    fuel_types = ['Petrol', 'Diesel', 'Electric', 'Hybrid']
    colors = ['White', 'Black', 'Red', 'Blue', 'Silver']
    is_imported_options = [True, False]

    # Generating DataFrame.
    # NOTE: the columns are drawn in the order written below; reordering them
    # changes which random numbers each column receives for a given seed.
    df = pd.DataFrame({
        # Sequential identifiers 1..num_rows.
        'Car ID': np.arange(1, num_rows + 1),
        'Make': np.random.choice(makes, num_rows),
        'Model': np.random.choice(models, num_rows),
        # randint's upper bound is exclusive: years span 1990..2022.
        'Year': np.random.randint(1990, 2023, size=num_rows),
        # Uniform in [1.0, 5.0), rounded to two decimals.
        'Engine Size (L)': np.round(np.random.uniform(1.0, 5.0, size=num_rows), 2),
        'Fuel Type': np.random.choice(fuel_types, num_rows),
        # Exclusive upper bounds again: 0..299999 km and 5000..79999.
        'Mileage (km)': np.random.randint(0, 300000, size=num_rows),
        'Price': np.random.randint(5000, 80000, size=num_rows),
        'Color': np.random.choice(colors, num_rows),
        'Is Imported': np.random.choice(is_imported_options, num_rows)
    })

    return df


In [2]:
# Build the synthetic car dataset using the function's defaults (1000 rows).
df = generate_car_data()

In [3]:
# Preview the first rows to sanity-check the generated columns.
df.head()

Unnamed: 0,Car ID,Make,Model,Year,Engine Size (L),Fuel Type,Mileage (km),Price,Color,Is Imported
0,1,Tesla,Model D,2017,1.93,Hybrid,109863,62986,Silver,False
1,2,Chevrolet,Model D,1996,2.12,Diesel,41777,43762,Blue,True
2,3,Ford,Model D,2021,4.21,Electric,46250,28600,Red,True
3,4,Chevrolet,Model D,2000,4.72,Hybrid,180418,24810,White,True
4,5,Chevrolet,Model E,1999,2.62,Hybrid,215834,62204,Black,False


## Define a Function to Calculate EDA Metrics for Each Column

In [4]:

def eda_summary(df):
    """Summarise every column of ``df`` with basic EDA metrics.

    Returns a new DataFrame with one row per column of ``df`` and these fields:
    'column_name', 'number_of_distinct_values', 'count_of_null',
    'count_of_not_null', 'percentage_of_null', 'percentage_of_not_null'
    (both percentages on a 0-100 scale) and 'top_5_values' (a list of up to
    five values taken from the head of ``value_counts()``).
    """
    distinct_counts = []
    null_counts = []
    not_null_counts = []
    null_percentages = []
    not_null_percentages = []
    most_common = []

    # Walk the columns once, collecting each metric into its own list.
    for name in df.columns:
        series = df[name]
        missing = series.isnull()
        present = series.notnull()

        distinct_counts.append(series.nunique())
        null_counts.append(missing.sum())
        not_null_counts.append(present.sum())
        # The mean of a boolean mask is the fraction of True entries.
        null_percentages.append(missing.mean() * 100)
        not_null_percentages.append(present.mean() * 100)

        frequencies = series.value_counts()
        most_common.append(frequencies.head(5).index.tolist())

    summary = pd.DataFrame({
        'column_name': df.columns,
        'number_of_distinct_values': distinct_counts,
        'count_of_null': null_counts,
        'count_of_not_null': not_null_counts,
        'percentage_of_null': null_percentages,
        'percentage_of_not_null': not_null_percentages,
        'top_5_values': most_common
    })
    return summary

# Compute the per-column summary for the car data and display it.
# The summary has one row per input column; head(20) caps the display at 20 rows.
eda_results = eda_summary(df)
eda_results.head(20)

Unnamed: 0,column_name,number_of_distinct_values,count_of_null,count_of_not_null,percentage_of_null,percentage_of_not_null,top_5_values
0,Car ID,1000,0,1000,0.0,100.0,"[1, 672, 659, 660, 661]"
1,Make,5,0,1000,0.0,100.0,"[Toyota, Tesla, Chevrolet, Ford, Honda]"
2,Model,5,0,1000,0.0,100.0,"[Model B, Model E, Model A, Model D, Model C]"
3,Year,33,0,1000,0.0,100.0,"[2010, 1993, 2017, 1997, 1991]"
4,Engine Size (L),373,0,1000,0.0,100.0,"[1.27, 1.97, 2.73, 4.92, 4.22]"
5,Fuel Type,4,0,1000,0.0,100.0,"[Electric, Hybrid, Diesel, Petrol]"
6,Mileage (km),1000,0,1000,0.0,100.0,"[109863, 291230, 297124, 22504, 54970]"
7,Price,994,0,1000,0.0,100.0,"[38332, 63560, 31183, 74351, 33036]"
8,Color,5,0,1000,0.0,100.0,"[Black, Red, White, Silver, Blue]"
9,Is Imported,2,0,1000,0.0,100.0,"[False, True]"
