In [2]:
!pip install faker


Collecting faker
  Obtaining dependency information for faker from https://files.pythonhosted.org/packages/78/5e/c8c3c5ea0896ab747db2e2889bf5a6f618ed291606de6513df56ad8670a8/faker-37.4.0-py3-none-any.whl.metadata
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.9 MB 1.1 MB/s eta 0:00:02
   ----- ---------------------------------- 0.3/1.9 MB 2.9 MB/s eta 0:00:01
   ------------ --------------------------- 0.6/1.9 MB 4.3 MB/s eta 0:00:01
   ------------------- -------------------- 1.0/1.9 MB 5.0 MB/s eta 0:00:01
   ------------------------ --------------- 1.2/1.9 MB 5.0 MB/s eta 0:00:01
   ------------------------------ --------- 1.5/1.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------- ----- 1.7/1.9 MB 5.4 MB/s eta 0:00:01
   --


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pandas as pd
import numpy as np
import random
from faker import Faker

fake = Faker()
# Okay, setting up the number of base records. 1000 should give us plenty to mess with.
num_records = 1000
n = 1000  # Number of base rows
# I've expanded these a bit to give more variety – always good for 'realistic' data.
models = ['Model A', 'Model B', 'Model C', 'Model X', 'Model Z'] 
surfaces = ['Asphalt', 'Gravel', 'Concrete', 'Wet Asphalt', 'Snow'] 
points = ['Steering Wheel', 'Floor', 'Seat', 'Roof', 'Engine Bay', 'Trunk'] 
components = ['Engine', 'Cabin', 'Tires', 'Suspension', 'Brakes', 'Exhaust'] 
feedback_comments = [
    "Too much cabin noise at high speed",
    "Smooth ride overall",
    "Uncomfortable over bumps",
    "Noticeable vibrations at idle",
    "Excellent NVH",
    "Squealing noise when braking",
    "Rattling sound from dashboard",
    "Vibration felt through the pedals",
    "Rough idle, very shaky",
    "Quiet and refined cabin"
]

# Time to generate the core 'clean' dataset.
data = {
    "Vehicle_Model": np.random.choice(models, n),
    "Manufacture_Date": pd.to_datetime(np.random.choice(pd.date_range("2019-01-01", "2024-01-01"), n)),
    "Speed_kmph": np.random.randint(0, 140, n),
    "Operating_Temperature": np.random.uniform(-10, 50, n).round(1),
    "Road_Surface": np.random.choice(surfaces, n),
    "Timestamp": pd.to_datetime(np.random.choice(pd.date_range("2023-01-01", "2023-12-31", freq='H'), n)),
    "Noise_dB": np.random.normal(75, 10, n).round(1),
    "Component_Source": np.random.choice(components, n),
    "Vibration_magnitude": np.abs(np.random.normal(2.5, 1.2, n)).round(2),
    "Measurement_Point": np.random.choice(points, n),
    "Customer_Feedback_Score": np.random.randint(1, 6, n),
    "Feedback_Comment": np.random.choice(feedback_comments, n),
}

df = pd.DataFrame(data)

# --- Introduce Duplicates ---
# Grabbing a few random rows to duplicate. No, not just copy-pasting
duplicate_rows_to_add = 50
random_indices = np.random.choice(df.index, duplicate_rows_to_add, replace=False)
df_duplicates = df.loc[random_indices].copy()
df = pd.concat([df, df_duplicates], ignore_index=True)
print(f"Added {duplicate_rows_to_add} exact duplicate rows.")

# --- Introducing Missing Values (NaN) ---
# Set some values to NaN randomly
num_nan_entries = int(0.05 * len(df) * len(df.columns)) # 5% of all cells will be NaN
all_indices = [(r, c) for r in range(len(df)) for c in range(len(df.columns))]
random_nan_selection = random.sample(all_indices, num_nan_entries)

for r, c_idx in random_nan_selection:
    col_name = df.columns[c_idx]
    # Exclude Timestamp, Vehicle_Model, Manufacture_Date, and other critical identifiers
    
    # For demonstration, let's allow NaNs in numerical/categorical columns.
    if col_name not in ['Timestamp', 'Manufacture_Date', 'Vehicle_Model']:
        df.at[r, col_name] = np.nan
print(f"Introduced NaNs in approximately {num_nan_entries} cells.")

# --- Introducing Inconsistencies/Errors ---

# 1. Incorrect Data Types / Out-of-Range values for numerical columns
num_errors = 30 # Number of errors to inject
error_indices = np.random.choice(df.index, num_errors, replace=False)

for i in error_indices:
    # Speed_kmph: Inject a string or very high value
    if random.random() < 0.5: # 50% chance of string, 50% chance of out-of-range number
        df.at[i, 'Speed_kmph'] = random.choice(['N/A', 'ERROR', 'Data corrupted'])
    else:
        df.at[i, 'Speed_kmph'] = np.random.randint(200, 500) # Unrealistic speed

    # Noise_dB: Inject a negative value or a string
    if random.random() < 0.5:
        df.at[i, 'Noise_dB'] = random.choice([-5.0, 'Invalid DB', 'NaN_val'])
    else:
        df.at[i, 'Noise_dB'] = np.random.uniform(150, 200) # Extremely high noise

    # Vibration_magnitude: Inject a string or extremely high value
    if random.random() < 0.5:
        df.at[i, 'Vibration_magnitude'] = random.choice(['Vib_Error', 'NULL_VIB'])
    else:
        df.at[i, 'Vibration_magnitude'] = np.random.uniform(10.0, 50.0) # Unrealistic vibration

    # Customer_Feedback_Score: Inject a float or out-of-range integer
    if random.random() < 0.5:
        df.at[i, 'Customer_Feedback_Score'] = random.uniform(1.0, 6.0) # Float where int is expected
    else:
        df.at[i, 'Customer_Feedback_Score'] = np.random.randint(7, 10) # Score out of 1-5 range

# 2. Inconsistent Text Formatting / Typos for categorical columns
num_typos = 20 # Number of typos to inject
typo_indices = np.random.choice(df.index, num_typos, replace=False)

for i in typo_indices:
    # Road_Surface: Introduce variations in capitalization or misspellings
    original_surface = df.at[i, 'Road_Surface']
    if original_surface == 'Asphalt':
        df.at[i, 'Road_Surface'] = random.choice(['asphalt', 'AsPhalt', 'Ashpalt'])
    elif original_surface == 'Gravel':
        df.at[i, 'Road_Surface'] = random.choice(['gravel', 'GRAVEL', 'Gravle'])
    # Add more conditions for other surfaces if desired

    # Component_Source: Introduce variations or misspellings
    original_component = df.at[i, 'Component_Source']
    if original_component == 'Engine':
        df.at[i, 'Component_Source'] = random.choice(['engine ', 'ENGINe', 'Engin'])
    elif original_component == 'Tires':
        df.at[i, 'Component_Source'] = random.choice(['Tyres', 'TIRES', 'tire']) # Common misspelling/variation
    # Add more conditions for other components

    # Vehicle_Model: Introduce some slightly altered model names
    original_model = df.at[i, 'Vehicle_Model']
    if original_model == 'Model A':
        df.at[i, 'Vehicle_Model'] = random.choice(['model A', 'Model A ', 'Modle A'])
    elif original_model == 'Model B':
        df.at[i, 'Vehicle_Model'] = random.choice(['model B', 'Model_B', 'Mod. B'])

print(f"Introduced various errors and inconsistencies in {num_errors + num_typos} cells.")


# Shuffle the DataFrame to mix duplicates and NaNs
df = df.sample(frac=1).reset_index(drop=True)

# Display a sample to quickly check some issues
print("\nSample of the generated data with issues:")
print(df.head(10))
print("\nData Info (check for non-null counts):")
print(df.info())

Added 50 exact duplicate rows.
Introduced NaNs in approximately 630 cells.
Introduced various errors and inconsistencies in 50 cells.

Sample of the generated data with issues:
  Vehicle_Model Manufacture_Date Speed_kmph  Operating_Temperature  \
0       Model Z       2022-02-24        2.0                   -1.2   
1       Model A       2019-06-15        NaN                   41.7   
2       Model X       2019-04-13       97.0                   11.9   
3       Model X       2019-04-28        NaN                    4.4   
4       Model A       2020-03-05       35.0                   30.3   
5       Model Z       2023-12-21       41.0                   -6.8   
6       Model X       2021-05-31       99.0                   27.4   
7       Model A       2019-06-08       81.0                   13.2   
8       Model Z       2020-09-18       18.0                   -5.0   
9       Model A       2023-08-30       60.0                   42.1   

  Road_Surface           Timestamp Noise_dB Componen

  "Timestamp": pd.to_datetime(np.random.choice(pd.date_range("2023-01-01", "2023-12-31", freq='H'), n)),
  df.at[i, 'Vibration_magnitude'] = random.choice(['Vib_Error', 'NULL_VIB'])
  df.at[i, 'Noise_dB'] = random.choice([-5.0, 'Invalid DB', 'NaN_val'])
  df.at[i, 'Speed_kmph'] = random.choice(['N/A', 'ERROR', 'Data corrupted'])


In [5]:
# Save the noisy dataset
#df.to_csv("NVH_Data_Synthetic.csv", index=False)
print("\nGenerated 'NVH_Data_Synthetic.csv' with duplicates, missing values, and inconsistencies.")


Generated 'NVH_Data_Synthetic.csv' with duplicates, missing values, and inconsistencies.


In [6]:
display(df)

Unnamed: 0,Vehicle_Model,Manufacture_Date,Speed_kmph,Operating_Temperature,Road_Surface,Timestamp,Noise_dB,Component_Source,Vibration_magnitude,Measurement_Point,Customer_Feedback_Score,Feedback_Comment
0,Model C,2019-02-26,110.0,22.2,Asphalt,2023-11-19 02:00:00,84.9,,2.83,Seat,1.0,
1,Model Z,2019-09-02,50.0,37.5,Wet Asphalt,2023-05-03 11:00:00,60.1,Tires,2.93,Engine Bay,5.0,Quiet and refined cabin
2,Model A,2023-04-28,78.0,16.2,Asphalt,2023-05-20 17:00:00,62.8,Tires,2.8,Roof,4.0,Excellent NVH
3,Model C,2022-10-04,65.0,15.6,Snow,2023-02-01 03:00:00,83.3,Tires,1.28,Steering Wheel,2.0,Too much cabin noise at high speed
4,Model A,2023-07-20,31.0,36.5,Wet Asphalt,2023-09-03 15:00:00,67.9,Exhaust,1.85,Engine Bay,2.0,Excellent NVH
...,...,...,...,...,...,...,...,...,...,...,...,...
1045,Model A,2022-04-13,60.0,34.1,Asphalt,2023-07-12 03:00:00,59.0,Exhaust,1.67,Steering Wheel,1.0,Vibration felt through the pedals
1046,Model X,2023-09-16,70.0,28.5,Snow,2023-08-23 19:00:00,60.4,Brakes,2.04,Steering Wheel,4.0,Smooth ride overall
1047,Model X,2023-04-23,104.0,-6.2,,2023-09-12 17:00:00,70.0,Suspension,5.14,,2.0,Uncomfortable over bumps
1048,Model C,2020-05-09,4.0,23.8,Concrete,2023-06-30 18:00:00,86.9,Suspension,3.8,Steering Wheel,3.0,Vibration felt through the pedals


In [7]:
print(df.shape)  # Rows, columns
print(df.columns)  # Column names
print(df.dtypes)  # Data types


(1050, 12)
Index(['Vehicle_Model', 'Manufacture_Date', 'Speed_kmph',
       'Operating_Temperature', 'Road_Surface', 'Timestamp', 'Noise_dB',
       'Component_Source', 'Vibration_magnitude', 'Measurement_Point',
       'Customer_Feedback_Score', 'Feedback_Comment'],
      dtype='object')
Vehicle_Model                      object
Manufacture_Date           datetime64[ns]
Speed_kmph                         object
Operating_Temperature             float64
Road_Surface                       object
Timestamp                  datetime64[ns]
Noise_dB                           object
Component_Source                   object
Vibration_magnitude                object
Measurement_Point                  object
Customer_Feedback_Score           float64
Feedback_Comment                   object
dtype: object


In [8]:
#Checking for missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Vehicle_Model               0
Manufacture_Date            0
Speed_kmph                 51
Operating_Temperature      54
Road_Surface               57
Timestamp                   0
Noise_dB                   45
Component_Source           41
Vibration_magnitude        51
Measurement_Point          56
Customer_Feedback_Score    50
Feedback_Comment           59
dtype: int64
