In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Generate a synthetic dataset with deliberately injected data quality issues:
# missing values, outliers, and a numeric column stored as strings.

# Seed BOTH random number generators. NumPy drives the numeric and categorical
# draws, while the stdlib `random` module drives the dates and the selection of
# missing/outlier rows. Seeding only NumPy would leave the dates and the
# corrupted rows different on every run.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Sample sizes
num_records = 1000

# Generate random dates for the Date column. random.randint includes both
# endpoints, so start_date and end_date themselves can be drawn.
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 1)
span_days = (end_date - start_date).days
date_range = [start_date + timedelta(days=random.randint(0, span_days)) for _ in range(num_records)]

# Generate random numerical and categorical data
numerical_data = np.random.normal(loc=100, scale=20, size=num_records)
categorical_data = np.random.choice(['A', 'B', 'C', 'D'], size=num_records)

# Introduce missing values: 10% of the rows, chosen without replacement
missing_indices = random.sample(range(num_records), k=int(num_records * 0.1))
numerical_data[missing_indices] = np.nan

# Introduce outliers: 5% of the rows, scaled by a factor of 10.
# Only rows that still hold a value are eligible: NaN * 10 is still NaN, so an
# outlier drawn on a missing row would silently vanish and the dataset would
# end up with fewer outliers than intended.
missing_lookup = set(missing_indices)
outlier_candidates = [i for i in range(num_records) if i not in missing_lookup]
outlier_indices = random.sample(outlier_candidates, k=int(num_records * 0.05))
numerical_data[outlier_indices] = numerical_data[outlier_indices] * 10

# Create DataFrame
df = pd.DataFrame({
    'Date': date_range,
    'Numeric_Column': numerical_data,
    'Category_Column': categorical_data
})

# Introduce inconsistent data types: store the numeric column as text
# (it should be numeric), so downstream cleaning has something to repair.
df['Numeric_Column'] = df['Numeric_Column'].astype(str)

# Save the synthetic dataset to CSV (written relative to the working directory)
df.to_csv('realtime_dataset.csv', index=False)

# Display the first few rows of the dataset
print("Sample of the Real-Time Dataset with Data Quality Issues:")
print(df.head())

Sample of the Real-Time Dataset with Data Quality Issues:
        Date      Numeric_Column Category_Column
0 2021-02-26   135.2810469193533               A
1 2022-01-30  108.00314416734446               C
2 2022-10-13                 nan               D
3 2022-09-08  144.81786398402915               D
4 2020-06-09  137.35115980299935               C
