# In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport  # For data profiling

# Step 2: Load the Dataset
# Point `dataset_path` at the CSV file to audit (file name or full path).
dataset_path = 'your_dataset.csv'  # Update with your filename
df = pd.read_csv(dataset_path)

# Step 3: Initial Data Overview
# Print the dimensions, a short preview, the per-column dtype/non-null
# summary, and descriptive statistics for every column.
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

preview_rows = df.head()
print("\nFirst 5 rows of data:\n", preview_rows)

print("\nDataset Info:")
df.info()  # info() writes its summary directly to stdout

summary_stats = df.describe(include='all')
print("\nDescriptive Statistics:\n", summary_stats)

# Step 4: Missing Value Analysis
# Build the null mask once; summing it gives per-column counts and
# averaging it gives the per-column fraction of missing cells.
null_mask = df.isnull()
missing_counts = null_mask.sum()
missing_percent = null_mask.mean() * 100
print("\nMissing Values (Count):\n", missing_counts)
print("\nMissing Values (Percentage):\n", missing_percent)

# Step 5: Duplicate Records
# duplicated() flags every repeat of an earlier, fully identical row
# (the first occurrence is not flagged).
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
print("\nNumber of duplicate rows:", len(duplicate_rows))
if not duplicate_rows.empty:
    print("\nDuplicate rows:\n", duplicate_rows)
    # Optional: Drop duplicates (keeps the first occurrence of each row)
    df = df.drop_duplicates()

# Step 6: Inconsistency Checks (example on a categorical column)
# List the distinct values of every object-dtype column so that spelling,
# casing, and whitespace variants of the same category are visible.
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    print(f"\nUnique values in column '{col}':\n", df[col].unique())

# Optional: Standardize text (lowercase)
# Only genuine strings are lower-cased and stripped. The `.str` accessor is
# deliberately avoided here: it replaces every non-string value in a mixed
# column with NaN, and it raises AttributeError on an object column that
# holds no strings at all (for example True/False values with gaps).
# Non-string values, including missing ones, are passed through unchanged.
# NOTE: this step does not re-check for duplicates, so rows that become
# identical only after normalization are not removed here.
for col in text_columns:
    df[col] = df[col].map(
        lambda value: value.lower().strip() if isinstance(value, str) else value
    )

# Step 7: Outlier Detection (using Z-score)
# A row is counted as an outlier when at least one of its numeric values
# lies more than 3 standard deviations from that column's mean.
from scipy import stats

numeric_df = df.select_dtypes(include=[np.number])
if numeric_df.empty:
    # No numeric columns (or no rows at all): nothing can be an outlier.
    z_scores = np.empty(numeric_df.shape)
    outliers = np.zeros(len(numeric_df), dtype=bool)
else:
    # nan_policy='omit' computes each column's mean/std from its non-missing
    # values. With the default ('propagate') a single missing value turns the
    # whole column's z-scores into NaN, so its outliers are silently missed.
    z_scores = np.abs(stats.zscore(numeric_df, nan_policy='omit'))
    # NaN z-scores (missing cells, zero-variance columns) compare as False,
    # so such cells never mark a row as an outlier.
    outliers = (z_scores > 3).any(axis=1)
print("\nOutliers detected (Z-score > 3):", outliers.sum())

# Step 8: Inconsistent Data Example
# Example: Check if 'start_date' > 'end_date' (if applicable)
# df['start_date'] = pd.to_datetime(df['start_date'])
# df['end_date'] = pd.to_datetime(df['end_date'])
# inconsistent_dates = df[df['start_date'] > df['end_date']]
# print("\nInconsistent Date Rows:\n", inconsistent_dates)

# Step 9: Generate Data Quality Report
# Build the profiling report from the current state of `df` and write it
# out as an HTML file.
report_options = {"title": "Data Quality Report", "explorative": True}
profile = ProfileReport(df, **report_options)

report_file = "data_quality_report.html"
profile.to_file(report_file)
print("\nData quality report generated and saved as 'data_quality_report.html'")