# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the CSV into the working DataFrame `df` used by every later step.
# Point the path below at your own file if it lives elsewhere.
df = pd.read_csv("your_dataset.csv")

# Quick overview: column/dtype summary, dimensions, and a peek at the rows.
print("Dataset Info:")
df.info()  # info() prints its own report; there is nothing to pass to print()

print("\nDataset Shape:", df.shape, sep="\n")

print("\nFirst 5 Rows:", df.head(), sep="\n")

# include="all" asks describe() to summarise non-numeric columns as well.
print("\nDescriptive Statistics:")
print(df.describe(include="all"))

# ------------------------------------------
# Step 1: Missing-value audit
# ------------------------------------------
# For each column: the number of null cells, and that number expressed as a
# percentage of the total row count.
print("\nMissing Values:")
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)
missing_data = pd.DataFrame(
    {
        "Missing Values": missing_values,
        "Percentage": missing_percentage,
    }
)
print(missing_data)

# ------------------------------------------
# Step 2: Duplicate rows
# ------------------------------------------
# duplicated() flags each repeat of an earlier row (the first occurrence is
# not flagged), so `duplicate_rows` holds only the redundant copies.
print("\nDuplicate Rows:")
duplicate_rows = df.loc[df.duplicated()]
print(duplicate_rows)

print(f"\nTotal Duplicates: {len(duplicate_rows)}")

# Drop the redundant copies; every later step works on the de-duplicated
# frame. The surviving rows keep their original index labels.
df = df.drop_duplicates()

# ------------------------------------------
# Step 3: Data inconsistencies
# List the distinct values of every object-dtype column so that
# casing variants and misspellings can be spotted by eye.
# ------------------------------------------
print("\nUnique Values in Categorical Columns:")
object_columns = df.select_dtypes(include="object").columns
for column_name in object_columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name} -> {distinct_values}")

# ------------------------------------------
# Step 4: Column data types
# ------------------------------------------
# Heading followed by the dtype of each column.
print("\nData Types:", df.dtypes, sep="\n")

# ------------------------------------------
# Step 5: Basic statistics (numeric columns only)
# ------------------------------------------
# Select the numeric columns once and reuse that selection for each statistic.
numeric_df = df.select_dtypes(include=np.number)

print("\nMean Values:")
print(numeric_df.mean())

print("\nMedian Values:")
print(numeric_df.median())

print("\nStandard Deviation:")
print(numeric_df.std())

# ------------------------------------------
# Step 6: HTML profiling report
# ------------------------------------------
# Requires the third-party `ydata-profiling` package. If it is missing,
# install it first (in a notebook: `!pip install ydata-profiling`).

from ydata_profiling import ProfileReport

# Build the report from the current (de-duplicated) frame and write it to disk.
report_options = {"title": "Data Profiling Report", "explorative": True}
profile = ProfileReport(df, **report_options)
profile.to_file("data_profiling_report.html")

print("\nData profiling report has been generated and saved as 'data_profiling_report.html'.")