In [4]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Iris Dataset
# - Load the dataset using Pandas.
# - Check for missing values in the entire dataset.


# Import necessary libraries
import pandas as pd
import numpy as np

# Load the Titanic dataset.
# Prefer seaborn's bundled example dataset; if seaborn cannot be imported or
# the dataset cannot be loaded, fall back to a local 'titanic.csv'.
try:
    import seaborn as sns
    df = sns.load_dataset('titanic')
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell as expected.
    df = pd.read_csv('titanic.csv')  # fallback

# Show the first five rows as a quick sanity check of the load.
print("Preview of the dataset:")
first_rows = df.head()
print(first_rows)

# -------------------------------
# Part 1: Missing Value Analysis
# -------------------------------

print("\n--- Missing Values ---")
# Per-column count of null cells; only columns that actually have gaps are shown.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])

# -------------------------------
# Part 2: Duplicates Detection
# -------------------------------

print("\n--- Duplicate Rows ---")
# duplicated() flags every repeat of an earlier row (the first occurrence is kept).
duplicates = df.loc[df.duplicated()]
print(f"Total Duplicates: {len(duplicates.index)}")
if not duplicates.empty:
    print(duplicates)

# -------------------------------
# Part 3: Basic Statistics
# -------------------------------

print("\n--- Basic Statistics ---")
# include="all" summarises non-numeric columns alongside the numeric ones.
basic_stats = df.describe(include="all")
print(basic_stats)

# -------------------------------
# Part 4: Data Types & Inconsistencies
# -------------------------------

print("\n--- Data Types ---")
print(df.dtypes)

# Detect inconsistent categorical values (example: "male", "Male", etc.).
# Scan both plain 'object' columns and columns stored with the pandas
# 'category' dtype; selecting only 'object' would silently skip the latter.
print("\n--- Unique Values in Categorical Columns ---")
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"{col}: {df[col].unique()}")

# -------------------------------
# Part 5: Final Data Quality Summary
# -------------------------------

print("\n--- Data Quality Summary ---")
# One row per column of df; the dataset-wide duplicate-row count is repeated
# on every row so that it lines up with the per-column metrics.
data_quality_report = pd.DataFrame(
    {
        "Data Type": df.dtypes,
        "Missing Values": df.isnull().sum(),
        "Unique Values": df.nunique(),
        "Duplicate Rows": [df.duplicated().sum() for _ in df.columns],
    }
)
print(data_quality_report)

# Save report if needed
# data_quality_report.to_csv("data_quality_report.csv")






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Inconsistent Entries in a Sample Dataset
# - Assume you have a dataset with a 'Gender' column. Identify inconsistent entries like 'M', 'Male', or 'male'.









# Part 3: Generate a Data Quality Report

# Task 3: Iris Dataset Summary
# - Generate basic descriptive statistics for the Iris dataset.


import pandas as pd
import numpy as np
import os

# --- Part 0: Load Dataset Robustly ---
def load_dataset(file_path="src/Module 3/Hands-on - Data Quality Assessment & Profiling/titanic.csv"):
    """Load the Titanic dataset from a CSV file, falling back to seaborn.

    Args:
        file_path: Location of the CSV to read. Defaults to the path used by
            the original exercise, so existing zero-argument calls behave
            as before.

    Returns:
        A pandas DataFrame read from ``file_path`` when that path exists,
        otherwise seaborn's ``"titanic"`` example dataset.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist and the seaborn
            fallback fails (seaborn missing, or its loader raising). The
            underlying error is attached as ``__cause__``.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    try:
        import seaborn as sns
        return sns.load_dataset("titanic")
    except Exception as exc:
        # Catch Exception (not a bare `except:`) so Ctrl-C / SystemExit are
        # not converted into a misleading "not found" error, and chain the
        # original failure so the real reason stays visible in the traceback.
        raise FileNotFoundError("Titanic dataset not found in path or seaborn.") from exc

# --- Part 1: Check Missing Values ---
def check_missing_values(df):
    """Print the columns that contain nulls and return the full null counts.

    The printed view is filtered to columns with at least one missing value;
    the returned Series covers every column, including those with zero.
    """
    print("\n🔍 Missing Values:")
    null_counts = df.isna().sum()
    has_gaps = null_counts > 0
    print(null_counts.loc[has_gaps])
    return null_counts

# --- Part 2: Find Duplicates ---
def find_duplicates(df):
    """Report how many rows repeat an earlier row and return those rows.

    Only the repeats are returned; the first occurrence of each row is not
    considered a duplicate.
    """
    print("\n🔁 Duplicate Rows:")
    repeated = df.loc[df.duplicated()]
    count = repeated.shape[0]
    print(f"Found {count} duplicates.")
    return repeated

# --- Part 3: Generate Basic Stats ---
def generate_basic_statistics(df):
    """Print a section header and return descriptive statistics for all columns.

    The summary itself is returned, not printed; the caller decides how to
    display it.
    """
    print("\n📊 Basic Statistics:")
    summary = df.describe(include="all")
    return summary

# --- Part 4: Data Quality Report ---
def data_quality_report(df):
    """Build, print and return a per-column data quality table.

    The table has one row per column of ``df`` with its dtype, null count and
    number of distinct values. The dataset-wide duplicate-row count is
    repeated on every row under "Duplicates".
    """
    print("\n📝 Data Quality Summary:")
    metrics = {
        "Data Type": df.dtypes,
        "Missing Values": df.isnull().sum(),
        "Unique Values": df.nunique(),
        # Same whole-frame figure for every column row.
        "Duplicates": [df.duplicated().sum() for _ in df.columns],
    }
    report = pd.DataFrame(metrics)
    print(report)
    return report

# --- Run Data Quality Pipeline ---
# Runs each step in order at module level. Every helper prints its own
# section header, so the statement order below determines the output order.
df = load_dataset()
print("📄 Preview of Data:")
print(df.head())

# Each helper prints its findings and also returns them for later use.
missing = check_missing_values(df)
duplicates = find_duplicates(df)
# generate_basic_statistics only prints a header; the table is printed here.
stats = generate_basic_statistics(df)
print(stats)
quality_report = data_quality_report(df)




Preview of the dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

--- Missing Values ---
age            177
embarked         2
deck           688
embark_town      2
dtype: int64

--- Duplicate Rows ---
Total Duplicates: 107
     survived  pclass     sex   age 