# In [None]:
import pandas as pd
import re
from pandas_profiling import ProfileReport
from fuzzywuzzy import fuzz, process

# --- Step 1: load the raw table -------------------------------------------
df = pd.read_csv('your_data.csv')

# --- Step 2: profile the raw data -----------------------------------------
# The report is built from the frame as loaded, before any cleansing below.
profile = ProfileReport(df, title="Data Profile Report")
profile.to_file("data_profile_report.html")

# --- Step 3: cleansing -----------------------------------------------------

# 3a. Missing values: impute the numeric column with its own mean ...
df['numeric_column'] = df['numeric_column'].fillna(
    value=df['numeric_column'].mean()
)

# ... discard rows that lack the categorical value, then
# 3b. collapse rows that are exact duplicates of an earlier row.
df = (
    df.dropna(subset=['categorical_column'])
    .drop_duplicates()
)

# 3c. Data types: parse the date column; values that cannot be parsed
# become NaT instead of raising (errors='coerce').
df = df.assign(date_column=pd.to_datetime(df['date_column'], errors='coerce'))

# --- Step 4: validation ----------------------------------------------------

# Keep only rows whose age lies in the closed interval [0, 120]. A missing
# age fails both comparisons, so those rows are removed as well.
df = df.loc[(0 <= df['age']) & (df['age'] <= 120)]

# Validate email format
# Compiled once at import time so that applying the check to every row does
# not repeat the pattern lookup.
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')


def is_valid_email(email):
    """Return True when *email* looks like ``local@domain.tld``.

    This is a light syntactic check, not full RFC validation: a local part
    and a domain made of word characters, dots and hyphens, followed by a
    dot and a word-character suffix.

    Args:
        email: The value to test. It is converted with ``str()`` first, so
            non-string inputs such as ``None`` or NaN are accepted and
            simply fail the check.

    Returns:
        bool: True if the whole string matches the pattern, else False.
    """
    # fullmatch() rather than match() with a trailing '$': '$' also matches
    # just before a final newline, which would accept "user@example.com\n".
    return _EMAIL_PATTERN.fullmatch(str(email)) is not None

# Add the validity flag and apply the section-5 transformations in a single
# assign() call. Every expression below is evaluated against the frame as it
# stands before the call, and the columns touched are independent.
df = df.assign(
    # Per-row result of is_valid_email; rows are flagged, not removed.
    email_valid=df['email'].apply(is_valid_email),

    # 5. Data Transformation

    # Text normalization: lower-case every name.
    name=df['name'].str.lower(),

    # Date standardization: render the date as an ISO-style YYYY-MM-DD string.
    standardized_date=pd.to_datetime(
        df['date_column'], errors='coerce'
    ).dt.strftime('%Y-%m-%d'),
)

# 6. Duplicate Detection (Fuzzy Match)
def find_duplicates(name, choices, threshold=90):
    """Return the fuzzy matches for *name* among *choices* at or above *threshold*.

    Args:
        name: Query string. A non-string value (for example ``None`` or the
            NaN left behind by a missing name) yields an empty list.
        choices: Candidates to compare against: either an iterable of values
            or a mapping-like object exposing ``.items()``. Candidates that
            are not strings are ignored.
        threshold: Minimum ``fuzz.token_sort_ratio`` score a candidate needs
            in order to be reported.

    Returns:
        list: The entries returned by ``process.extract`` whose score
        (element 1 of each entry) is ``>= threshold``. Empty when there is
        nothing to compare.
    """
    # The scorer works on text; there is nothing meaningful to match for a
    # missing or non-string query.
    if not isinstance(name, str):
        return []

    # Drop non-string candidates for the same reason, preserving the
    # mapping form when the caller supplied one.
    if hasattr(choices, 'items'):
        candidates = {
            key: value for key, value in choices.items() if isinstance(value, str)
        }
    else:
        candidates = [choice for choice in choices if isinstance(choice, str)]
    if not candidates:
        return []

    # limit=None: process.extract otherwise returns only its five best
    # results, silently discarding further candidates that meet the threshold.
    matches = process.extract(
        name, candidates, scorer=fuzz.token_sort_ratio, limit=None
    )
    return [match for match in matches if match[1] >= threshold]

# Build the list of names once instead of once per row, and compare each
# name against every OTHER row's name: leaving the row's own entry in the
# candidate list would make every row report itself as a perfect match.
all_names = df['name'].tolist()
duplicate_hits = []
for position, current_name in enumerate(all_names):
    other_names = all_names[:position] + all_names[position + 1:]
    # With a single row there is nothing to compare against.
    duplicate_hits.append(
        find_duplicates(current_name, other_names) if other_names else []
    )

# Wrap in an object Series aligned to df's index so each cell holds that
# row's list of matches.
df['potential_duplicates'] = pd.Series(duplicate_hits, index=df.index, dtype=object)

# 7. Data Quality Monitoring

# Percentage of missing values per column; only columns above 5% are printed.
missing_data = df.isnull().mean() * 100
print("Columns with more than 5% missing values:")
print(missing_data[missing_data > 5])

# 8. Export Cleaned Data
# index=False keeps the row index out of the output file.
df.to_csv('cleaned_data.csv', index=False)