## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

In [7]:
import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Generate synthetic dataset
def create_dataset(n=120):
    try:
        np.random.seed(42)
        ages = np.random.normal(40, 12, n).tolist() + [150, -20]
        salaries = np.random.normal(70000, 20000, n).tolist() + [8000000, -20000]
        df = pd.DataFrame({'employee_age': ages, 'employee_salary': salaries})
        logging.info(f"Generated {len(df)} records with injected anomalies.")
        return df
    except Exception as e:
        logging.error(f"Error in data generation: {e}")
        return pd.DataFrame()

# Step 2: Apply Isolation Forest for anomaly detection
def run_isolation_forest(df):
    try:
        model = IsolationForest(n_estimators=100, contamination=0.05, random_state=0)
        df['anomaly_score'] = model.decision_function(df[['employee_age', 'employee_salary']])
        df['is_anomaly'] = model.predict(df[['employee_age', 'employee_salary']])
        df['is_anomaly'] = df['is_anomaly'].map({1: 0, -1: 1})
        logging.info("Isolation Forest applied. Anomalies flagged.")
        return df
    except Exception as e:
        logging.error(f"Error during anomaly detection: {e}")
        return df

# Step 3: Visualize results
def plot_anomalies(df):
    try:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            x='employee_age',
            y='employee_salary',
            hue='is_anomaly',
            palette={0: 'green', 1: 'red'}
        )
        plt.yscale("log")  # log scale to highlight large salary anomalies
        plt.title("Detected Anomalies using Isolation Forest")
        plt.xlabel("Age")
        plt.ylabel("Salary (Log Scale)")
        plt.legend(title="Anomaly")
        plt.grid(True)
        plt.tight_layout()
        plt.show()
        logging.info("Anomaly plot displayed.")
    except Exception as e:
        logging.error(f"Plotting failed: {e}")

# Step 4: Main pipeline
def main():
    df = create_dataset()
    df = run_isolation_forest(df)
    plot_anomalies(df)
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"anomaly_output_{timestamp}.csv"
    df.to_csv(output_file, index=False)
    logging.info(f"Results saved to {output_file}")

# Run
if __name__ == "__main__":
    main()


INFO: Generated 122 records with injected anomalies.
ERROR: Error during anomaly detection: This IsolationForest instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
ERROR: Plotting failed: Could not interpret value `employee_age` for `x`. Value is a string, but `data` was not passed.
INFO: Results saved to anomaly_output_20250513_174225.csv


<Figure size 1000x600 with 0 Axes>

**Task 2**: Using clustering algorithms to detect duplicate records where entries are not
exactly identical.

In [8]:
# write your code from here

**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [9]:
# write your code from here
