## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

In [4]:
# write your code from here
import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Step 1: Generate synthetic data
def generate_data(n=100):
    try:
        np.random.seed(42)
        data = {
            'age': np.random.normal(35, 10, n).tolist() + [120, -10],
            'salary': np.random.normal(60000, 15000, n).tolist() + [5000000, -10000]
        }
        df = pd.DataFrame(data)
        logging.info("Synthetic data generated with anomalies.")
        return df
    except Exception as e:
        logging.error(f"Failed to generate data: {e}")
        return pd.DataFrame()

# Step 2: Detect anomalies using Isolation Forest
def detect_anomalies(df):
    try:
        iso_forest = IsolationForest(contamination=0.05, random_state=42)
        df['anomaly'] = iso_forest.fit_predict(df[['age', 'salary']])
        df['anomaly'] = df['anomaly'].map({1: 0, -1: 1})  # 1 if anomaly
        logging.info("Anomaly detection complete.")
        return df
    except Exception as e:
        logging.error(f"Anomaly detection failed: {e}")
        return df

# Step 3: Visualize the anomalies
def visualize_anomalies(df):
    try:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='age', y='salary', hue='anomaly', palette={0: 'blue', 1: 'red'})
        plt.title("Anomaly Detection with Isolation Forest")
        plt.xlabel("Age")
        plt.ylabel("Salary")
        plt.legend(title="Anomaly")
        plt.tight_layout()
        plt.show()
        logging.info("Anomalies visualized.")
    except Exception as e:
        logging.error(f"Visualization failed: {e}")

# Main pipeline
def main():
    df = generate_data()
    df = detect_anomalies(df)
    visualize_anomalies(df)

    # Save the results to CSV for review (optional)
    df.to_csv("anomaly_detection_output.csv", index=False)
    logging.info("Output saved to anomaly_detection_output.csv")

# Execute
if __name__ == "__main__":
    main()


INFO: Synthetic data generated with anomalies.
INFO: Anomaly detection complete.
ERROR: Visualization failed: Could not interpret value `age` for `x`. Value is a string, but `data` was not passed.
INFO: Output saved to anomaly_detection_output.csv


<Figure size 1000x600 with 0 Axes>

**Task 2**: Using clustering algorithms to detect duplicate records where entries are not
exactly identical.

In [5]:
# write your code from here

**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [6]:
# write your code from here
