In [15]:
import os
import random
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta

# Initialize faker for generating synthetic data
fake = Faker()
# Set random seeds for reproducibility
# torch.manual_seed(42)
# np.random.seed(42)



# Define dataset properties
medication_names = [f"Medication_{i}" for i in range(1, 31)]
atc_codes = [f"ATC_{i}" for i in range(1, 31)]
diagnosis_codes = [f"D_{i}" for i in range(1, 21)]
prescribers = [f"Prescriber_{i}" for i in range(1, 11)]

# Get a date range within the past month
today = datetime.now()
one_month_ago = today - timedelta(days=30)

def generate_pharmacy_data(pharmacy_id, num_records):
    data = []
    for _ in range(num_records):
        transaction_id = fake.uuid4()
        age = random.randint(1, 100)
        num_medications = random.randint(1, 5)
        medications = random.sample(medication_names, num_medications)
        atc_codes_for_medications = random.choices(atc_codes, k=num_medications)
        dosages = [random.randint(1, 500) for _ in range(num_medications)]
        quantities = [random.randint(1, 10) for _ in range(num_medications)]
        costs_per_medication = [round(random.uniform(5, 50), 2) for _ in range(num_medications)]
        diagnosis_code = random.choice(diagnosis_codes)
        prescriber_id = random.choice(prescribers)
        date = (one_month_ago + timedelta(days=random.randint(0, 30))).strftime("%Y-%m-%d")

        for medication, atc_code, dosage, quantity, cost_per_medication in zip(
            medications, atc_codes_for_medications, dosages, quantities, costs_per_medication
        ):
            data.append([
                transaction_id, age, medication, atc_code, dosage, quantity,
                cost_per_medication, diagnosis_code, prescriber_id, date
            ])
    return data

# Define dataset structure
fields = [
    "Transaction_ID", "Age", "Medication_Name", "ATC_Code", "Dosage", 
    "Quantity", "Cost_Per_Medication", "Diagnosis_Code", "Prescriber_ID", "Date"
]

# Generate datasets
def generate_datasets():
    for city in range(1, 4):  # 3 cities
        for zone in range(1, 4):  # 3 zones per city
            for pharmacy in range(1, 5):  # 4 pharmacies per zone
                dataset_name = f"Ph{pharmacy:02d}_Z{zone:02d}_C{city:02d}"
                # Generate training dataset
                training_data = generate_pharmacy_data(dataset_name, 500)
                train_df = pd.DataFrame(training_data, columns=fields)
                train_df.to_csv(f"{dataset_name}_train.csv", index=False)

                # Generate testing dataset
                testing_data = generate_pharmacy_data(dataset_name, 200)
                test_df = pd.DataFrame(testing_data, columns=fields)
                #test_df.to_csv(f"{dataset_name}_test.csv", index=False)

generate_datasets()
print("Datasets generated with cost per medication and saved in the same directory as the script.")


Datasets generated with cost per medication and saved in the same directory as the script.


In [17]:
def generate_single_dataset(dataset_name, num_records):
    data = generate_pharmacy_data(dataset_name, num_records)
    df = pd.DataFrame(data, columns=fields)
    df.to_csv(f"{dataset_name}.csv", index=False)

# Generate the TEST dataset
generate_single_dataset("TEST", 200)

print("The TEST dataset with 200 transactions has been generated and saved as TEST.csv.")


The TEST dataset with 200 transactions has been generated and saved as TEST.csv.


In [22]:
from sklearn.svm import OneClassSVM
import pandas as pd
import pickle

# Define the features for anomaly detection
selected_features = ["Age", "Dosage", "Quantity"]

def train_svm_model(pharmacy_name, train_file):
    # Load the training dataset
    train_data = pd.read_csv(train_file)
    
    # Prepare training data
    X_train = train_data[selected_features]
    
    # Train One-Class SVM
    model = OneClassSVM(kernel="rbf", gamma="scale", nu=0.05)  # nu determines the proportion of anomalies
    model.fit(X_train)
    
    # Save the pharmacy model
    with open(f"{pharmacy_name}_svm_model.pkl", "wb") as f:
        pickle.dump(model, f)
    
    print(f"One-Class SVM model for {pharmacy_name} trained and saved.")
    return model

def test_svm_model(model, test_data):
    # Extract selected features for testing
    X_test = test_data[selected_features]
    
    # Predict anomalies (-1 indicates anomaly, 1 indicates normal)
    predictions = model.predict(X_test)
    test_data["Anomaly"] = predictions
    anomalies = test_data[test_data["Anomaly"] == -1]
    
    return anomalies

# Example for one pharmacy
pharmacy_name = "Ph01_Z01_C01"
train_file = f"{pharmacy_name}_train.csv"
model = train_svm_model(pharmacy_name, train_file)

# Load the TEST dataset
test_data = pd.read_csv("TEST.csv")
anomalies = test_svm_model(model, test_data)

print(f"Anomalous transactions detected by {pharmacy_name}:")
print(anomalies)


One-Class SVM model for Ph01_Z01_C01 trained and saved.
Anomalous transactions detected by Ph01_Z01_C01:
                           Transaction_ID  Age Medication_Name ATC_Code  \
8    19be4e82-52d6-4bb2-8ec9-18069f873132   95   Medication_22   ATC_14   
10   19be4e82-52d6-4bb2-8ec9-18069f873132   95   Medication_30   ATC_28   
15   56f2c3e6-a158-48db-bad2-542c7427ebeb   81   Medication_11    ATC_7   
22   d3edc117-f6c6-47b6-9c0a-2cee1dec54da    8   Medication_26   ATC_16   
36   885e2c35-da8b-4475-bb7e-cd9837bd9cc9   78   Medication_18    ATC_7   
54   4ecf52c6-aeef-41c8-aae6-cddf0d5de912   28    Medication_5    ATC_1   
94   a819f8c8-5b25-4a79-969f-1f883b375fd0    1   Medication_19    ATC_4   
95   a819f8c8-5b25-4a79-969f-1f883b375fd0    1    Medication_9   ATC_26   
96   a819f8c8-5b25-4a79-969f-1f883b375fd0    1   Medication_11    ATC_3   
108  09a72801-b8ab-43b3-bfcf-3e03235d6aae   23   Medication_25    ATC_6   
112  9f55c215-a16e-4e0a-bfee-474ae9e1d075   90    Medication_2   ATC_2

In [24]:
from sklearn.svm import OneClassSVM
import pandas as pd
import pickle

# Define the features for anomaly detection
selected_features = ["Age", "Dosage", "Quantity"]

def train_svm_model(pharmacy_name, train_file):
    # Load the training dataset
    train_data = pd.read_csv(train_file)
    
    # Prepare training data
    X_train = train_data[selected_features]
    
    # Train One-Class SVM
    model = OneClassSVM(kernel="rbf", gamma="scale", nu=0.05)  # nu determines the proportion of anomalies
    model.fit(X_train)
    
    # Save the pharmacy model
    with open(f"{pharmacy_name}_svm_model.pkl", "wb") as f:
        pickle.dump(model, f)
    
    print(f"One-Class SVM model for {pharmacy_name} trained and saved.")
    return model

def test_svm_model(model, test_data):
    # Extract selected features for testing
    X_test = test_data[selected_features]
    
    # Predict anomalies (-1 indicates anomaly, 1 indicates normal)
    predictions = model.predict(X_test)
    test_data["Anomaly"] = predictions
    anomalies = test_data[test_data["Anomaly"] == -1]
    
    return anomalies

# Example for one pharmacy
pharmacy_name = "Ph01_Z01_C01"
train_file = f"{pharmacy_name}_train.csv"
model = train_svm_model(pharmacy_name, train_file)

# Load the TEST dataset
test_data = pd.read_csv("TEST.csv")
anomalies = test_svm_model(model, test_data)

print(f"Anomalous transactions detected by {pharmacy_name}:")
print(anomalies)


One-Class SVM model for Ph01_Z01_C01 trained and saved.
Anomalous transactions detected by Ph01_Z01_C01:
                           Transaction_ID  Age Medication_Name ATC_Code  \
8    19be4e82-52d6-4bb2-8ec9-18069f873132   95   Medication_22   ATC_14   
10   19be4e82-52d6-4bb2-8ec9-18069f873132   95   Medication_30   ATC_28   
15   56f2c3e6-a158-48db-bad2-542c7427ebeb   81   Medication_11    ATC_7   
22   d3edc117-f6c6-47b6-9c0a-2cee1dec54da    8   Medication_26   ATC_16   
36   885e2c35-da8b-4475-bb7e-cd9837bd9cc9   78   Medication_18    ATC_7   
54   4ecf52c6-aeef-41c8-aae6-cddf0d5de912   28    Medication_5    ATC_1   
94   a819f8c8-5b25-4a79-969f-1f883b375fd0    1   Medication_19    ATC_4   
95   a819f8c8-5b25-4a79-969f-1f883b375fd0    1    Medication_9   ATC_26   
96   a819f8c8-5b25-4a79-969f-1f883b375fd0    1   Medication_11    ATC_3   
108  09a72801-b8ab-43b3-bfcf-3e03235d6aae   23   Medication_25    ATC_6   
112  9f55c215-a16e-4e0a-bfee-474ae9e1d075   90    Medication_2   ATC_2

In [14]:
def aggregate_models(models):
    # Placeholder for aggregation logic
    # For example, averaging decision thresholds or combining trees
    aggregated_model = models[0]  # Simplified: Use one of the models directly
    return aggregated_model

# Example: Aggregate models for a zone
pharmacy_models = [
    pickle.load(open(f"Ph01_Z01_C01_model.pkl", "rb")),
    pickle.load(open(f"Ph02_Z01_C01_model.pkl", "rb"))
]
zone_model = aggregate_models(pharmacy_models)

# Test the aggregated zone model
zone_anomalies = test_model(zone_model, test_data)
print("Anomalous transactions detected by Zone model:")
print(zone_anomalies)


FileNotFoundError: [Errno 2] No such file or directory: 'Ph02_Z01_C01_model.pkl'

In [26]:
def test_svm_model_and_save_results(model, test_data, output_file):
    # Extract selected features for testing
    X_test = test_data[selected_features]
    
    # Predict anomalies (-1 indicates anomaly, 1 indicates normal)
    predictions = model.predict(X_test)
    test_data["Anomaly"] = predictions
    
    # Save the results to a CSV file
    test_data.to_csv(output_file, index=False)
    print(f"Test results saved to {output_file}")

    # Return the anomalies for reference
    anomalies = test_data[test_data["Anomaly"] == -1]
    return anomalies

# Example usage
output_file = "Ph01_Z01_C01_test_results.csv"
anomalies = test_svm_model_and_save_results(model, test_data, output_file)

print(f"Anomalous transactions saved. Total anomalies detected: {len(anomalies)}")


Test results saved to Ph01_Z01_C01_test_results.csv
Anomalous transactions saved. Total anomalies detected: 33


In [29]:
import numpy as np

# Define the normality thresholds (e.g., mean ± 3 * std) for each feature
def calculate_feature_thresholds(train_data, selected_features):
    thresholds = {}
    for feature in selected_features:
        feature_data = train_data[feature]
        mean = np.mean(feature_data)
        std = np.std(feature_data)
        thresholds[feature] = (mean - 3*std, mean + 3*std)  # Normal range: mean ± 3*std
    return thresholds

# Function to detect and record which features caused the anomaly
def detect_anomaly_causes(test_data, thresholds, selected_features):
    anomaly_causes = []
    for idx, row in test_data.iterrows():
        causes = []
        for feature in selected_features:
            lower, upper = thresholds[feature]
            if row[feature] < lower or row[feature] > upper:
                causes.append(feature)
        if causes:
            anomaly_causes.append(", ".join(causes))
        else:
            anomaly_causes.append("None")
    return anomaly_causes


In [35]:
def test_svm_model_and_save_results_with_causes(model, test_data, thresholds, output_file):
    # Extract selected features for testing
    X_test = test_data[selected_features]
    
    # Predict anomalies (-1 indicates anomaly, 1 indicates normal)
    predictions = model.predict(X_test)
    test_data["Anomaly"] = predictions
    
    # Detect what caused the anomaly (which features deviated)
    anomaly_causes = detect_anomaly_causes(test_data, thresholds, selected_features)
    
    # Add the "Anomaly_Causes" column to the dataset
    test_data["Anomaly_Causes"] = anomaly_causes
    
    # Save the results to a CSV file
    test_data.to_csv(output_file, index=False)
    print(f"Test results with anomaly causes saved to {output_file}")

    # Return the anomalies for reference
    anomalies = test_data[test_data["Anomaly"] == -1]
    return anomalies

import pandas as pd

# Load the training dataset (make sure to use the correct file path)
train_data = pd.read_csv("Ph01_Z01_C01_train.csv")  # Replace with the correct filename

# Define the selected features for anomaly detection
selected_features = ['Age', 'Dosage', 'Quantity']

# Calculate the thresholds for the selected features
thresholds = calculate_feature_thresholds(train_data, selected_features)

# Load the test dataset (again, replace with the correct file path)
test_data = pd.read_csv("Ph01_Z01_C01_test.csv")  # Replace with the correct filename

# Now you can call the function to test the model and save the results with causes
output_file = "Ph01_Z01_C01_test_results_with_causes.csv"
anomalies = test_svm_model_and_save_results_with_causes(model, test_data, thresholds, output_file)

# Print the results
print(f"Anomalous transactions saved with causes. Total anomalies detected: {len(anomalies)}")


Test results with anomaly causes saved to Ph01_Z01_C01_test_results_with_causes.csv
Anomalous transactions saved with causes. Total anomalies detected: 39


In [38]:
# Function to detect medication combinations
def get_medication_combination(row, all_medications):
    medications = set(row['Medication_Name'].split(', '))  # Assuming medication names are separated by commas
    return tuple(sorted(medications))

# Apply the function to extract medication combinations
train_data['medication_combination'] = train_data.apply(lambda row: get_medication_combination(row, all_medications), axis=1)


NameError: name 'all_medications' is not defined