# **Pipeline**

Necessary imports

In [None]:
%pip install sdv
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import os
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality, get_column_plot
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## What gets input into the pipeline?

- The Dataset
- The variable which we will be predicting
- *(Maybe which ML models are relevant to the task?)*
- *(Maybe the separator for the CSV file, or we could state in the documentation that only CSV files with ";" as a separator are accepted)*

## Preprocessing will be done before putting the dataset into the pipeline

- Consistent CSV


## **Synthesization Part**


Input data to the Pipeline

In [3]:
# Path to the original CSV dataset; its file stem is used as the dataset name.
original_dataset_path = "../datasets/original/studentPerformance.csv"
# Field delimiter handed to pandas when the CSV is read.
separator = ';'
# Name of the column the classifiers are trained to predict.
target_column = 'Target'

### Generating the Synthetic Data

In [None]:
# Derive the dataset name from the file stem (e.g. 'studentPerformance').
dataset_name = os.path.splitext(os.path.basename(original_dataset_path))[0]

# Load the CSV into a pandas DataFrame.
original_data = pd.read_csv(original_dataset_path, sep=separator)

print(f"\nProcessing dataset: {dataset_name}")
print(f"Original dataset size: {len(original_data)} rows")

# Let SDV detect the table metadata from the DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original_data)

synthesizer_path = f"../src/synthesizers/{dataset_name}_synthesizer.pkl"

# Reuse a previously saved synthesizer when one exists; otherwise train and save a new one.
# NOTE(review): the cache is keyed by dataset name only, so a changed CSV with the
# same file name will silently reuse the old synthesizer.
# NOTE(review): the saved file is a .pkl (presumably pickle-based) -- only load
# synthesizer files that this project produced itself.
if os.path.exists(synthesizer_path):
    print(f"\nFound existing synthesizer for {dataset_name} at: {synthesizer_path}")
    print("Loading synthesizer...")
    synthesizer = CTGANSynthesizer.load(synthesizer_path)
    print("Synthesizer loaded successfully")
else:
    print(f"\nNo existing synthesizer for {dataset_name} found.")
    print("Training new synthesizer... (this may take a while)")
    # Create the output directory *before* the long training run, so a missing
    # folder cannot make the save fail after the model has been trained.
    os.makedirs(os.path.dirname(synthesizer_path), exist_ok=True)
    synthesizer = CTGANSynthesizer(metadata)
    synthesizer.fit(original_data)
    synthesizer.save(synthesizer_path)
    print("New synthesizer trained and saved successfully")

# Sample as many synthetic rows as the original dataset has.
print(f"\nUsing synthesizer '{dataset_name}_synthesizer' to generate synthetic data for dataset '{dataset_name}'")
synthetic_data = synthesizer.sample(num_rows=len(original_data))
print(f"Synthetic data generated successfully: {len(synthetic_data)} rows created")
synthetic_data

### Evaluating the Synthetic Data

In [None]:
# Diagnostic report comparing the synthetic table with the original one.
print(f"\nRunning diagnostic comparison for {dataset_name}...")
diagnostic = run_diagnostic(
    real_data=original_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

# Quality report for the synthetic table.
print(f"\nEvaluating quality metrics for {dataset_name}...")
quality_report = evaluate_quality(
    original_data,
    synthetic_data,
    metadata,
)

# Per-column details of the 'Column Shapes' property.
print("\nAnalyzing column distributions...")
column_shapes = quality_report.get_details('Column Shapes')
print(column_shapes)

# Plot the real vs. synthetic distribution of the target column.
print("\nGenerating sample column distribution plot...")
fig = get_column_plot(
    real_data=original_data,
    synthetic_data=synthetic_data,
    column_name=target_column,
    metadata=metadata,
)
fig.show()

## **Machine Learning Part**


In [70]:
def prepare_original_data(data, target_column, test_size=0.2, random_state=42):
    """Split the original data into training and test features/labels.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Passed to ``train_test_split`` as ``test_size``
            (default 0.2, the value previously hard-coded).
        random_state: Passed to ``train_test_split`` as ``random_state``
            (default 42, the value previously hard-coded).

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns; callers unpack it as
        ``X_train, X_test, y_train, y_test``.
    """
    X = data.drop(columns=target_column)
    y = data[target_column]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def prepare_synthetic_data(data, target_column):
    """Separate features from the target column without splitting the rows.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target_column: Name of the column to predict.

    Returns:
        A ``(features, labels)`` tuple: ``data`` without ``target_column``,
        and the ``target_column`` series.
    """
    features = data.drop(columns=target_column)
    labels = data[target_column]
    return features, labels

# The synthetic data is used whole as a training set.
X_synthetic, y_synthetic = prepare_synthetic_data(synthetic_data, target_column)

# The original data is split into a training part and a held-out test part.
(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
) = prepare_original_data(original_data, target_column)

# Training sets keyed by where the data came from.
datasets = {}
datasets['Original'] = (X_train_original, y_train_original)
datasets['Synthetic'] = (X_synthetic, y_synthetic)

# Accumulates one result row per trained model.
model_results = []

In [None]:
# Classifiers compared on each training source.
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
}

# Start from an empty list so re-running this cell does not append duplicate rows.
model_results = []

# The loop variable is deliberately NOT called `dataset_name`: that global holds
# the dataset's file stem and is reused later in the plot title. Reusing the
# name here would leave it set to 'Synthetic' after the loop.
for training_source, (X_train, y_train) in datasets.items():
    print(f'\nTraining and evaluating models on {training_source} dataset:')

    for model_name, model in models.items():
        print(f'\nProcessing {model_name}...')

        # Fit on the full training set of this source.
        model.fit(X_train, y_train)

        # 5-fold cross-validated accuracy, computed on the training data itself.
        # NOTE(review): 'Accuracy' is therefore measured on different data than
        # precision/recall/F1 below (which use the original test split), so the
        # columns are not directly comparable -- confirm this is intended.
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

        # Precision/recall/F1 are always measured on the original test split.
        predictions = model.predict(X_test_original)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test_original,
            predictions,
            average='weighted',
        )

        model_results.append({
            'Dataset': training_source,
            'Model': model_name,
            'Accuracy': round(np.mean(scores), 4),
            'Precision': round(precision, 4),
            'Recall': round(recall, 4),
            'F1': round(f1, 4),
        })

        print(f'Completed {model_name} evaluation')

results_df = pd.DataFrame(model_results)

print("\nModel Performance Comparison:")
print(results_df.to_string(index=False))

In [75]:
# Per-model metric lists used by the comparison plots.
model_names, original_accuracy_scores, original_precision_scores = [], [], []
synthetic_accuracy_scores, synthetic_precision_scores = [], []

# Split the results table by the data source the models were trained on.
original_results = results_df.loc[results_df['Dataset'] == 'Original']
synthetic_results = results_df.loc[results_df['Dataset'] == 'Synthetic']

In [76]:
# Rebuild the metric lists from scratch so that re-running this cell never
# appends duplicate entries to lists created in an earlier cell.
# Keeping only the first row per model mirrors taking the first match, and
# indexing by model name avoids re-filtering the table for every lookup.
original_by_model = original_results.drop_duplicates('Model').set_index('Model')
synthetic_by_model = synthetic_results.drop_duplicates('Model').set_index('Model')

# Models in order of first appearance in the original-data results.
model_names = original_by_model.index.tolist()

original_accuracy_scores = original_by_model.loc[model_names, 'Accuracy'].tolist()
original_precision_scores = original_by_model.loc[model_names, 'Precision'].tolist()
# Raises KeyError if a model has no result for the synthetic data.
synthetic_accuracy_scores = synthetic_by_model.loc[model_names, 'Accuracy'].tolist()
synthetic_precision_scores = synthetic_by_model.loc[model_names, 'Precision'].tolist()

In [None]:
def _draw_metric_panel(ax, metric, original_scores, synthetic_scores, positions, bar_width):
    """Draw one grouped bar chart (original vs. synthetic) for a single metric."""
    ax.bar(positions - bar_width/2, original_scores, bar_width, label='Original Data', color='skyblue')
    ax.bar(positions + bar_width/2, synthetic_scores, bar_width, label='Synthetic Data', color='lightcoral')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names, rotation=45)
    ax.legend()


# Two side-by-side panels: accuracy on the left, precision on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 6))

# One bar group per model.
x = np.arange(len(model_names))
width = 0.35

_draw_metric_panel(ax1, 'Accuracy', original_accuracy_scores, synthetic_accuracy_scores, x, width)
_draw_metric_panel(ax2, 'Precision', original_precision_scores, synthetic_precision_scores, x, width)

plt.suptitle(f'Model Performance: Original vs Synthetic Data for {dataset_name}', fontsize=14)
plt.tight_layout()
plt.show()

# Print the full results table below the figure.
print("\nDetailed Model Performance Comparison:")
print(results_df.to_string(index=False))