In [None]:
# Install required libraries
!pip install --index-url https://test.pypi.org/simple/ synthetic-data-generator
!pip install scikit-learn


In [None]:
# Import necessary libraries
import pandas as pd  # For working with DataFrames
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting dataset
from sklearn.linear_model import LogisticRegression  # For training a simple logistic regression model
from sklearn.metrics import accuracy_score, classification_report  # For evaluating model performance
from synthetic_data_generator import SyntheticDataGenerator  # Import the custom data generator


In [None]:
# Data Collection - build the synthetic dataset used by the rest of the notebook
# The generator is constructed with a fixed seed (42), intended to make the
# generated data reproducible between runs.
generator = SyntheticDataGenerator(seed=42)

# Request 1000 samples with 2 continuous features and 1 categorical feature
df = generator.create_synthetic_dataset(
    continuous_features=2,
    categorical_features=1,
    num_samples=1000,
)

# Preview: header line followed by the first rows of the dataset
print('Synthetic Data Sample:', df.head(), sep='\n')



In [None]:
# Data Preprocessing - encode the label column and build train/test splits
target_column = 'categorical_feature_1'
feature_columns = ['continuous_feature_1', 'continuous_feature_2']

# Replace the categorical column (in place, on df) with its integer category codes
encoded_target = df[target_column].astype('category').cat.codes
df[target_column] = encoded_target

# The two continuous columns are the model inputs; the encoded column is the label
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Model Training - fit a logistic regression classifier on the training split
# The estimator is created with its default settings; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's displayed result
model


In [None]:
# Model Evaluation - score the fitted model on the held-out test split
y_pred = model.predict(X_test)

# Overall accuracy, printed as a percentage with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Text report produced by classification_report for the same predictions
print('\nClassification Report:')
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
# Optional - repeat the experiment on the noise-perturbed dataset returned by
# generator.add_noise
# NOTE(review): by this point the preprocessing cell has already replaced
# df['categorical_feature_1'] with integer codes, so the categorical noise is
# applied to those codes - confirm add_noise handles that as intended.
noisy_df = generator.add_noise(
    df,
    continuous_noise_level=0.05,
    categorical_noise_level=0.1,
)

# Encode the (noisy) label column as integer category codes
noisy_target_codes = noisy_df['categorical_feature_1'].astype('category').cat.codes
noisy_df['categorical_feature_1'] = noisy_target_codes

# Same feature/label layout as the clean run
X_noisy = noisy_df[['continuous_feature_1', 'continuous_feature_2']]
y_noisy = noisy_df['categorical_feature_1']

# Same 80/20 split with the same random_state as the clean run
X_train_noisy, X_test_noisy, y_train_noisy, y_test_noisy = train_test_split(
    X_noisy,
    y_noisy,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh logistic regression model on the noisy training split
model_noisy = LogisticRegression().fit(X_train_noisy, y_train_noisy)

# Accuracy of the noisy-data model on the noisy test split
y_pred_noisy = model_noisy.predict(X_test_noisy)
accuracy_noisy = accuracy_score(y_test_noisy, y_pred_noisy)
print(f'Accuracy with Noise: {accuracy_noisy * 100:.2f}%')
