In [4]:
import pandas as pd
import numpy as np
import os

# Set random seed for reproducibility (seeds NumPy's global RNG, which every
# generate_* function below draws from).
np.random.seed(42)
# Author notes, translated from Portuguese (presumably open TODO topics - confirm):
# imbalance ratio
# multiclass
# UCI

def generate_healthcare_dataset(n_samples=1000):
    """Build a synthetic patient table with a binary ``HighRisk`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Age``, ``Cholesterol``, ``BMI``,
        ``Exercise`` ("Yes"/"No") and ``HighRisk`` (0/1).
    """
    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    age_col = np.random.randint(20, 90, size=n_samples)
    chol_col = np.random.randint(100, 300, size=n_samples)
    bmi_col = np.round(np.random.uniform(18, 40, size=n_samples), 2)
    exercise_col = np.random.choice(["Yes", "No"], size=n_samples, p=[0.7, 0.3])

    # Per-row probability of the positive label: base 0.2, raised for older
    # patients with high cholesterol, lowered for lean patients who exercise
    # (which brings the probability down to 0 when only that rule fires).
    risk_prob = np.full(n_samples, 0.2)
    risk_prob[(age_col > 50) & (chol_col > 200)] += 0.3
    risk_prob[(exercise_col == "Yes") & (bmi_col < 25)] -= 0.2

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    risk_labels = np.where(uniform_draws < risk_prob, 1, 0).tolist()

    columns = {
        "Age": age_col,
        "Cholesterol": chol_col,
        "BMI": bmi_col,
        "Exercise": exercise_col,
        "HighRisk": risk_labels,
    }
    return pd.DataFrame(columns)

def evaluate_healthcare_constraints(data):
    """Report rows of a healthcare table that break its range rules.

    Args:
        data: DataFrame with ``Age``, ``BMI`` and ``Cholesterol`` columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks several rules contributes one tuple per rule, in the order the
        rules are listed below. Empty when every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        ('Invalid Age', lambda rec: rec['Age'] < 0 or rec['Age'] > 120),
        ('Invalid BMI', lambda rec: rec['BMI'] < 10 or rec['BMI'] > 50),
        ('Invalid Cholesterol', lambda rec: rec['Cholesterol'] < 0),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]

def generate_finance_dataset(n_samples=1000):
    """Build a synthetic loan-application table with a binary ``ApprovedLoan`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Income``, ``CreditScore``,
        ``MaritalStatus``, ``NumChildren`` and ``ApprovedLoan`` (0/1).
        Rows whose marital status is "Single" always have ``NumChildren == 0``.
    """
    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    income_col = np.random.randint(20000, 150000, size=n_samples)
    score_col = np.random.randint(300, 850, size=n_samples)
    status_col = np.random.choice(
        ["Single", "Married", "Divorced"], size=n_samples, p=[0.4, 0.5, 0.1]
    )
    children_col = np.random.randint(0, 6, size=n_samples)

    # Logical constraint: single applicants are recorded with no children.
    children_col[status_col == "Single"] = 0

    # Per-row approval probability: base 0.4, adjusted by three rules applied
    # in this fixed order.
    approval_prob = np.full(n_samples, 0.4)
    approval_prob[(income_col > 50000) & (score_col > 700)] += 0.3
    approval_prob[status_col == "Married"] += 0.2
    approval_prob[children_col > 3] -= 0.2

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    approval_labels = np.where(uniform_draws < approval_prob, 1, 0).tolist()

    columns = {
        "Income": income_col,
        "CreditScore": score_col,
        "MaritalStatus": status_col,
        "NumChildren": children_col,
        "ApprovedLoan": approval_labels,
    }
    return pd.DataFrame(columns)

def evaluate_finance_constraints(data):
    """Report rows of a loan table that break its validity rules.

    Args:
        data: DataFrame with ``Income``, ``CreditScore``, ``MaritalStatus``
            and ``NumChildren`` columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks several rules contributes one tuple per rule, in the order the
        rules are listed below. Empty when every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        ('Negative Income', lambda rec: rec['Income'] < 0),
        (
            'Invalid Credit Score',
            lambda rec: rec['CreditScore'] < 300 or rec['CreditScore'] > 850,
        ),
        (
            'Single with Children',
            lambda rec: rec['MaritalStatus'] == 'Single' and rec['NumChildren'] > 0,
        ),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]

def generate_quality_control_dataset(n_samples=1000):
    """Build a synthetic manufacturing table with a binary ``Defective`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Temperature``, ``Pressure``,
        ``Speed``, ``Vibration`` and ``Defective`` (0/1).
    """
    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    temperature_col = np.random.uniform(80, 120, size=n_samples)
    pressure_col = np.random.uniform(5, 15, size=n_samples)
    speed_col = np.random.randint(100, 300, size=n_samples)
    vibration_col = np.random.uniform(1, 10, size=n_samples)

    # Per-row defect probability: base 0.6 plus two additive bumps. When both
    # bumps apply the value exceeds 1, so such rows are always labelled 1.
    defect_prob = np.full(n_samples, 0.6)
    defect_prob[(temperature_col > 100) & (pressure_col < 10)] += 0.3
    defect_prob[(speed_col > 200) & (vibration_col > 5)] += 0.2

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    defect_labels = np.where(uniform_draws < defect_prob, 1, 0).tolist()

    columns = {
        "Temperature": temperature_col,
        "Pressure": pressure_col,
        "Speed": speed_col,
        "Vibration": vibration_col,
        "Defective": defect_labels,
    }
    return pd.DataFrame(columns)

def evaluate_quality_control_constraints(data):
    """Report rows of a quality-control table that break its range rules.

    Args:
        data: DataFrame with ``Temperature``, ``Pressure``, ``Speed`` and
            ``Vibration`` columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks several rules contributes one tuple per rule, in the order the
        rules are listed below. Empty when every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        (
            'Invalid Temperature',
            lambda rec: rec['Temperature'] < 0 or rec['Temperature'] > 150,
        ),
        ('Invalid Pressure', lambda rec: rec['Pressure'] < 0 or rec['Pressure'] > 20),
        ('Negative Speed', lambda rec: rec['Speed'] < 0),
        ('Negative Vibration', lambda rec: rec['Vibration'] < 0),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]

def generate_fraud_detection_dataset(n_samples=1000):
    """Build a synthetic e-commerce transaction table with a binary ``Fraudulent`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``TransactionAmount``, ``Country``,
        ``BillingCountry``, ``TransactionTime`` and ``Fraudulent`` (0/1).
    """
    country_pool = ["US", "UK", "India", "Germany"]

    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    amount_col = np.random.uniform(10, 10000, size=n_samples)
    country_col = np.random.choice(country_pool, size=n_samples)
    billing_col = np.random.choice(country_pool, size=n_samples)
    elapsed_col = np.random.uniform(0, 30, size=n_samples)  # Time since login in seconds

    # Per-row fraud probability: base 0.1, raised for large cross-country
    # payments and for transactions made within 2 seconds of login.
    fraud_prob = np.full(n_samples, 0.1)
    fraud_prob[(amount_col > 5000) & (country_col != billing_col)] += 0.3
    fraud_prob[elapsed_col < 2] += 0.2

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    fraud_labels = np.where(uniform_draws < fraud_prob, 1, 0).tolist()

    columns = {
        "TransactionAmount": amount_col,
        "Country": country_col,
        "BillingCountry": billing_col,
        "TransactionTime": elapsed_col,
        "Fraudulent": fraud_labels,
    }
    return pd.DataFrame(columns)

def evaluate_fraud_detection_constraints(data):
    """Report rows of a transaction table that contain negative quantities.

    Args:
        data: DataFrame with ``TransactionAmount`` and ``TransactionTime``
            columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks both rules contributes two tuples, amount first. Empty when
        every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        ('Negative Transaction Amount', lambda rec: rec['TransactionAmount'] < 0),
        ('Negative Transaction Time', lambda rec: rec['TransactionTime'] < 0),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]

def generate_energy_dataset(n_samples=1000):
    """Build a synthetic smart-grid table with a binary ``Anomalous`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Usage``, ``Baseline``,
        ``Voltage`` and ``Anomalous`` (0/1). ``Baseline`` is generated but
        does not influence the label.
    """
    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    usage_col = np.random.uniform(0.5, 2.5, size=n_samples)  # Normalized usage (multiplier of baseline)
    baseline_col = np.random.uniform(100, 500, size=n_samples)  # Baseline energy usage
    voltage_col = np.random.uniform(190, 250, size=n_samples)

    # Per-row anomaly probability: base 0.3, raised for heavy usage and for
    # voltage outside the 200-240 band.
    anomaly_prob = np.full(n_samples, 0.3)
    anomaly_prob[usage_col > 1.5] += 0.3
    anomaly_prob[(voltage_col > 240) | (voltage_col < 200)] += 0.2

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    anomaly_labels = np.where(uniform_draws < anomaly_prob, 1, 0).tolist()

    columns = {
        "Usage": usage_col,
        "Baseline": baseline_col,
        "Voltage": voltage_col,
        "Anomalous": anomaly_labels,
    }
    return pd.DataFrame(columns)

def evaluate_energy_constraints(data):
    """Report rows of an energy table that break its range rules.

    Args:
        data: DataFrame with ``Usage`` and ``Voltage`` columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks both rules contributes two tuples, usage first. Empty when
        every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        ('Negative Usage', lambda rec: rec['Usage'] < 0),
        ('Invalid Voltage', lambda rec: rec['Voltage'] < 190 or rec['Voltage'] > 250),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]

def generate_education_dataset(n_samples=1000):
    """Build a synthetic student table with a binary ``PassFail`` label.

    All randomness comes from NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Attendance``, ``StudyHours``,
        ``Grades`` and ``PassFail`` (0/1).
    """
    # Feature draws. The call order is significant because every call advances
    # the shared global RNG stream.
    attendance_col = np.random.uniform(50, 100, size=n_samples)  # Attendance percentage
    hours_col = np.random.uniform(0, 20, size=n_samples)  # Weekly study hours
    grades_col = np.random.uniform(50, 100, size=n_samples)

    # Per-row pass probability: base 0.55, raised for diligent students and
    # lowered for grades under 60.
    pass_prob = np.full(n_samples, 0.55)
    pass_prob[(attendance_col > 90) & (hours_col > 10)] += 0.4
    pass_prob[grades_col < 60] -= 0.3

    # One uniform draw per row, compared against that row's probability.
    uniform_draws = np.random.rand(n_samples)
    pass_labels = np.where(uniform_draws < pass_prob, 1, 0).tolist()

    columns = {
        "Attendance": attendance_col,
        "StudyHours": hours_col,
        "Grades": grades_col,
        "PassFail": pass_labels,
    }
    return pd.DataFrame(columns)

def evaluate_education_constraints(data):
    """Report rows of a student table that break its range rules.

    Args:
        data: DataFrame with ``Attendance``, ``StudyHours`` and ``Grades``
            columns.

    Returns:
        A list of ``(index_label, message)`` tuples in row order; a row that
        breaks several rules contributes one tuple per rule, in the order the
        rules are listed below. Empty when every row is valid.
    """
    # (message, predicate) pairs; a predicate is truthy when the row is invalid.
    rules = (
        (
            'Invalid Attendance',
            lambda rec: rec['Attendance'] < 0 or rec['Attendance'] > 100,
        ),
        ('Negative Study Hours', lambda rec: rec['StudyHours'] < 0),
        ('Invalid Grades', lambda rec: rec['Grades'] < 0 or rec['Grades'] > 100),
    )
    return [
        (label, message)
        for label, record in data.iterrows()
        for message, is_broken in rules
        if is_broken(record)
    ]


In [2]:
# Quick check: generate the default-size healthcare dataset and run its
# constraint evaluation on it.
data_healthcare = generate_healthcare_dataset()
violations_healthcare = evaluate_healthcare_constraints(data_healthcare)
# Bare expression as the cell's last line so the notebook displays the list.
violations_healthcare

[]

In [None]:
# Example Usage
# Re-seed NumPy's global RNG first: the generators all draw from it, so
# without this the exported CSVs would depend on which cells (and how many
# times) were executed earlier in the kernel session.
np.random.seed(42)

data_healthcare = generate_healthcare_dataset()
data_finance = generate_finance_dataset()
data_quality_control = generate_quality_control_dataset()
data_fraud = generate_fraud_detection_dataset()
data_energy = generate_energy_dataset()
data_education = generate_education_dataset()

# Evaluate Constraints
violations_healthcare = evaluate_healthcare_constraints(data_healthcare)
violations_finance = evaluate_finance_constraints(data_finance)
violations_quality_control = evaluate_quality_control_constraints(data_quality_control)
violations_fraud = evaluate_fraud_detection_constraints(data_fraud)
violations_energy = evaluate_energy_constraints(data_energy)
violations_education = evaluate_education_constraints(data_education)

PATH = "experiment/data"
os.makedirs(PATH, exist_ok=True)

# Save to CSV (one file per dataset, without the index column)
for csv_name, frame in (
    ("healthcare_dataset.csv", data_healthcare),
    ("finance_dataset.csv", data_finance),
    ("quality_control_dataset.csv", data_quality_control),
    ("fraud_detection_dataset.csv", data_fraud),
    ("energy_dataset.csv", data_energy),
    ("education_dataset.csv", data_education),
):
    frame.to_csv(os.path.join(PATH, csv_name), index=False)

print("Datasets generated and saved as CSV files.")
print("Violations:")
# Report the constraint violations found for each dataset, one line each.
for report_label, found in (
    ("Healthcare", violations_healthcare),
    ("Finance", violations_finance),
    ("Quality Control", violations_quality_control),
    ("Fraud Detection", violations_fraud),
    ("Energy", violations_energy),
    ("Education", violations_education),
):
    print(f"{report_label}:", found)

Datasets generated and saved as CSV files.
Violations:
Healthcare: []
Finance: []
Quality Control: []
Fraud Detection: []
Energy: []
Education: []


In [11]:
import os
import pandas as pd
from imblearn.datasets import fetch_datasets

# Output directory for the exported CSV files
PATH = "experiment/data"
os.makedirs(PATH, exist_ok=True)

# Load the imbalanced-learn benchmark collection: a mapping of
# dataset name -> Bunch exposing `.data` and `.target`
all_datasets = fetch_datasets()

# Write every dataset to "<name>.csv" with the labels appended as a 'target' column
for dataset_name, bunch in all_datasets.items():
    print(f"Fetching dataset: {dataset_name}")
    data, target = bunch.data, bunch.target

    df = pd.DataFrame(data)
    df['target'] = target

    merged_path = os.path.join(PATH, f"{dataset_name}.csv")
    df.to_csv(merged_path, index=False)

    print(f"✅ Saved merged dataset: {merged_path}")


Fetching dataset: ecoli
✅ Saved merged dataset: experiment/data/ecoli.csv
Fetching dataset: optical_digits
✅ Saved merged dataset: experiment/data/optical_digits.csv
Fetching dataset: satimage
✅ Saved merged dataset: experiment/data/satimage.csv
Fetching dataset: pen_digits
✅ Saved merged dataset: experiment/data/pen_digits.csv
Fetching dataset: abalone
✅ Saved merged dataset: experiment/data/abalone.csv
Fetching dataset: sick_euthyroid
✅ Saved merged dataset: experiment/data/sick_euthyroid.csv
Fetching dataset: spectrometer
✅ Saved merged dataset: experiment/data/spectrometer.csv
Fetching dataset: car_eval_34
✅ Saved merged dataset: experiment/data/car_eval_34.csv
Fetching dataset: isolet
✅ Saved merged dataset: experiment/data/isolet.csv
Fetching dataset: us_crime
✅ Saved merged dataset: experiment/data/us_crime.csv
Fetching dataset: yeast_ml8
✅ Saved merged dataset: experiment/data/yeast_ml8.csv
Fetching dataset: scene
✅ Saved merged dataset: experiment/data/scene.csv
Fetching datas

In [None]:
import os
import pandas as pd
from imblearn.datasets import fetch_datasets


In [13]:
from ucimlrepo import fetch_ucirepo 
# Fetch dataset id 80 through ucimlrepo (presumably the UCI "Optical
# Recognition of Handwritten Digits" dataset, going by the variable name - confirm).
optical_recognition_of_handwritten_digits = fetch_ucirepo(id=80)
# Display the feature column names as a plain Python list.
list(optical_recognition_of_handwritten_digits.data.features.columns)

['Attribute1',
 'Attribute2',
 'Attribute3',
 'Attribute4',
 'Attribute5',
 'Attribute6',
 'Attribute7',
 'Attribute8',
 'Attribute9',
 'Attribute10',
 'Attribute11',
 'Attribute12',
 'Attribute13',
 'Attribute14',
 'Attribute15',
 'Attribute16',
 'Attribute17',
 'Attribute18',
 'Attribute19',
 'Attribute20',
 'Attribute21',
 'Attribute22',
 'Attribute23',
 'Attribute24',
 'Attribute25',
 'Attribute26',
 'Attribute27',
 'Attribute28',
 'Attribute29',
 'Attribute30',
 'Attribute31',
 'Attribute32',
 'Attribute33',
 'Attribute34',
 'Attribute35',
 'Attribute36',
 'Attribute37',
 'Attribute38',
 'Attribute39',
 'Attribute40',
 'Attribute41',
 'Attribute42',
 'Attribute43',
 'Attribute44',
 'Attribute45',
 'Attribute46',
 'Attribute47',
 'Attribute48',
 'Attribute49',
 'Attribute50',
 'Attribute51',
 'Attribute52',
 'Attribute53',
 'Attribute54',
 'Attribute55',
 'Attribute56',
 'Attribute57',
 'Attribute58',
 'Attribute59',
 'Attribute60',
 'Attribute61',
 'Attribute62',
 'Attribute63',
 

In [9]:


# Load only the 'abalone' benchmark, and load it once: the previous version
# called fetch_datasets() twice, each call loading the whole collection.
abalone = fetch_datasets(filter_data=('abalone',))['abalone']
data = abalone.data  # feature matrix
target = abalone.target  # class labels
df = pd.DataFrame(data)
df['target'] = target
# Bare expression as the cell's last line so the notebook displays the frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.0,0.0,1.0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,-1
1,0.0,0.0,1.0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,1
2,1.0,0.0,0.0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,-1
3,0.0,0.0,1.0,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,-1
4,0.0,1.0,0.0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1
...,...,...,...,...,...,...,...,...,...,...,...
4172,1.0,0.0,0.0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,-1
4173,0.0,0.0,1.0,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,-1
4174,0.0,0.0,1.0,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,-1
4175,1.0,0.0,0.0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,-1
