In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.combine import SMOTEENN  # SMOTE + Edited Nearest Neighbors
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt


In [4]:
from google.colab import files
uploaded = files.upload()

# You can access the uploaded file(s) by iterating through the 'uploaded' dictionary
# For example, to get the first uploaded file's name and content:
# for fn, content in uploaded.items():
#   print(f'User uploaded file "{fn}" with length {len(content)} bytes')

Saving Main Dataset.csv to Main Dataset.csv


In [5]:
df = pd.read_csv('Main Dataset.csv')


In [7]:
print("Original Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

Original Dataset Info:
Shape: (30, 7)
Columns: ['Grouo No', 'Leak Location 1', 'Leak Location 2', 'Inflow Rate (m³/s)', 'Pressure Drop (Pa)', 'Inflow velocity (m/s)', 'Outlet pressure (Pa)']

First few rows:
   Grouo No  Leak Location 1  Leak Location 2 Inflow Rate (m³/s)  \
0         1              0.4             0.75         3.13 ×10⁻⁴   
1         2              0.4             0.75         3.23 ×10⁻⁴   
2         3              0.4             0.75         3.66 ×10⁻⁴   
3         4              0.6             0.86         3.21 ×10⁻⁴   
4         5              0.6             0.86         3.44 ×10⁻⁴   

  Pressure Drop (Pa)  Inflow velocity (m/s) Outlet pressure (Pa)  
0              1,955                   1.26               15,340  
1              2,028                   1.30               13,448  
2              2,657                   1.47                5,367  
3              2,112                   1.30               13,244  
4              2,450                   1.39     

In [8]:

# Clean the data - handle the scientific notation and commas
def clean_numeric_data(df):
    """Clean numeric columns with scientific notation and commas"""
    df_clean = df.copy()

    # Clean column names
    df_clean.columns = df_clean.columns.str.strip()

    # Convert scientific notation in Inflow Rate
    if 'Inflow Rate (m³/s)' in df_clean.columns:
        # Handle both '×' and spaces in scientific notation
        df_clean['Inflow Rate (m³/s)'] = (df_clean['Inflow Rate (m³/s)']
                                         .astype(str)
                                         .str.replace('×', 'e')
                                         .str.replace(' e', 'e')  # Remove space before e
                                         .str.replace('e-', 'e-'))  # Ensure proper e- format
        df_clean['Inflow Rate (m³/s)'] = pd.to_numeric(df_clean['Inflow Rate (m³/s)'], errors='coerce')

    # Remove commas from numeric columns and convert to float
    numeric_columns = ['Pressure Drop (Pa)', 'Outlet pressure (Pa)']
    for col in numeric_columns:
        if col in df_clean.columns:
            df_clean[col] = (df_clean[col]
                           .astype(str)
                           .str.replace(',', '')
                           .str.replace('"', ''))  # Remove any quotes
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Also clean other numeric columns
    other_numeric = ['Inflow velocity (m/s)']
    for col in other_numeric:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    return df_clean

In [9]:
# Clean the dataset
df_clean = clean_numeric_data(df)


In [10]:
# Handle any remaining NaN values
if df_clean.isnull().any().any():
    print("\n⚠️  Found NaN values. Handling them...")

    # For numeric columns, fill NaN with median
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().any():
            median_val = df_clean[col].median()
            df_clean[col].fillna(median_val, inplace=True)
            print(f"  Filled NaN in '{col}' with median: {median_val}")

print("\nFinal data check:")
print("Missing values after cleaning:", df_clean.isnull().sum().sum())
print("Data shape:", df_clean.shape)


⚠️  Found NaN values. Handling them...
  Filled NaN in 'Inflow Rate (m³/s)' with median: nan

Final data check:
Missing values after cleaning: 30
Data shape: (30, 7)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(median_val, inplace=True)


In [13]:
# Install required packages first:
# pip install imbalanced-learn scikit-learn numpy pandas

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.combine import SMOTEENN  # SMOTE + Edited Nearest Neighbors
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt

# Load your dataset
df = pd.read_csv('Main Dataset.csv')

print("Original Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

# Clean the data - handle the scientific notation and commas
def clean_numeric_data(df):
    """Clean numeric columns with scientific notation and commas"""
    df_clean = df.copy()

    # Clean column names
    df_clean.columns = df_clean.columns.str.strip()

    # Debug: Print the original inflow rate values
    if 'Inflow Rate (m³/s)' in df_clean.columns:
        print("Original inflow rate values (first 5):")
        print(df_clean['Inflow Rate (m³/s)'].head().values)

        # Convert scientific notation in Inflow Rate - step by step
        inflow_col = df_clean['Inflow Rate (m³/s)'].astype(str)
        print("\nAfter converting to string:")
        print(inflow_col.head().values)

        # Replace the scientific notation symbols - handle Unicode superscripts
        inflow_col = inflow_col.str.replace('×10⁻', 'e-')  # Handle ×10⁻ format
        inflow_col = inflow_col.str.replace('×', 'e')      # Handle × format
        inflow_col = inflow_col.str.replace(' e', 'e')     # Remove space before e

        # Handle Unicode superscript numbers
        superscript_map = {
            '⁰': '0', '¹': '1', '²': '2', '³': '3', '⁴': '4',
            '⁵': '5', '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9'
        }
        for super_char, regular_char in superscript_map.items():
            inflow_col = inflow_col.str.replace(super_char, regular_char)

        print("\nAfter replacing scientific notation and superscripts:")
        print(inflow_col.head().values)

        # Convert to numeric
        df_clean['Inflow Rate (m³/s)'] = pd.to_numeric(inflow_col, errors='coerce')
        print("\nAfter converting to numeric:")
        print(df_clean['Inflow Rate (m³/s)'].head().values)

    # Remove commas from numeric columns and convert to float
    numeric_columns = ['Pressure Drop (Pa)', 'Outlet pressure (Pa)']
    for col in numeric_columns:
        if col in df_clean.columns:
            df_clean[col] = (df_clean[col]
                           .astype(str)
                           .str.replace(',', '')
                           .str.replace('"', ''))  # Remove any quotes
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Also clean other numeric columns
    other_numeric = ['Inflow velocity (m/s)']
    for col in other_numeric:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    return df_clean

# Clean the dataset
df_clean = clean_numeric_data(df)

# Check for missing values and handle them
print("\n" + "="*60)
print("Cleaned Dataset:")
print(df_clean.dtypes)
print("\nMissing values per column:")
print(df_clean.isnull().sum())
print("\nCleaned data sample:")
print(df_clean.head())

# Handle any remaining NaN values
if df_clean.isnull().any().any():
    print("\n⚠️  Found NaN values. Handling them...")

    # For numeric columns, fill NaN with median (but only if median is not NaN)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().any():
            median_val = df_clean[col].median()
            if not pd.isna(median_val):
                df_clean.loc[:, col] = df_clean[col].fillna(median_val)
                print(f"  Filled NaN in '{col}' with median: {median_val}")
            else:
                print(f"  ⚠️  Cannot fill '{col}' - all values are NaN!")
                # If all values are NaN, there's a data cleaning issue
                print(f"     Original values in {col}:")
                print(f"     {df[col].head().values}")

print("\nFinal data check:")
print("Missing values after cleaning:", df_clean.isnull().sum().sum())
print("Data shape:", df_clean.shape)

# Prepare features and target
# For this example, let's create categories based on pressure ranges as target
# You can modify this based on your specific prediction task

# Method 1: Create pressure categories as target variable
df_clean['Pressure_Category'] = pd.cut(
    df_clean['Outlet pressure (Pa)'],
    bins=3,
    labels=['Low', 'Medium', 'High']
)

# Prepare feature matrix (X) and target vector (y)
feature_columns = [
    'Leak Location 1', 'Leak Location 2', 'Inflow Rate (m³/s)',
    'Pressure Drop (Pa)', 'Inflow velocity (m/s)'
]

X = df_clean[feature_columns].values
y = df_clean['Pressure_Category']

# Check for any remaining NaN values in features or target
print(f"\nFeature matrix shape: {X.shape}")
print(f"NaN values in features: {np.isnan(X).sum()}")
print(f"NaN values in target: {y.isnull().sum()}")

# Remove any rows with NaN values if they still exist
if np.isnan(X).any() or y.isnull().any():
    print("⚠️  Removing rows with NaN values...")
    mask = ~(np.isnan(X).any(axis=1) | y.isnull())
    X = X[mask]
    y = y[mask]
    print(f"Final data shape after removing NaN: {X.shape}")

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"\nOriginal class distribution: {Counter(y_encoded)}")
print(f"Class labels: {dict(zip(le.classes_, range(len(le.classes_))))}")

# Validate we have enough samples for SMOTE
min_class_size = min(Counter(y_encoded).values())
if min_class_size < 2:
    raise ValueError(f"Minimum class size is {min_class_size}. Need at least 2 samples per class for SMOTE.")

print("\n" + "="*60)

# SMOTE Implementation for your dataset

# 1. Standard SMOTE
print("1. APPLYING STANDARD SMOTE")

# For small datasets, use smaller k_neighbors
min_class_size = min(Counter(y_encoded).values())
k_neighbors = min(3, min_class_size - 1) if min_class_size > 1 else 1

smote = SMOTE(
    sampling_strategy='auto',  # Balance all classes
    k_neighbors=k_neighbors,
    random_state=42
)

X_smote, y_smote = smote.fit_resample(X, y_encoded)

print(f"After SMOTE:")
print(f"  Original shape: {X.shape}")
print(f"  Augmented shape: {X_smote.shape}")
print(f"  Original distribution: {Counter(y_encoded)}")
print(f"  Augmented distribution: {Counter(y_smote)}")

# 2. SMOTE with Edited Nearest Neighbors (SMOTE-ENN)
print("\n2. APPLYING SMOTE + EDITED NEAREST NEIGHBORS")

smote_enn = SMOTEENN(
    smote=SMOTE(k_neighbors=k_neighbors, random_state=42),
    random_state=42
)

X_smote_enn, y_smote_enn = smote_enn.fit_resample(X, y_encoded)

print(f"After SMOTE-ENN:")
print(f"  Shape: {X_smote_enn.shape}")
print(f"  Distribution: {Counter(y_smote_enn)}")

# 3. Custom SMOTE with specific augmentation
print("\n3. CUSTOM SMOTE FOR SPECIFIC AUGMENTATION")

# Let's say you want 100 samples total
target_samples_per_class = 35  # This will give you ~105 total samples

smote_custom = SMOTE(
    sampling_strategy={i: target_samples_per_class for i in range(len(le.classes_))},
    k_neighbors=k_neighbors,
    random_state=42
)

X_smote_custom, y_smote_custom = smote_custom.fit_resample(X, y_encoded)

print(f"After Custom SMOTE:")
print(f"  Shape: {X_smote_custom.shape}")
print(f"  Distribution: {Counter(y_smote_custom)}")

# Convert back to DataFrame for easier analysis
def create_augmented_dataframe(X_aug, y_aug, feature_columns, label_encoder):
    """Convert augmented arrays back to DataFrame"""
    df_aug = pd.DataFrame(X_aug, columns=feature_columns)
    df_aug['Pressure_Category'] = label_encoder.inverse_transform(y_aug)
    df_aug['Group_No'] = range(1, len(df_aug) + 1)  # New group numbers

    # Reorder columns to match original
    column_order = ['Group_No'] + feature_columns + ['Pressure_Category']
    df_aug = df_aug[column_order]

    return df_aug

# Create augmented datasets
df_smote = create_augmented_dataframe(X_smote, y_smote, feature_columns, le)
df_smote_custom = create_augmented_dataframe(X_smote_custom, y_smote_custom, feature_columns, le)

print("\n" + "="*60)
print("AUGMENTED DATASET SAMPLES:")
print("\nFirst 10 rows of SMOTE augmented dataset:")
print(df_smote.head(10))

# Save the augmented datasets
df_smote.to_csv('Dataset_SMOTE_Augmented.csv', index=False)
df_smote_custom.to_csv('Dataset_SMOTE_Custom_Augmented.csv', index=False)

print(f"\n✅ Saved augmented datasets:")
print(f"   - Dataset_SMOTE_Augmented.csv ({len(df_smote)} samples)")
print(f"   - Dataset_SMOTE_Custom_Augmented.csv ({len(df_smote_custom)} samples)")

# Alternative approach: Regression-based augmentation
print("\n" + "="*60)
print("4. ALTERNATIVE: CONTINUOUS TARGET APPROACH")
print("If you want to predict a continuous variable (like outlet pressure):")

# Use outlet pressure as continuous target
X_features = df_clean[feature_columns[:-1]].values  # Exclude outlet pressure from features
y_continuous = df_clean['Outlet pressure (Pa)'].values

# Create discrete bins for SMOTE
y_binned = pd.cut(y_continuous, bins=5, labels=False)

smote_continuous = SMOTE(
    sampling_strategy='auto',
    k_neighbors=min(2, min(Counter(y_binned).values()) - 1),
    random_state=42
)

X_cont_smote, y_cont_smote = smote_continuous.fit_resample(X_features, y_binned)

print(f"Continuous approach - Shape: {X_cont_smote.shape}")
print(f"Continuous approach - Bin distribution: {Counter(y_cont_smote)}")

print("\n" + "="*60)
print("RECOMMENDATIONS FOR YOUR DATASET:")
print("1. Use k_neighbors=2 or 3 for small datasets (30 samples)")
print("2. Consider the physical meaning of synthetic samples")
print("3. Validate synthetic samples make physical sense")
print("4. Use cross-validation to assess model performance")
print("5. Consider domain-specific constraints when generating synthetic data")

# Function for easy reuse
def smote_augment_fluid_data(csv_file, target_column_type='pressure_categories', n_samples=100):
    """
    Easy function to apply SMOTE to your fluid dynamics data

    Parameters:
    csv_file: path to your CSV file
    target_column_type: 'pressure_categories' or 'custom'
    n_samples: target number of total samples
    """

    df = pd.read_csv(csv_file)
    df_clean = clean_numeric_data(df)

    # Define features
    feature_cols = [
        'Leak Location 1', 'Leak Location 2', 'Inflow Rate (m³/s)',
        'Pressure Drop (Pa)', 'Inflow velocity (m/s)'
    ]

    X = df_clean[feature_cols].values

    if target_column_type == 'pressure_categories':
        y = pd.cut(df_clean['Outlet pressure (Pa)'], bins=3, labels=False)

    # Apply SMOTE
    smote = SMOTE(
        sampling_strategy='auto',
        k_neighbors=min(3, len(df_clean) // 4),
        random_state=42
    )

    X_aug, y_aug = smote.fit_resample(X, y)

    return X_aug, y_aug

print("\n✅ Complete! Check the generated CSV files for your augmented datasets.")

Original Dataset Info:
Shape: (30, 7)
Columns: ['Grouo No', 'Leak Location 1', 'Leak Location 2', 'Inflow Rate (m³/s)', 'Pressure Drop (Pa)', 'Inflow velocity (m/s)', 'Outlet pressure (Pa)']

First few rows:
   Grouo No  Leak Location 1  Leak Location 2 Inflow Rate (m³/s)  \
0         1              0.4             0.75         3.13 ×10⁻⁴   
1         2              0.4             0.75         3.23 ×10⁻⁴   
2         3              0.4             0.75         3.66 ×10⁻⁴   
3         4              0.6             0.86         3.21 ×10⁻⁴   
4         5              0.6             0.86         3.44 ×10⁻⁴   

  Pressure Drop (Pa)  Inflow velocity (m/s) Outlet pressure (Pa)  
0              1,955                   1.26               15,340  
1              2,028                   1.30               13,448  
2              2,657                   1.47                5,367  
3              2,112                   1.30               13,244  
4              2,450                   1.39     

In [14]:
import os
print("Current working directory:", os.getcwd())

Current working directory: /content
