<a href="https://colab.research.google.com/github/Giganticshrek/FYP/blob/main/Generatedata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random

# Seed both random sources used in this file: NumPy's global RNG (the
# np.random.* draws) and the standard-library `random` module (used for
# random.choices). Fixed seeds make the draw sequence repeatable.
np.random.seed(42)
random.seed(42)

def generate_maintenance_recommendation(ph, turbidity, tds, temp):
    """Build a rule-based maintenance recommendation string.

    Each reading is compared against fixed thresholds and contributes zero
    or more action items, in the order pH, turbidity, TDS, temperature.

    Args:
        ph: pH reading; actions are added below 6.5 or above 8.5.
        turbidity: Turbidity reading; actions are added above 50, with a
            stronger set above 100.
        tds: Total dissolved solids reading; actions are added above 1000,
            with a stronger set above 1500.
        temp: Temperature reading; actions are added above 35.

    Returns:
        The action items joined with " | ". When no rule fires, a two-item
        "all normal" message is returned instead.
    """
    actions = []

    # pH: acidic and alkaline excursions get different treatments.
    if ph < 6.5:
        actions.extend([
            "Add pH Neutralizer/Alkaline Treatment",
            "Inspect for Corrosion Damage",
        ])
    elif ph > 8.5:
        actions.extend([
            "Add Acid Treatment to Lower pH",
            "Check for Scale Formation",
        ])

    # Turbidity: two tiers, the higher one replaces the lower one.
    if turbidity > 100:
        actions.extend([
            "Replace/Clean Pre-filters",
            "Flush Pipeline System",
        ])
    elif turbidity > 50:
        actions.append("Schedule Filter Inspection")

    # TDS: two tiers, the higher one replaces the lower one.
    if tds > 1500:
        actions.extend([
            "Descaling Treatment Needed",
            "Consider Water Softener",
        ])
    elif tds > 1000:
        actions.append("Monitor for Mineral Buildup")

    # Temperature: single threshold.
    if temp > 35:
        actions.extend([
            "High Temp Accelerates Corrosion",
            "Install Cooling System",
        ])

    # Nothing triggered: report a clean bill of health.
    if not actions:
        actions = ["All Parameters Normal", "Continue Routine Monitoring"]

    return " | ".join(actions)

def assess_corrosion_risk(ph, tds):
    """Classify corrosion risk from pH and TDS.

    Args:
        ph: pH reading.
        tds: Total dissolved solids reading.

    Returns:
        One of "CRITICAL", "HIGH", "MODERATE" or "LOW":
        - pH below 5.5 is CRITICAL.
        - pH in [5.5, 6.5) is HIGH when TDS exceeds 1000, else MODERATE.
        - pH above 9.5 is HIGH.
        - pH in [6.5, 8.5] is MODERATE when TDS exceeds 1500, else LOW.
        - Everything else (pH in (8.5, 9.5]) is LOW.
    """
    # Strongly acidic water.
    if ph < 5.5:
        return "CRITICAL"

    # Mildly acidic water: dissolved solids decide the severity.
    if ph < 6.5:
        return "HIGH" if tds > 1000 else "MODERATE"

    # Strongly alkaline water.
    if ph > 9.5:
        return "HIGH"

    # From here pH is within [6.5, 9.5]; only the [6.5, 8.5] band is
    # bumped to MODERATE by very high TDS.
    if ph <= 8.5 and tds > 1500:
        return "MODERATE"

    return "LOW"

def assess_scaling_risk(tds, ph, turbidity):
    """Classify scaling/buildup risk from TDS, pH and turbidity.

    The mineral (TDS/pH) contribution and the sediment (turbidity)
    contribution are rated independently and the more severe of the two is
    returned. Rating them independently keeps the result monotone: a
    turbidity above 100 always yields at least "HIGH", regardless of TDS.
    (A single if/elif chain that tests TDS first would report only
    "MODERATE" for TDS > 1000 with pH <= 8.5 even when turbidity alone
    warrants "HIGH".)

    Args:
        tds: Total dissolved solids reading.
        ph: pH reading.
        turbidity: Turbidity reading.

    Returns:
        One of "NONE", "LOW", "MODERATE", "HIGH" or "CRITICAL".
    """
    # Ordered from least to most severe; the index doubles as a rank.
    severity = ("NONE", "LOW", "MODERATE", "HIGH", "CRITICAL")

    # Mineral scaling: driven by TDS, aggravated by alkaline pH.
    if tds > 1500 and ph > 8.0:
        mineral = "CRITICAL"
    elif tds > 1000:
        mineral = "HIGH" if ph > 8.5 else "MODERATE"
    elif tds > 500:
        mineral = "LOW"
    else:
        mineral = "NONE"

    # Sediment buildup: driven by turbidity alone.
    if turbidity > 100:
        sediment = "HIGH"
    elif turbidity > 50:
        sediment = "LOW"
    else:
        sediment = "NONE"

    return max(mineral, sediment, key=severity.index)

def assess_pipeline_health(ph, turbidity, temp, tds):
    """Rate overall pipeline health from a 0-100 point score.

    Points are awarded per parameter (pH up to 40, turbidity up to 25,
    temperature up to 20, TDS up to 15) and the total is mapped to a status.

    Args:
        ph: pH reading.
        turbidity: Turbidity reading.
        temp: Temperature reading.
        tds: Total dissolved solids reading.

    Returns:
        "EXCELLENT" for a total of 85 or more, "GOOD" for 65 or more,
        "WARNING" for 45 or more, otherwise "CRITICAL".
    """
    # pH: widening bands around neutral earn fewer points.
    if 6.5 <= ph <= 8.5:
        ph_points = 40
    elif 6.0 <= ph <= 9.0:
        ph_points = 25
    elif 5.5 <= ph <= 9.5:
        ph_points = 10
    else:
        ph_points = 0

    # Turbidity: lower is better.
    if turbidity < 5:
        turbidity_points = 25
    elif turbidity < 25:
        turbidity_points = 15
    elif turbidity < 50:
        turbidity_points = 5
    else:
        turbidity_points = 0

    # Temperature: full marks in 15-25, partial in 10-35.
    if 15 <= temp <= 25:
        temp_points = 20
    elif 10 <= temp <= 35:
        temp_points = 10
    else:
        temp_points = 0

    # TDS: lower is better.
    if tds < 500:
        tds_points = 15
    elif tds < 1000:
        tds_points = 10
    elif tds < 1500:
        tds_points = 5
    else:
        tds_points = 0

    total = ph_points + turbidity_points + temp_points + tds_points

    # Highest threshold first so the first match is the best status earned.
    for minimum, status in ((85, "EXCELLENT"), (65, "GOOD"), (45, "WARNING")):
        if total >= minimum:
            return status
    return "CRITICAL"

def generate_realistic_data(n_samples=3000):
    """Generate a labelled synthetic water-quality dataset.

    Each sample is drawn from one of five scenarios ('excellent', 'good',
    'moderate', 'poor', 'critical', chosen with weights 0.25 / 0.30 / 0.25 /
    0.15 / 0.05), each with its own distributions for pH, turbidity, TDS and
    temperature. The drawn values are clipped to fixed ranges, labelled by
    the rule-based assessment functions, and then perturbed with a small
    amount of Gaussian noise and rounded to two decimals.

    Labels are computed *before* the noise is added, so a stored reading can
    sit marginally on the other side of a rule threshold from the value its
    labels were derived from. The noisy readings are clipped again so they
    stay inside the same ranges (e.g. turbidity never goes negative).

    Args:
        n_samples: Number of rows to generate. Zero or a negative number
            yields an empty frame that still carries every column.

    Returns:
        pandas.DataFrame with the columns 'pH', 'Turbidity_NTU', 'TDS_PPM',
        'Temperature_C', 'Corrosion_Risk', 'Scaling_Risk', 'Pipeline_Status'
        and 'Maintenance_Recommendation'.
    """
    columns = [
        'pH', 'Turbidity_NTU', 'TDS_PPM', 'Temperature_C',
        'Corrosion_Risk', 'Scaling_Risk', 'Pipeline_Status',
        'Maintenance_Recommendation',
    ]

    # (low, high) limits applied both before labelling and after the noise.
    ph_limits = (4.0, 11.0)
    turbidity_limits = (0, 500)
    tds_limits = (50, 2500)
    temp_limits = (10, 45)

    data = []

    for _ in range(n_samples):
        # Pick the water-condition scenario for this sample.
        scenario = random.choices(
            ['excellent', 'good', 'moderate', 'poor', 'critical'],
            weights=[0.25, 0.30, 0.25, 0.15, 0.05]
        )[0]

        # NOTE: in the bimodal pH branches both candidates are drawn and one
        # is then picked; the draw order is kept as-is so a given seed keeps
        # producing the same random stream.
        if scenario == 'excellent':
            # Clean, optimal water
            ph = np.random.normal(7.2, 0.3)
            turbidity = np.random.exponential(2)
            tds = np.random.normal(250, 50)
            temp = np.random.normal(22, 2)

        elif scenario == 'good':
            # Acceptable water
            ph = np.random.normal(7.0, 0.5)
            turbidity = np.random.exponential(10)
            tds = np.random.normal(450, 100)
            temp = np.random.normal(24, 3)

        elif scenario == 'moderate':
            # Some issues present
            ph = np.random.choice([
                np.random.normal(6.2, 0.3),  # Slightly acidic
                np.random.normal(8.8, 0.3)   # Slightly alkaline
            ])
            turbidity = np.random.exponential(40)
            tds = np.random.normal(900, 200)
            temp = np.random.normal(28, 4)

        elif scenario == 'poor':
            # Multiple problematic parameters
            ph = np.random.choice([
                np.random.normal(5.8, 0.4),  # Acidic
                np.random.normal(9.2, 0.4)   # Alkaline
            ])
            turbidity = np.random.exponential(80)
            tds = np.random.normal(1300, 200)
            temp = np.random.normal(32, 4)

        else:  # critical
            # Severe problems
            ph = np.random.choice([
                np.random.normal(5.2, 0.3),  # Very acidic
                np.random.normal(9.8, 0.3)   # Very alkaline
            ])
            turbidity = np.random.exponential(150)
            tds = np.random.normal(1700, 200)
            temp = np.random.normal(37, 3)

        # Clip the drawn values to the allowed ranges before labelling.
        ph = np.clip(ph, *ph_limits)
        turbidity = np.clip(turbidity, *turbidity_limits)
        tds = np.clip(tds, *tds_limits)
        temp = np.clip(temp, *temp_limits)

        # Labels and recommendation are derived from the noise-free values.
        corrosion_risk = assess_corrosion_risk(ph, tds)
        scaling_risk = assess_scaling_risk(tds, ph, turbidity)
        pipeline_status = assess_pipeline_health(ph, turbidity, temp, tds)
        maintenance = generate_maintenance_recommendation(ph, turbidity, tds, temp)

        # Add a little noise for realism, then clip again: without the second
        # clip the jitter can push a reading outside its range (most visibly
        # a negative turbidity when the drawn value is close to zero).
        ph = round(np.clip(ph + np.random.normal(0, 0.05), *ph_limits), 2)
        turbidity = round(
            np.clip(turbidity + np.random.normal(0, 1), *turbidity_limits), 2
        )
        tds = round(np.clip(tds + np.random.normal(0, 5), *tds_limits), 2)
        temp = round(np.clip(temp + np.random.normal(0, 0.2), *temp_limits), 2)

        data.append({
            'pH': ph,
            'Turbidity_NTU': turbidity,
            'TDS_PPM': tds,
            'Temperature_C': temp,
            'Corrosion_Risk': corrosion_risk,
            'Scaling_Risk': scaling_risk,
            'Pipeline_Status': pipeline_status,
            'Maintenance_Recommendation': maintenance
        })

    # Passing the columns explicitly keeps the schema even when no rows were
    # generated, so column selections on the result do not raise KeyError.
    return pd.DataFrame(data, columns=columns)

# Build the dataset.
print("Generating 3000 water quality samples...")
df = generate_realistic_data(3000)

# Horizontal rule reused by every section banner.
RULE = "=" * 60

# Banner, row count and a preview of the first rows.
print("\n" + RULE, "DATASET GENERATED SUCCESSFULLY!", RULE, sep="\n")
print(f"\nTotal samples: {len(df)}")
print("\nFirst 10 samples:")
print(df.head(10))

# Summary statistics for the four numeric readings.
print("\n" + RULE, "DATASET STATISTICS", RULE, sep="\n")
print("\nNumerical Features:")
numeric_columns = ['pH', 'Turbidity_NTU', 'TDS_PPM', 'Temperature_C']
print(df[numeric_columns].describe())

# Class balance of each label column.
print("\n" + RULE, "LABEL DISTRIBUTIONS", RULE, sep="\n")
for heading, label_column in (
    ("\nPipeline Status:", 'Pipeline_Status'),
    ("\nCorrosion Risk:", 'Corrosion_Risk'),
    ("\nScaling Risk:", 'Scaling_Risk'),
):
    print(heading)
    print(df[label_column].value_counts())

# Persist the dataset as CSV (without the index column).
filename = 'water_quality_training_data.csv'
df.to_csv(filename, index=False)
print(f"\n✓ Dataset saved as '{filename}'")
print("✓ Ready for machine learning training!")
print("\nYou can now use this data to train your ML model.")

Generating 3000 water quality samples...

DATASET GENERATED SUCCESSFULLY!

Total samples: 3000

First 10 samples:
     pH  Turbidity_NTU  TDS_PPM  Temperature_C Corrosion_Risk Scaling_Risk  \
0  6.34          59.58   656.01          36.48       MODERATE          LOW   
1  7.03           2.03   257.62          22.09            LOW         NONE   
2  6.64           5.65   446.09          25.06            LOW         NONE   
3  7.17           2.10   285.10          23.48            LOW         NONE   
4  8.98          30.10   759.97          22.66            LOW          LOW   
5  6.13          34.40   750.79          28.01       MODERATE          LOW   
6  5.22         116.51  1124.15          31.14       CRITICAL     MODERATE   
7  7.07           1.79   292.43          24.58            LOW         NONE   
8  7.13          14.34   603.50          33.83            LOW          LOW   
9  7.59           0.90   234.97          22.61            LOW         NONE   

  Pipeline_Status          