In [1]:
import pandas as pd

In [2]:
# Read the water-consumption dataset from the working directory.
# (An earlier, commented-out version pointed at a local Downloads copy
# named global_water_consumption.csv.)
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Preview the first five rows of the raw frame
df.head(5)

Unnamed: 0,Country,Year,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Water Scarcity Level,Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%)
0,Indonesia,2022,895.15,489.73,Low,20.78,13.75,34.99,1075.28,3.1
1,Indonesia,2024,502.89,311.95,High,48.51,8.44,32.88,2630.69,1.78
2,Spain,2000,843.39,440.09,Medium,25.16,31.7,34.62,2860.62,4.13
3,Canada,2021,803.34,478.98,High,45.74,6.13,18.99,1725.5,0.61
4,Brazil,2022,416.4,353.91,High,26.58,7.95,31.11,988.44,0.8


In [4]:
# Report how many null entries each column contains
print("Missing values per column:\n", df.isna().sum())

Missing values per column:
 Country                                           0
Year                                              0
Total Water Consumption (Billion Cubic Meters)    0
Per Capita Water Use (Liters per Day)             0
Water Scarcity Level                              0
Agricultural Water Use (%)                        0
Industrial Water Use (%)                          0
Household Water Use (%)                           0
Rainfall Impact (Annual Precipitation in mm)      0
Groundwater Depletion Rate (%)                    0
dtype: int64


In [5]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
print("Number of duplicate rows:", df.duplicated(keep="first").sum())

Number of duplicate rows: 0


In [6]:
# Clean column names: trim surrounding whitespace, lowercase, turn spaces into
# underscores and drop parentheses. Other characters (e.g. '%') are kept.
# regex=False is passed explicitly so that ' ', '(' and ')' are always treated
# as literal text, independent of the pandas version's default for `regex`.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [7]:
# Sector-share columns: bound them to a valid range, then rescale per row
percentage_cols = [
    'agricultural_water_use_%',
    'industrial_water_use_%',
    'household_water_use_%',
]

# Force every share into the interval [0, 100]
df[percentage_cols] = df[percentage_cols].clip(lower=0, upper=100)

# Divide each row by its own total and multiply by 100 so the three shares add up to 100%
df[percentage_cols] = df[percentage_cols].div(df[percentage_cols].sum(axis=1), axis=0).mul(100)


In [8]:
# Ordinal encoding of the scarcity label (Low=0, Medium=1, High=2).
# Labels that are not keys of the mapping come out of Series.map as NaN.
scarcity_mapping = dict(Low=0, Medium=1, High=2)
df['water_scarcity_level_numeric'] = df['water_scarcity_level'].map(scarcity_mapping)


In [9]:
# Boolean flag: True where the encoded scarcity level equals 2 (the code given to 'High')
df['is_high_scarcity'] = df['water_scarcity_level_numeric'].eq(2)

In [10]:
# Registry of integer codes handed out to countries, in order of first appearance
country_mapping = {}
country_counter = 11  # next unused code; numbering begins at 11

def encode_country(country):
    """Return the integer code for ``country``, allocating a new one if needed.

    The first time a value is seen it is stored in ``country_mapping`` under the
    current ``country_counter``, and the counter is then advanced by one.
    A value that was seen before gets the code already stored for it.
    """
    global country_counter
    try:
        return country_mapping[country]
    except KeyError:
        # Unseen value: register it with the next free code
        country_mapping[country] = country_counter
        country_counter += 1
        return country_mapping[country]

# Encode the (lowercased) 'country' column element by element into integer codes
df['country_code'] = df['country'].map(encode_country)


In [11]:
# Ratio of total water consumption to annual precipitation.
# Both inputs are later overwritten by the StandardScaler step, so this cell
# has to run before that one to use the unscaled values.
df['water_use_efficiency'] = df['total_water_consumption_billion_cubic_meters'].div(
    df['rainfall_impact_annual_precipitation_in_mm']
)

In [12]:
# Sum of the agricultural, industrial and household shares.
# NOTE(review): these three columns are rescaled to add up to 100 in the
# earlier clip/normalize cell, so this value is expected to be ~100 on every
# row - confirm the column is still wanted.
df['total_sector_use_%'] = (
    df['agricultural_water_use_%']
    .add(df['industrial_water_use_%'])
    .add(df['household_water_use_%'])
)

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Continuous columns that get standardized by the scaler
num_cols = [
    'total_water_consumption_billion_cubic_meters',
    'per_capita_water_use_liters_per_day',
    'rainfall_impact_annual_precipitation_in_mm',
    'groundwater_depletion_rate_%',
]

In [15]:
# Create the standardizer; the arguments spell out StandardScaler's defaults
# (center each column on its mean and scale it to unit variance)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Learn per-column statistics from the data, then overwrite the same columns
# with their standardized values
df[num_cols] = scaler.fit(df[num_cols]).transform(df[num_cols])

In [17]:
# Inspect the first five rows of the standardized columns
df.head()[num_cols]

Unnamed: 0,total_water_consumption_billion_cubic_meters,per_capita_water_use_liters_per_day,rainfall_impact_annual_precipitation_in_mm,groundwater_depletion_rate_%
0,1.377109,1.645835,-0.564455,0.369538
1,-0.002003,0.276616,1.293606,-0.559027
2,1.195131,1.26352,1.568275,1.0941
3,1.054322,1.563041,0.212285,-1.382073
4,-0.306085,0.599782,-0.668192,-1.248416


In [18]:
# Plain-text dump of the first ten rows of the transformed frame
print(df.iloc[:10])

     country  year  total_water_consumption_billion_cubic_meters  \
0  Indonesia  2022                                      1.377109   
1  Indonesia  2024                                     -0.002003   
2      Spain  2000                                      1.195131   
3     Canada  2021                                      1.054322   
4     Brazil  2022                                     -0.306085   
5     Turkey  2018                                     -0.934079   
6     Turkey  2007                                     -0.321484   
7    Germany  2014                                      1.140882   
8      Spain  2020                                     -0.942587   
9     France  2014                                      0.090463   

   per_capita_water_use_liters_per_day water_scarcity_level  \
0                             1.645835                  Low   
1                             0.276616                 High   
2                             1.263520               Medium   