# Optimized Data Analysis and Modeling

### STEP 1 - IMPORT LIBRARIES

In [8]:

# Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings

# Notebook-wide display and plotting configuration
pd.options.display.max_columns = None  # no cap on the number of columns shown for a DataFrame
plt.style.use('ggplot')
sns.set(style="whitegrid")  # invoked after plt.style.use, i.e. on top of the ggplot style
warnings.filterwarnings('ignore')  # suppresses every warning from this point on

print("Libraries imported successfully!")


Libraries imported successfully!


### STEP 2 - LOAD AND INSPECT THE DATASET

In [9]:

# Load the dataset from a relative path; change DATA_PATH to point at another file.
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH)

# df.info() prints its report and returns None, so it is called as a plain statement
# instead of being packed into a tuple (which displayed a stray None and forced a
# plain-text repr). The trailing df.head() is the cell's displayed value.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Country                                         5000 non-null   object 
 1   Year                                            5000 non-null   int64  
 2   Total Water Consumption (Billion Cubic Meters)  5000 non-null   float64
 3   Per Capita Water Use (Liters per Day)           5000 non-null   float64
 4   Water Scarcity Level                            5000 non-null   object 
 5   Agricultural Water Use (%)                      5000 non-null   float64
 6   Industrial Water Use (%)                        5000 non-null   float64
 7   Household Water Use (%)                         5000 non-null   float64
 8   Rainfall Impact (Annual Precipitation in mm)    5000 non-null   float64
 9   Groundwater Depletion Rate (%)           

(     Country  Year  Total Water Consumption (Billion Cubic Meters)  \
 0  Indonesia  2022                                          895.15   
 1  Indonesia  2024                                          502.89   
 2      Spain  2000                                          843.39   
 3     Canada  2021                                          803.34   
 4     Brazil  2022                                          416.40   
 
    Per Capita Water Use (Liters per Day) Water Scarcity Level  \
 0                                 489.73                  Low   
 1                                 311.95                 High   
 2                                 440.09               Medium   
 3                                 478.98                 High   
 4                                 353.91                 High   
 
    Agricultural Water Use (%)  Industrial Water Use (%)  \
 0                       20.78                     13.75   
 1                       48.51                      8.

### STEP 3 - DATA OVERVIEW AND INFORMATION

In [10]:

# Descriptive statistics alongside the per-column count of missing values
summary_stats = df.describe()
missing_counts = df.isna().sum()
summary_stats, missing_counts


(              Year  Total Water Consumption (Billion Cubic Meters)  \
 count  5000.000000                                     5000.000000   
 mean   2012.204400                                      503.459606   
 std       7.205484                                      284.457947   
 min    2000.000000                                       10.050000   
 25%    2006.000000                                      259.677500   
 50%    2012.000000                                      499.515000   
 75%    2018.000000                                      751.797500   
 max    2024.000000                                      999.680000   
 
        Per Capita Water Use (Liters per Day)  Agricultural Water Use (%)  \
 count                            5000.000000                 5000.000000   
 mean                              276.034040                   50.281704   
 std                               129.853417                   17.397782   
 min                                50.110000      

### STEP 4 - DATA PREPROCESSING

In [11]:
# Preprocessing: separate the target from the features, then impute and scale
# the numeric feature columns.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

target_column = 'Total Water Consumption (Billion Cubic Meters)'  # regression target

# Features (X) and target (y)
X = df.drop(columns=target_column)
y = df[target_column]

# Select every numeric dtype (not only int64/float64) so a numeric column stored
# at a different width is not silently left out of the transformer.
numeric_features = X.select_dtypes(include='number').columns

# Mean-impute missing values first, then standardise each column.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Only `numeric_features` are routed through a transformer.
# NOTE(review): no transformer is registered for the non-numeric columns and
# ColumnTransformer's `remainder` is left at its default; confirm that leaving
# those columns out of X_processed is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# NOTE(review): the preprocessor is fitted on the full dataset here, before the
# train/test split performed later in the notebook, so the imputation means and
# scaling statistics are computed from rows that end up in the test set as well.
X_processed = preprocessor.fit_transform(X)



### STEP 5 - MODEL SELECTION AND EVALUATION

In [12]:

# A single seed drives both the split and the stochastic estimators, so repeated
# runs of this cell on the same data produce the same metrics.
SEED = 42

# Train-test split: 20% of the rows are held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=SEED
)

# Candidate models. The two ensemble regressors are randomised, so they are
# seeded explicitly; LinearRegression takes no seed.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=SEED),
    'Gradient Boosting': GradientBoostingRegressor(random_state=SEED)
}

# Fit each model on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation metrics; RMSE is the square root of the MSE computed above
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Display results: one column per model, one row per metric
pd.DataFrame(results)


Unnamed: 0,Linear Regression,Random Forest,Gradient Boosting
MSE,78829.51228,82000.678621,81367.361505
RMSE,280.765939,286.357606,285.249648
R2,0.000886,-0.039306,-0.031279
