In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load dataset
file_path = "/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv"
dataset = pd.read_csv(file_path)

# Display basic dataset information
print("Dataset Shape:", dataset.shape)
print("Dataset Info:")
print(dataset.info())

# Check for missing values
missing_values = dataset.isnull().sum()
print("\nMissing Values:")
print(missing_values[missing_values > 0])

# Drop columns with more than 70% missing values
threshold = 0.7
dataset_cleaned = dataset.loc[:, dataset.isnull().mean() < threshold]

# Impute missing values
num_cols = dataset_cleaned.select_dtypes(include=["float64", "int64"]).columns
cat_cols = dataset_cleaned.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Apply imputations safely using .loc
dataset_cleaned.loc[:, num_cols] = num_imputer.fit_transform(dataset_cleaned[num_cols])
dataset_cleaned.loc[:, cat_cols] = cat_imputer.fit_transform(dataset_cleaned[cat_cols])

# Drop duplicate rows
dataset_cleaned.drop_duplicates(inplace=True)

# Clean column names
dataset_cleaned.columns = dataset_cleaned.columns.str.strip().str.lower()

# Encode categorical variables
ohe = OneHotEncoder(sparse_output=False, drop="first")  # Use sparse_output=False for newer sklearn versions
encoded_features = ohe.fit_transform(dataset_cleaned[cat_cols])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(cat_cols))

# Scale numerical features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(dataset_cleaned[num_cols])
scaled_df = pd.DataFrame(scaled_data, columns=num_cols)

# Combine scaled numerical and encoded categorical features
final_dataset = pd.concat([scaled_df, encoded_df], axis=1)

# Save the cleaned dataset to a file
output_path = "preprocessedfile.csv"
final_dataset.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")


Dataset Shape: (50191, 79)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50191 entries, 0 to 50190
Data columns (total 79 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    50191 non-null  object 
 1   year                                       50191 non-null  int64  
 2   iso_code                                   42262 non-null  object 
 3   population                                 41019 non-null  float64
 4   gdp                                        15251 non-null  float64
 5   cement_co2                                 28863 non-null  float64
 6   cement_co2_per_capita                      25358 non-null  float64
 7   co2                                        29137 non-null  float64
 8   co2_growth_abs                             26981 non-null  float64
 9   co2_growth_prct                            26002 non-

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_cleaned.drop_duplicates(inplace=True)


Cleaned dataset saved to: preprocessedfile.csv


In [3]:
! pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [8]:

! pip install xgboost




In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv'  # Adjust the file path if necessary
data = pd.read_csv(file_path)

# Select target and features
target_column = 'co2_per_unit_energy'  # Set 'co2_per_unit_energy' as the target
features = [
    'gdp', 'cement_co2_per_capita', 'co2_per_capita', 
    'coal_co2', 'consumption_co2', 'gas_co2'
]  # Select relevant columns

# Drop rows with missing values in target and feature columns
data = data.dropna(subset=[target_column] + features)

# Prepare features (X) and target (y)
X = data[features]
y = data[target_column]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Save the trained model
joblib.dump(model, "emission_predictor.pkl")

# Predict emissions for new data
new_data = np.array([[50000, 2.5, 5.0, 300, 400, 200]])  # Example input
prediction = model.predict(new_data)
print(f"Predicted Emissions: {prediction[0]}")


Mean Squared Error: 0.0004677680122358915
R2 Score: 0.8732260562941278
Predicted Emissions: 0.2253621220588684


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns
columns = [
    'gdp', 'cement_co2_per_capita', 'co2_per_capita', 
    'co2_per_unit_energy', 'coal_co2', 'consumption_co2', 'gas_co2'
]
data = data[columns].dropna()  # Drop rows with missing values

# Define target and features
target_column = 'co2_per_unit_energy'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering: Adding polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}
model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
import joblib
joblib.dump(best_model, "emission_predictor_tuned.pkl")

# Predict emissions for new data
new_data = np.array([[50000, 2.5, 5.0, 300, 400, 200]])  # Example input
new_data_poly = poly.transform(new_data)
new_data_scaled = scaler.transform(new_data_poly)
prediction = best_model.predict(new_data_scaled)
print(f"Predicted Emissions: {prediction[0]} tons")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 0.0003756078030487731
R2 Score: 0.8982032092113675
Cross-validation R2 scores: [0.8928083  0.87166522 0.91100376 0.85972202 0.89218733]
Mean CV R2 Score: 0.8854773268566785
Predicted Emissions: 0.19630463421344757 tons




In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns
columns = [
    'gdp', 
    'co2_per_unit_energy', 'coal_co2', 'consumption_co2', 
]
data = data[columns].dropna()  # Drop rows with missing values

# Define target and features
target_column = 'co2_per_unit_energy'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering: Adding polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Save the polynomial transformer
joblib.dump(poly, "poly_transform.pkl")
print("Polynomial transformer saved as 'poly_transform.pkl'.")

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as 'scaler.pkl'.")

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}
model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
joblib.dump(best_model, "emission_predictor_tuned.pkl")
print("Tuned model saved as 'emission_predictor_tuned.pkl'.")

# Predict emissions for new data 
new_data = np.array([[50000,40, 20]])  # Example input
new_data_poly = poly.transform(new_data)
new_data_scaled = scaler.transform(new_data_poly)
prediction = best_model.predict(new_data_scaled)
print(f"Predicted Emissions: {prediction[0]} tons")


Polynomial transformer saved as 'poly_transform.pkl'.
Scaler saved as 'scaler.pkl'.
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 0.0014739802847503973
R2 Score: 0.621494132530408
Cross-validation R2 scores: [0.6005901  0.62648354 0.55629089 0.61445597 0.6442117 ]
Mean CV R2 Score: 0.6084064390451489
Tuned model saved as 'emission_predictor_tuned.pkl'.
Predicted Emissions: 0.5458201766014099 tons




In [20]:
! pip install streamlit

Collecting streamlit
  Using cached streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Using cached GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Using cached pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting json

In [22]:
! pip install transformers datasets torch




In [None]:
from kafka import KafkaConsumer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json

# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Kafka consumer setup
kafka_topic = "real_time_emission_data"  # Replace with your Kafka topic name
bootstrap_servers = "localhost:9092"  # Replace with your Kafka server details
consumer = KafkaConsumer(
    kafka_topic,
    bootstrap_servers=bootstrap_servers,
    value_deserializer=lambda m: json.loads(m.decode("utf-8")),
    auto_offset_reset="earliest",
    group_id="llm_suggestions_group"
)

# Function to generate suggestions
def generate_suggestion(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Process incoming Kafka messages
print("Listening for real-time data...")
for message in consumer:
    real_time_data = message.value  # Assuming the producer sends JSON data
    print(f"Received data: {real_time_data}")

    # Construct the prompt dynamically
    prompt = (
        f"Data: GDP is {real_time_data['gdp']}, cement CO2 per capita is {real_time_data['cement_co2_per_capita']}, "
        f"CO2 per capita is {real_time_data['co2_per_capita']}, coal CO2 is {real_time_data['coal_co2']}, "
        f"gas CO2 is {real_time_data['gas_co2']}. Suggest actionable steps to reduce emissions."
    )

    
    suggestion = generate_suggestion(prompt)
    print(f"Generated Suggestion: {suggestion}")

    

2025-03-19 16:56:01.754689: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
import tensorflow as tf


print("TensorFlow version:", tf.__version__)


print("Available devices:", tf.config.list_physical_devices())


2025-03-19 17:19:59.828658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.11.0
Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [3]:
from kafka import KafkaConsumer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import joblib
import json

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


ml_model_path = "/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/emission_predictor_tuned.pkl"  # Path to the saved model
ml_model = joblib.load(ml_model_path)
scaler = joblib.load("/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/scaler.pkl")  # Ensure you save this after training
poly = joblib.load("/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/poly_transform.pkl")  # Save the PolynomialFeatures instance after training

# Kafka consumer setup
kafka_topic = "carbonfootprint"  # Replace with your Kafka topic name
bootstrap_servers = "localhost:9092"  # Replace with your Kafka server details
consumer = KafkaConsumer(
    kafka_topic,
    bootstrap_servers=bootstrap_servers,
    value_deserializer=lambda m: json.loads(m.decode("utf-8")),
    auto_offset_reset="earliest",
    group_id="llm_suggestions_group"
)

# Function to generate LLM suggestions
def generate_suggestion(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


print("Listening for real-time data...")
for message in consumer:
    real_time_data = message.value  
    print(f"Received data: {real_time_data}")

    
    try:
        features = [
            real_time_data["gdp"],
            real_time_data["cement_co2_per_capita"],
            real_time_data["co2_per_capita"],
            real_time_data["coal_co2"],
            real_time_data["gas_co2"]
        ]
        features = np.array([features])  

        
        features_poly = poly.transform(features)
        features_scaled = scaler.transform(features_poly)

        
        prediction = ml_model.predict(features_scaled)[0]

        
        prompt = (
            f"Data: GDP is {real_time_data['gdp']}, cement CO2 per capita is {real_time_data['cement_co2_per_capita']}, "
            f"CO2 per capita is {real_time_data['co2_per_capita']}, coal CO2 is {real_time_data['coal_co2']}, "
            f"gas CO2 is {real_time_data['gas_co2']}. The ML model predicts CO2 per unit energy to be {prediction:.2f} tons. "
            f"Suggest actionable steps to reduce emissions based on this data."
        )

        # Generate LLM suggestions
        suggestion = generate_suggestion(prompt)
        print(f"ML Prediction: {prediction:.2f} tons of CO2 per unit energy")
        print(f"Generated Suggestion: {suggestion}")

    except KeyError as e:
        print(f"Missing data in real-time message: {e}")

    except Exception as ex:
        print(f"An error occurred: {ex}")


Listening for real-time data...


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns
columns = [
    'gdp', 
    'co2_per_unit_energy', 'coal_co2', 'consumption_co2', 
]
data = data[columns].dropna()  # Drop rows with missing values

# Define target and features
target_column = 'co2_per_unit_energy'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering: Adding polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Save the polynomial transformer
joblib.dump(poly, "poly_transform.pkl")
print("Polynomial transformer saved as 'poly_transform.pkl'.")

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as 'scaler.pkl'.")

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}
model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
joblib.dump(best_model, "emission_predictor_tuned.pkl")
print("Tuned model saved as 'emission_predictor_tuned.pkl'.")




Polynomial transformer saved as 'poly_transform.pkl'.
Scaler saved as 'scaler.pkl'.
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 0.0014739802847503973
R2 Score: 0.621494132530408
Cross-validation R2 scores: [0.6005901  0.62648354 0.55629089 0.61445597 0.6442117 ]
Mean CV R2 Score: 0.6084064390451489
Tuned model saved as 'emission_predictor_tuned.pkl'.


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Downloads/owid-co2-data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns
columns = [
    'gdp', 
    'co2_per_unit_energy', 
    'coal_co2', 
    'consumption_co2',
]
data = data[columns].dropna()  # Drop rows with missing values

# Define target and features
target_column = 'co2_per_unit_energy'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering: Adding polynomial features (degree=3 for better accuracy)
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Save the polynomial transformer
joblib.dump(poly, "poly_transform.pkl")
print("Polynomial transformer saved as 'poly_transform.pkl'.")

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as 'scaler.pkl'.")

# Hyperparameter tuning with extended GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 20],
}
model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")

# Cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=10, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the tuned model
joblib.dump(best_model, "emission_predictor_tuned.pkl")
print("Tuned model saved as 'emission_predictor_tuned.pkl'.")


Polynomial transformer saved as 'poly_transform.pkl'.
Scaler saved as 'scaler.pkl'.
Fitting 10 folds for each of 1458 candidates, totalling 14580 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}
Mean Squared Error: 0.0014851677721024338
Mean Absolute Error: 0.025614878264096405
R2 Score: 0.6186212789048897
Cross-validation R2 scores: [0.66328338 0.57391777 0.68482285 0.59750992 0.56702411 0.61401594
 0.58074209 0.65854196 0.66871269 0.71560508]
Mean CV R2 Score: 0.6324175794773613
Tuned model saved as 'emission_predictor_tuned.pkl'.


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/suggestion_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns and define the target
columns = ['CO2_ppm', 'NO2_ppm', 'SO2_ppm', 'Temperature_C', 'Humidity_%', 
           'Environmental_Score', 'Social_Score', 'Governance_Score', 'AQI']
data = data[columns].dropna()  # Drop rows with missing values

# Define target (e.g., AQI) and features
target_column = 'AQI'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with feature scaling and polynomial feature expansion
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor(random_state=42))
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1.0],
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
joblib.dump(best_model, "aqi_predictor_tuned.pkl")
print("Model saved as 'aqi_predictor_tuned.pkl'")

# Predict AQI for new data
new_data = np.array([[400, 50, 25, 22, 65, 70, 80, 85]])  # Example input for prediction
prediction = best_model.predict(new_data)
print(f"Predicted AQI: {prediction[0]}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits


python(56093) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56094) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56095) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56096) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56097) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56098) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56099) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56100) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56101) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56105) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(56106) Malloc

Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}
Mean Squared Error: 0.03021131585977957
R2 Score: 0.997635359713968
Cross-validation R2 scores: [0.99777912 0.99616047 0.99699923 0.99694092 0.99823542]
Mean CV R2 Score: 0.9972230319786248
Model saved as 'aqi_predictor_tuned.pkl'
Predicted AQI: 18.362680435180664




In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/suggestion_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns and define the target
columns = ['CO2_ppm', 'NO2_ppm', 'SO2_ppm', 'Temperature_C', 'Humidity_%', 
           'Environmental_Score', 'Social_Score', 'Governance_Score', 'AQI']
data = data[columns].dropna()  # Drop rows with missing values

# Define target (e.g., AQI) and features
target_column = 'AQI'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with feature scaling and polynomial feature expansion
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor(random_state=42))
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1.0],
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
joblib.dump(best_model, "aqi_predictor_tuned.pkl")
print("Model saved as 'aqi_predictor_tuned.pkl'")

# Save the individual components (scaler and polynomial transformer)
scaler = best_model.named_steps['scaler']
poly_transformer = best_model.named_steps['poly']

joblib.dump(scaler, "scaler.pkl")
joblib.dump(poly_transformer, "poly_transform.pkl")
print("Scaler and Polynomial Transformer saved as 'scaler.pkl' and 'poly_transform.pkl'")

# Predict AQI for new data
new_data = np.array([[400, 50, 25, 22, 65, 70, 80, 85]])  # Example input for prediction
new_data_poly = poly_transformer.transform(new_data)
new_data_scaled = scaler.transform(new_data_poly)
prediction = best_model.named_steps['model'].predict(new_data_scaled)
print(f"Predicted AQI: {prediction[0]}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits


python(60321) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60340) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60341) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60342) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60343) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60344) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60345) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60346) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60347) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60348) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(60349) Malloc

Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}
Mean Squared Error: 0.03021131585977957
R2 Score: 0.997635359713968
Cross-validation R2 scores: [0.99777912 0.99616047 0.99699923 0.99694092 0.99823542]
Mean CV R2 Score: 0.9972230319786248
Model saved as 'aqi_predictor_tuned.pkl'
Scaler and Polynomial Transformer saved as 'scaler.pkl' and 'poly_transform.pkl'
Predicted AQI: 18.362680435180664




In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/suggestion_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns and define the target
columns = ['CO2_ppm', 'NO2_ppm', 'SO2_ppm', 'Temperature_C', 'Humidity_%', 
           'Environmental_Score', 'Social_Score', 'Governance_Score', 'AQI']
data = data[columns].dropna()  # Drop rows with missing values

# Define target (e.g., AQI) and features
target_column = 'AQI'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simplified pipeline with reduced complexity
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Removed PolynomialFeatures for simplicity
    ('model', xgb.XGBRegressor(
        random_state=42, 
        reg_alpha=1,  # L1 regularization to reduce overfitting
        reg_lambda=1,  # L2 regularization to reduce overfitting
        n_jobs=-1
    ))
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],  # Added column sampling for regularization
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the model
joblib.dump(best_model, "aqi_predictor_tuned.pkl")
print("Model saved as 'aqi_predictor_tuned.pkl'")

# Predict AQI for new data
new_data = np.array([[400, 50, 25, 22, 65, 70, 80, 85]])  # Example input for prediction
new_data_scaled = best_model.named_steps['scaler'].transform(new_data)
prediction = best_model.named_steps['model'].predict(new_data_scaled)
print(f"Predicted AQI: {prediction[0]}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits


python(13046) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13047) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13048) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13049) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13050) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13051) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13052) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13053) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13054) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13055) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(13056) Malloc

Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 1.0}
Mean Squared Error: 0.04267909681377063
R2 Score: 0.9966595062536929
Cross-validation R2 scores: [0.99670763 0.9947836  0.9969761  0.99680704 0.99676416]
Mean CV R2 Score: 0.996407705878319
Model saved as 'aqi_predictor_tuned.pkl'
Predicted AQI: 18.04737091064453




In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = '/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/suggestion_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Select relevant columns and define the target
columns = ['CO2_ppm', 'NO2_ppm', 'SO2_ppm', 'Temperature_C', 'Humidity_%', 
           'Environmental_Score', 'Social_Score', 'Governance_Score', 'AQI']
data = data[columns].dropna()  # Drop rows with missing values

# Define target (e.g., AQI) and features
target_column = 'AQI'
features = [col for col in columns if col != target_column]

X = data[features]
y = data[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline including PolynomialFeatures
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for feature scaling
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # PolynomialFeatures for interaction terms
    ('model', xgb.XGBRegressor(
        random_state=42, 
        reg_alpha=1,  # L1 regularization
        reg_lambda=1,  # L2 regularization
        n_jobs=-1
    ))
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],  # Column sampling for regularization
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the tuned model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 Score: {np.mean(cv_scores)}")

# Save the scaler
scaler = best_model.named_steps['scaler']
joblib.dump(scaler, "/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/scaler.pkl")
print("Scaler saved as 'scaler.pkl'")

# Save the PolynomialFeatures transformer
poly_transformer = best_model.named_steps['poly']
joblib.dump(poly_transformer, "/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/poly_transform.pkl")
print("PolynomialFeatures transformer saved as 'poly_transform.pkl'")

# Save the model
xgb_model = best_model.named_steps['model']
joblib.dump(xgb_model, "/Users/kdn_aikothalavanya/Desktop/KPMG Projects/carbonfootprint/xgb_model.pkl")
print("Model saved as 'xgb_model.pkl'")

# Predict AQI for new data
new_data = np.array([[400, 50, 25, 22, 65, 70, 80, 85]])  # Example input for prediction
new_data_scaled = scaler.transform(new_data)  # Scale the new data
new_data_poly = poly_transformer.transform(new_data_scaled)  # Apply PolynomialFeatures
prediction = xgb_model.predict(new_data_poly)  # Predict AQI
print(f"Predicted AQI: {prediction[0]}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits


python(66373) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66374) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66375) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66376) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66377) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66384) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66385) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66399) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66400) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66401) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(66402) Malloc

Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 1.0}
Mean Squared Error: 0.08781916920396607
R2 Score: 0.993126391900658
Cross-validation R2 scores: [0.99442574 0.99220303 0.99503897 0.99510709 0.99441152]
Mean CV R2 Score: 0.9942372687021276
Scaler saved as 'scaler.pkl'
PolynomialFeatures transformer saved as 'poly_transform.pkl'
Model saved as 'xgb_model.pkl'
Predicted AQI: 17.456838607788086


