# **Installs**

In [None]:
!pip install tensorflow-datasets==4.8 --quiet
!pip install tensorflow==2.15.0 --quiet #15 13
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-text==2.15.0 --quiet #15
!pip install transformers==4.17 --quiet
!pip install pydot --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Imports**

In [None]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import sklearn as sk
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from nltk.data import find
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model

In [None]:
from transformers import BertTokenizer, TFBertModel

from transformers import logging
# Restrict the transformers library's log output to errors only.
logging.set_verbosity_error()

In [None]:
# Make TensorFlow run tf.function-wrapped code eagerly for the rest of the session.
# NOTE(review): the reason for this setting is not recorded in the notebook - confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
import pandas as pd

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# **Data Load**

In [None]:
# Colab-only: mount Google Drive at /content/drive, which is where DATA_PATH points.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that the CSV inputs below are read from.
DATA_PATH = '/content/drive/MyDrive/capstone_modeling/data'

In [None]:
# Pretrained checkpoint shared by the tokenizer and the encoder.
model_name =  'bert-base-cased'

# Tokenizer and TensorFlow BERT encoder loaded from that checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True is passed through to the model configuration.
# NOTE(review): the cells visible here only read output index 1 - confirm the hidden states are needed.
bert_model = TFBertModel.from_pretrained(model_name, output_hidden_states = True)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

In [None]:
# Read the labelled training table.
train_df = pd.read_csv(f"{DATA_PATH}/train_dataset_w59types.csv")

# Distinct-value count for every column.
unique_counts = train_df.nunique()
print("\nUnique Value Counts:")
print(unique_counts)

# Every row whose project_name appears more than once.
# A bare name in the middle of a cell is never rendered, so inspect this frame
# in its own cell (or with display()) when needed.
is_repeated_name = train_df.duplicated('project_name', keep=False)
duplicate_projects = train_df[is_repeated_name]

# Row count per target type, most frequent first.
type_df = train_df["type"]
type_counts = type_df.value_counts().reset_index()
type_counts.columns = ["type", 'count']

type_counts


Unique Value Counts:
project_id                                7234
project_name                              7168
methodologyorprotocol                      292
fully_harmonized_methodologyorprotocol     277
region                                       8
voluntary_registry                           4
type_from_registry                          99
project_developer                         2512
arborwa_project                              3
type                                        59
rule-based_type_mapping_first               50
dtype: int64


Unnamed: 0,type,count
0,Cookstoves,1147
1,Wind,811
2,Improved Forest Management,605
3,Hydropower,423
4,Afforestation/Reforestation,378
5,Manure Methane Digester,359
6,Clean Water,330
7,Landfill Methane,306
8,Solar - Centralized,258
9,Rice Emission Reductions,234


In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none of it runs.
# NOTE(review): the comments inside mention a threshold of 5, but the filter is `< 2` -
# confirm which one was intended before re-enabling.
'''
# Projects with types lower than 5 counts
type_counts = train_df["type"].value_counts()

# Identify the types with fewer than 5 counts
types_to_drop = type_counts[type_counts < 2].index
print(types_to_drop.shape)
# Drop the records with these types from train_df
train_df_new = train_df[~train_df["type"].isin(types_to_drop)]
print(train_df_new.shape)
'''

'\n# Projects with types lower than 5 counts\ntype_counts = train_df["type"].value_counts()\n\n# Identify the types with fewer than 5 counts\ntypes_to_drop = type_counts[type_counts < 2].index\nprint(types_to_drop.shape)\n# Drop the records with these types from train_df\ntrain_df_new = train_df[~train_df["type"].isin(types_to_drop)]\nprint(train_df_new.shape)\n'

In [None]:
# Split the table into the target column and the remaining feature columns.
Y_train_df = train_df['type']
X_train_df = train_df.drop(columns='type')

In [None]:
# Hold out 10% of the rows for validation, stratified on the target so both
# splits keep the same class proportions. train_test_split is already imported
# in the imports cell at the top of the notebook.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_df,
    Y_train_df,
    test_size=0.1,
    stratify=Y_train_df,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_val, Y_train, Y_val)))

(6510, 10) (724, 10) (6510,) (724,)


In [None]:
# Number of distinct target values present in the training split.
len(Y_train.unique())

59

In [None]:
# project_id values of the training split, used below to select rows from the preprocessed table.
train_proj_ids = list(X_train.project_id)

In [None]:
# project_id values of the validation split, used below to select rows from the preprocessed table.
val_proj_ids = list(X_val.project_id)

In [None]:
# Preprocessed projects table; the text, categorical and `vrod_v11_type` columns used below come from here.
data_preprocessed = pd.read_csv(f'{DATA_PATH}/projects_table_rules_final_ml_roshni.csv')

In [None]:
# Keep only the projects that belong to the train or validation split.
# .copy() makes this an independent frame, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning.
data_preprocessed_no_small_classes = data_preprocessed[
    data_preprocessed.project_id.isin(train_proj_ids + val_proj_ids)
].copy()

In [None]:
# Distinct target classes in sorted order (np.unique already returns a sorted
# result, so no separate sort is needed).
target_names = list(np.unique(data_preprocessed_no_small_classes['vrod_v11_type']))
# Class name -> integer id, numbered in that sorted order.
target_labels = {name: idx for idx, name in enumerate(target_names)}

def generate_integer_labels(df, label_map=None):
  """Return a copy of `df` with an integer `type_label` column.

  Args:
    df: DataFrame with a `vrod_v11_type` column.
    label_map: optional mapping from class name to integer id. When omitted,
      the notebook-level `target_labels` dict is used.

  Returns:
    A new DataFrame; `df` itself is left untouched, so passing a slice of
    another frame no longer triggers SettingWithCopyWarning. Class names
    missing from the mapping become NaN.
  """
  if label_map is None:
    label_map = target_labels
  labelled = df.copy()
  labelled['type_label'] = labelled['vrod_v11_type'].map(label_map)
  return labelled

# Attach the integer `type_label` column to the filtered table.
data_preprocessed_no_small_classes = generate_integer_labels(data_preprocessed_no_small_classes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['type_label'] = df['vrod_v11_type'].map(target_labels)


In [None]:
# Rows of the preprocessed table that belong to the training split.
data_train = data_preprocessed_no_small_classes[data_preprocessed_no_small_classes.project_id.isin(train_proj_ids)]

In [None]:
# Rows of the preprocessed table that belong to the validation split.
# .copy() makes this an independent frame, so adding the input_text column to
# it later does not raise pandas' SettingWithCopyWarning.
data_valid = data_preprocessed_no_small_classes[
    data_preprocessed_no_small_classes.project_id.isin(val_proj_ids)
].copy()

In [None]:
# Undersampling most frequent classes: cap every class at `class_size` rows.
class_size = 800

# random_state pins the draw so the sampled training set is identical on every run.
res = data_train.groupby('vrod_v11_type').apply(
    lambda x: x.sample(n=min(class_size, len(x)), random_state=42)
)
res = res.reset_index(drop=True)
print(res['vrod_v11_type'].value_counts())

vrod_v11_type
Cookstoves                                           800
Wind                                                 730
Improved Forest Management                           544
Hydropower                                           381
Afforestation/Reforestation                          340
Manure Methane Digester                              323
Clean Water                                          297
Landfill Methane                                     275
Solar - Centralized                                  228
Rice Emission Reductions                             213
Ozone Depleting Substances Recovery & Destruction    211
Biomass                                              200
REDD+                                                196
Community Boreholes                                  186
Biodigesters                                         175
Sustainable Agriculture                              119
Bundled Energy Efficiency                            112
Mine Methane Capt

In [None]:
# Oversampling lower frequency classes: top every class up to `class_size`
# rows by drawing extra rows from that class with replacement.
lst = [res]
for class_index, group in res.groupby('vrod_v11_type'):
    # A class already at class_size draws 0 extra rows.
    # random_state pins the draw so the balanced set is identical on every run.
    lst.append(group.sample(class_size-len(group), replace=True, random_state=42))
res_new = pd.concat(lst)

In [None]:
# From here on `data_train` refers to the class-balanced frame, not the original training rows.
data_train = res_new

In [None]:
def generate_input_text_col(df):
  """Return a copy of `df` with an `input_text` column for the BERT tokenizer.

  The text is the project name, harmonized methodology/protocol, registry
  project type and project developer, each cast to `str` and joined with '. '.

  Args:
    df: DataFrame containing the four source columns listed in `cols`.

  Returns:
    A new DataFrame; `df` itself is left untouched, so passing a slice of
    another frame no longer triggers SettingWithCopyWarning.
  """
  cols = ['project_name',
          'fully_harmonized_methodology_protocol',
          'project_type_from_the_registry',
          'project_developer']
  with_text = df.copy()
  # NOTE(review): missing values are stringified as the literal 'nan' by
  # astype(str) - confirm that is acceptable as model input.
  with_text['input_text'] = with_text[cols].apply(lambda row: '. '.join(row.values.astype(str)), axis=1)
  return with_text

In [None]:
# Add the concatenated `input_text` column to both splits.
data_train = generate_input_text_col(data_train)
data_valid = generate_input_text_col(data_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['input_text'] = df[cols].apply(lambda row: '. '.join(row.values.astype(str)), axis=1)


In [None]:
# Every tokenized sequence is padded or truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 124

# The experiments augment / append to the training data, so tokenization lives
# in one reusable helper.
def process_data(df):
  """Extract texts and labels from `df` and tokenize the texts.

  Args:
    df: DataFrame with `input_text` and `type_label` columns.

  Returns:
    A tuple `(texts, labels, encodings)`: the input texts as a list, the labels
    as a NumPy array, and the tokenizer output as TensorFlow tensors padded or
    truncated to MAX_SEQUENCE_LENGTH.
  """
  tokenizer_options = dict(
      truncation=True,
      padding='max_length',
      max_length=MAX_SEQUENCE_LENGTH,
      return_tensors='tf',
  )

  texts = list(df['input_text'])
  labels = np.array(df['type_label'])
  encodings = bert_tokenizer(texts, **tokenizer_options)

  return texts, labels, encodings

In [None]:
# Texts, integer labels and tokenizer encodings for each split.
(train_texts, train_labels, train_encodings) = process_data(data_train)
(valid_texts, valid_labels, valid_encodings) = process_data(data_valid)

In [None]:
# Columns that are one-hot encoded and appended to the BERT features.
categorical_features = ['region', 'voluntary_registry', 'arborwaproject']

In [None]:
# Model inputs and string targets for the balanced training set.
# Note: this rebinds `X_train`, which earlier held the train_test_split output.
X_train = data_train[categorical_features + ['input_text']]
y_train = data_train[['vrod_v11_type']]

In [None]:
# Preprocessor for categorical features.
# sparse_output=False makes the encoder return a dense array. It replaces the
# `sparse` keyword, which scikit-learn deprecated in 1.2 and removed in 1.4.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine all preprocessors into a ColumnTransformer.
# remainder='passthrough' forwards any other column present at fit time unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
        ],
    remainder='passthrough'
)

# Pipeline wrapping the preprocessor (it contains no resampling step).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [None]:
# Fit the preprocessing pipeline on the categorical columns only (input_text is dropped first).
# Note: `transformed` is the fitted pipeline object - `.transform` is called on it below - not transformed data.
transformed = pipeline.fit(X_train.drop(columns=['input_text']))



In [None]:
# One-hot encoded categorical features for the training set.
X_train_categoricals_ohe = transformed.transform(X_train)

In [None]:
# Inspect the encoded training matrix.
X_train_categoricals_ohe

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
# Model inputs and string targets for the validation set.
X_valid = data_valid[categorical_features + ['input_text']]
y_valid = data_valid[['vrod_v11_type']]

In [None]:
# Encode the validation categoricals with the pipeline fitted on the training set.
X_valid_categoricals_ohe = transformed.transform(X_valid)

In [None]:
# Inspect the encoded validation matrix.
X_valid_categoricals_ohe

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
def process_bert_encodings(data, batch_size):
  """Run `data` through BERT in batches and return the stacked output vectors.

  Reads the notebook-level `bert_model`, `bert_hidden_size` and `process_data`.

  Args:
    data: DataFrame accepted by `process_data`.
    batch_size: maximum number of rows per forward pass.

  Returns:
    A float64 array with one row per row of `data` and `bert_hidden_size`
    columns; an empty (0, bert_hidden_size) array when `data` has no rows.
  """
  # Start from an empty float64 block so the stacked result keeps the dtype and
  # width the previous implementation produced, including for empty input.
  chunks = [np.empty(shape=[0, bert_hidden_size])]

  # Step through the rows by position; .iloc keeps the slice positional
  # whatever index `data` carries (the balanced training frame has repeated labels).
  for start in range(0, len(data), batch_size):
    batch_inputs = process_data(data.iloc[start:start + batch_size])[2]

    # Element 1 of the model output is used as the per-text vector
    # (the pooled output according to the transformers documentation).
    bert_out = bert_model(batch_inputs)[1]

    chunks.append(bert_out.numpy())

  # Stack once at the end instead of re-copying the growing array on every batch.
  return np.vstack(chunks)

In [None]:
# Rows sent through BERT per forward pass.
batch_size = 80
# Width of the BERT output vectors; read as a global by process_bert_encodings to size its result.
bert_hidden_size = 768

# BERT features for the balanced training set.
cumul_array = process_bert_encodings(data_train, batch_size)

In [None]:
# Sanity check: the row count must match the BERT feature matrix before stacking.
X_train_categoricals_ohe.shape

(47200, 15)

In [None]:
# Sanity check: one row per training example, bert_hidden_size columns.
cumul_array.shape

(47200, 768)

In [None]:
# Final training matrix: one-hot categorical columns followed by the BERT features.
X_train_final = np.hstack((X_train_categoricals_ohe, cumul_array))

## **Model Training and Evaluation**

In [None]:
# Targets flattened to a 1-D array of class names, the form passed to `fit` below.
y_train.values.ravel()

array(['Advanced Refrigerants', 'Advanced Refrigerants',
       'Advanced Refrigerants', ..., 'Wind', 'Wind', 'Wind'], dtype=object)

In [None]:
# Define the logistic regression model (penalty=None: no regularization term).
logistic_regression = LogisticRegression(max_iter=1000, random_state=42, penalty=None)

# Fit the logistic regression model on the resampled data.
# NOTE(review): the recorded run hit the max_iter limit without converging;
# consider scaling the features or raising max_iter, as the warning suggests.
logistic_regression.fit(X_train_final, y_train.values.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# BERT features for the validation set.
cumul_array_valid = process_bert_encodings(data_valid, batch_size)

In [None]:
# Final validation matrix, with the same column layout as X_train_final.
X_valid_final = np.hstack((X_valid_categoricals_ohe, cumul_array_valid))

In [None]:
# Predictions on the (balanced) training data, used to gauge fit on seen examples.
y_pred_t = logistic_regression.predict(X_train_final)

In [None]:
# Per-class precision / recall / F1 on the training data, shown as a table with one row per class.
train_report = classification_report(y_train, y_pred_t, output_dict=True)
report_t_df = pd.DataFrame(train_report).transpose()
report_t_df

Unnamed: 0,precision,recall,f1-score,support
Advanced Refrigerants,1.000000,1.000000,1.000000,800.000000
Afforestation/Reforestation,0.983750,0.983750,0.983750,800.000000
Avoided Forest Conversion,1.000000,1.000000,1.000000,800.000000
Avoided Grassland Conversion,1.000000,1.000000,1.000000,800.000000
Bicycles,1.000000,1.000000,1.000000,800.000000
...,...,...,...,...
Wetland Restoration,1.000000,1.000000,1.000000,800.000000
Wind,0.939516,0.873750,0.905440,800.000000
accuracy,0.989343,0.989343,0.989343,0.989343
macro avg,0.989369,0.989343,0.989271,47200.000000


In [None]:
# Make predictions on the validation data (rows never used for fitting).

y_pred = logistic_regression.predict(X_valid_final)

In [None]:
# Overall accuracy on the validation split.
accuracy_score(y_valid, y_pred)

0.8218232044198895

In [None]:
# Per-class precision / recall / F1 on the validation split, as a nested dict.
# NOTE(review): the recorded run emitted sklearn metric warnings here - presumably from
# classes with no predicted samples; confirm and consider setting zero_division explicitly.
report = classification_report(y_valid, y_pred, output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Validation report as a table with one row per class (plus the aggregate rows).
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
Advanced Refrigerants,1.0,1.0,1.0,3.0
Afforestation/Reforestation,0.842105,0.842105,0.842105,38.0
Avoided Forest Conversion,1.0,1.0,1.0,1.0
Avoided Grassland Conversion,1.0,1.0,1.0,2.0
Bicycles,0.5,1.0,0.666667,1.0
Biodigesters,0.833333,0.789474,0.810811,19.0
Biomass,0.666667,0.727273,0.695652,22.0
Brick Manufacturing Emission Reductions,0.0,0.0,0.0,1.0
Bundled Energy Efficiency,0.625,0.769231,0.689655,13.0
Carbon-Absorbing Concrete,0.0,0.0,0.0,1.0
