# **Installs**

In [None]:
# Install pinned package versions for this notebook; --quiet suppresses most pip output.
!pip install tensorflow-datasets==4.8 --quiet
!pip install tensorflow==2.15.0 --quiet #15 13
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-text==2.15.0 --quiet #15
!pip install transformers==4.17 --quiet
!pip install pydot --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Imports**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

In [None]:
from transformers import BertTokenizer, TFBertModel

from transformers import logging
# Only surface error-level messages from the transformers library.
logging.set_verbosity_error()

In [None]:
# Execute tf.function-wrapped code eagerly rather than as traced graphs.
# NOTE(review): why eager mode is required is not visible in this notebook — confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
import pandas as pd

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# **Data Load**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used in this notebook live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project data directory on the mounted Drive.
# NOTE(review): not referenced by the cells visible here — the CSVs are read via hard-coded
# paths in the parent directory; confirm whether this constant is still needed.
DATA_PATH = '/content/drive/MyDrive/capstone_modeling/data'

In [None]:
# Pretrained checkpoint shared by the tokenizer and the encoder.
model_name = 'bert-base-cased'

bert_tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True asks the model to also return the per-layer hidden states.
bert_model = TFBertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

In [None]:
# Load the (oversampled) training split and the validation split from Drive.
data_train, data_valid = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/capstone_modeling/data_train_oversampled_kh.csv',
        '/content/drive/MyDrive/capstone_modeling/data_valid_kh.csv',
    ),
)

  data_train = pd.read_csv('/content/drive/MyDrive/capstone_modeling/data_train_oversampled_kh.csv')


In [None]:
# Every tokenized sequence is truncated / padded to exactly this many tokens.
MAX_SEQUENCE_LENGTH = 124


def process_data(df):
  """Pull texts and labels out of ``df`` and tokenize the texts with the BERT tokenizer.

  Reused throughout the experiments, which augment / append to the training data.

  Args:
    df: DataFrame providing ``input_text`` and ``type_label`` columns.

  Returns:
    Tuple ``(texts, labels, encodings)``: the texts as a list, the labels as a
    NumPy array, and the tokenizer output as TensorFlow tensors, truncated and
    padded to ``MAX_SEQUENCE_LENGTH``.
  """
  raw_texts = list(df['input_text'])
  label_array = np.array(df['type_label'])

  tokenized = bert_tokenizer(
      raw_texts,
      max_length=MAX_SEQUENCE_LENGTH,
      padding='max_length',
      truncation=True,
      return_tensors='tf',
  )

  return raw_texts, label_array, tokenized

In [None]:
# Tokenize the full training and validation frames once up front.
train_texts, train_labels, train_encodings = process_data(data_train)
valid_texts, valid_labels, valid_encodings = process_data(data_valid)

In [None]:
# Columns treated as categorical; these are the ones routed to the OneHotEncoder.
categorical_features = ['region', 'voluntary_registry', 'arborwaproject']

In [None]:
# Training inputs: the categorical columns plus the free-text column.
X_train = data_train[[*categorical_features, 'input_text']]
# Training target, kept as a single-column DataFrame.
y_train = data_train[['vrod_v11_type']]

In [None]:
# Preprocessor for categorical features.
# - handle_unknown='ignore': categories not seen during fit encode as all zeros
#   instead of raising at transform time.
# - sparse_output=False: return a dense ndarray so the result can be stacked with
#   other dense feature arrays. (`sparse_output` replaces the `sparse` keyword,
#   which was deprecated in scikit-learn 1.2 and removed in 1.4.)
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine all preprocessors into a ColumnTransformer.
# remainder='passthrough' forwards any fitted column not listed above unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
        ],
    remainder='passthrough'
)

# Pipeline for preprocessing (currently a single step: the column transformer).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [None]:
# Fit the preprocessing pipeline on the categorical columns only (free-text column excluded).
# Note: `transformed` holds the fitted pipeline itself, not transformed data.
transformed = pipeline.fit(X_train[categorical_features])



In [None]:
# One-hot encode the training categoricals, passing exactly the columns the pipeline was fitted on.
X_train_categoricals_ohe = transformed.transform(X_train[categorical_features])

In [None]:
# Preview the one-hot encoded training matrix.
X_train_categoricals_ohe

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [None]:
# Validation inputs and target, selected the same way as the training split.
X_valid = data_valid[[*categorical_features, 'input_text']]
y_valid = data_valid[['vrod_v11_type']]

In [None]:
# Encode the validation categoricals with the pipeline fitted on the training data (no refit).
X_valid_categoricals_ohe = transformed.transform(X_valid[categorical_features])

In [None]:
# Preview the one-hot encoded validation matrix.
X_valid_categoricals_ohe

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [None]:
# Number of rows tokenized and pushed through BERT per step.
batch_size = 160
# Width of each BERT feature vector collected below.
bert_hidden_size = 768

# Seed the list with an empty float64 (0, hidden) array so the stacked result keeps
# the same dtype as before and is still well-defined when data_train has zero rows.
pooled_batches = [np.empty(shape=[0, bert_hidden_size])]

# Stepping by batch_size visits every row exactly once; the last slice is simply
# shorter when len(data_train) is not a multiple of batch_size.
for start in range(0, len(data_train), batch_size):
  batch_inputs = process_data(data_train[start:start + batch_size])[2]

  # Element 1 of the model output is used as the per-example feature vector
  # (the pooled output for TFBertModel — confirm against the transformers docs).
  bert_out = bert_model(batch_inputs)[1]

  pooled_batches.append(bert_out.numpy())

# Stack once at the end instead of re-copying the growing array on every iteration.
cumul_array = np.vstack(pooled_batches)

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [None]:
# Sanity check: the row count must match cumul_array's before the two are stacked side by side.
X_train_categoricals_ohe.shape

(36800, 15)

In [None]:
# Sanity check: one BERT feature row per training example.
cumul_array.shape

(36800, 768)

In [None]:
# Final training matrix: one-hot categorical columns followed by the BERT feature columns.
X_train_final = np.concatenate((X_train_categoricals_ohe, cumul_array), axis=1)

## **Logistic Regression: Training and Evaluation**

In [None]:
# Labels flattened to 1-D, the form passed to LogisticRegression.fit.
y_train.values.ravel()

array(['Advanced Refrigerants', 'Advanced Refrigerants',
       'Advanced Refrigerants', ..., 'Wind', 'Wind', 'Wind'], dtype=object)

In [None]:
# Unregularized logistic regression (penalty=None) with a fixed seed.
logistic_regression = LogisticRegression(
    penalty=None,
    max_iter=1000,
    random_state=42,
)

# Fit on the oversampled training features; labels are flattened to 1-D.
# NOTE(review): the recorded run stopped at max_iter with a convergence warning —
# consider scaling the features or raising max_iter.
logistic_regression.fit(X_train_final, y_train.values.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def process_bert_encodings(data, batch_size):
  """Compute BERT feature vectors for every row of ``data``, one batch at a time.

  Args:
    data: DataFrame accepted by ``process_data`` (must provide the columns it reads).
    batch_size: number of rows tokenized and passed through ``bert_model`` per step.

  Returns:
    float64 ndarray of shape ``(len(data), bert_hidden_size)``; it has zero rows
    when ``data`` is empty.
  """
  # Seed the list with an empty float64 (0, hidden) array so the stacked result
  # keeps the same dtype as before and is well-defined for empty input.
  batch_outputs = [np.empty(shape=[0, bert_hidden_size])]

  # Stepping by batch_size visits every row exactly once; the last slice is
  # simply shorter when len(data) is not a multiple of batch_size.
  for start in range(0, len(data), batch_size):
    batch_inputs = process_data(data[start:start + batch_size])[2]

    # Element 1 of the model output is used as the per-example feature vector
    # (the pooled output for TFBertModel — confirm against the transformers docs).
    bert_out = bert_model(batch_inputs)[1]

    batch_outputs.append(bert_out.numpy())

  # Stack once at the end instead of re-copying the growing array every iteration.
  return np.vstack(batch_outputs)

In [None]:
# BERT features for the validation split, computed in batches of 160 rows.
cumul_array_valid = process_bert_encodings(data_valid, batch_size=160)

In [None]:
# Final validation matrix: one-hot categorical columns followed by the BERT feature columns.
X_valid_final = np.concatenate((X_valid_categoricals_ohe, cumul_array_valid), axis=1)

In [None]:
# Predict on the training features themselves, to gauge fit on data the model has seen.
y_pred_t = logistic_regression.predict(X_train_final)

In [None]:
# Per-class precision / recall / F1 on the training set, tabulated with one row per class.
train_report = classification_report(y_train, y_pred_t, output_dict=True)
report_t_df = pd.DataFrame(train_report).T
report_t_df

Unnamed: 0,precision,recall,f1-score,support
Advanced Refrigerants,1.0,1.0,1.0,800.0
Afforestation/Reforestation,0.974194,0.94375,0.95873,800.0
Avoided Forest Conversion,1.0,1.0,1.0,800.0
Avoided Grassland Conversion,1.0,1.0,1.0,800.0
Biodigesters,0.98875,0.98875,0.98875,800.0
Biomass,0.902439,0.925,0.91358,800.0
Brick Manufacturing Emission Reductions,1.0,1.0,1.0,800.0
Bundled Energy Efficiency,0.943442,0.98,0.961373,800.0
Clean Water,0.968952,0.93625,0.95232,800.0
Community Boreholes,0.935252,0.975,0.954712,800.0


In [None]:
# Predict labels for the validation set using the fitted logistic regression.

y_pred = logistic_regression.predict(X_valid_final)

In [None]:
# Per-class metrics on the validation set, as a nested dict.
report = classification_report(
    y_valid,
    y_pred,
    output_dict=True,
)

In [None]:
# Tabulate the validation metrics with one row per class.
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
Advanced Refrigerants,1.0,1.0,1.0,3.0
Afforestation/Reforestation,0.810811,0.810811,0.810811,37.0
Avoided Forest Conversion,1.0,1.0,1.0,1.0
Avoided Grassland Conversion,1.0,1.0,1.0,2.0
Biodigesters,0.9,0.9,0.9,20.0
Biomass,0.608696,0.636364,0.622222,22.0
Brick Manufacturing Emission Reductions,0.333333,1.0,0.5,1.0
Bundled Energy Efficiency,0.846154,0.785714,0.814815,14.0
Clean Water,0.888889,0.969697,0.927536,33.0
Community Boreholes,0.833333,0.952381,0.888889,21.0
