In [1]:
!pip install transformers --quiet

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_df = pd.read_csv("../data/train_dataset.csv")

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7269 entries, 0 to 7268
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   project_name                    7269 non-null   object
 1   methodology_or_protocol         7269 non-null   object
 2   region                          7269 non-null   object
 3   voluntary_registry              7269 non-null   object
 4   project_type_from_the_registry  7269 non-null   object
 5   project_developer               7269 non-null   object
 6   arborwaproject                  7269 non-null   object
 7   scope                           7269 non-null   object
 8   type                            7269 non-null   object
dtypes: object(9)
memory usage: 511.2+ KB


In [5]:
# Get unique count of values in each column
unique_counts = train_df.apply(lambda x: x.nunique())
print("\nUnique Value Counts:")
print(unique_counts)


Unique Value Counts:
project_name                      7210
methodology_or_protocol            359
region                              13
voluntary_registry                   4
project_type_from_the_registry     106
project_developer                 2572
arborwaproject                       4
scope                                9
type                                78
dtype: int64


In [6]:
type_df = train_df["type"]
type_counts = type_df.value_counts().reset_index()
type_counts.columns = ["type", 'count']

type_counts

Unnamed: 0,type,count
0,Cookstoves,1166
1,Wind,831
2,Improved Forest Management,597
3,Hydropower,419
4,Afforestation/Reforestation,374
...,...,...
73,Improved irrigation management,1
74,Pneumatic Retrofit,1
75,Compost Addition to Rangeland,1
76,Grid Expansion & Mini-Grids,1


In [7]:
# Projects with types lower than 10 counts
type_counts = train_df["type"].value_counts()

# Identify the types with fewer than 10 counts
types_to_drop = type_counts[type_counts < 10].index
print(types_to_drop)
# Drop the records with these types from train_df
train_df_new = train_df[~train_df["type"].isin(types_to_drop)]
print(train_df_new.shape)

Index(['Shipping', 'Leak Detection & Repair in Gas Systems',
       'Solid Waste Separation', 'Feed Additives', 'Mass Transit',
       'University Campus Emission Reductions', 'Fleet Efficiency',
       'Plugging Oil & Gas Wells', 'Carbon-Absorbing Concrete', 'Biochar',
       'Bicycles', 'Carbon Capture in Concrete',
       'Road Construction Emission Reductions',
       'Carbon Capture & Enhanced Oil Recovery', 'Fuel Transport',
       'Nitrogen Management', 'SF6 Replacement', 'Weatherization',
       'N2O Destruction in Adipic Acid Production', 'Oil Recycling',
       'Aluminum Smelters Emission Reductions', 'HFC23 Destruction',
       'Lower Carbon Cement & Concrete',
       'Bundled Compost Production and Soil Application', 'Waste Reduction',
       'Refrigerant Leak Detection', 'Propylene Oxide Production',
       'Improved irrigation management', 'Pneumatic Retrofit',
       'Compost Addition to Rangeland', 'Grid Expansion & Mini-Grids',
       'Carbon Capture in Plastic'],
    

In [8]:
#Categorical and Text Features

categorical_features = ['region', 'voluntary_registry', 'arborwaproject']
text_features = ['project_name','methodology_or_protocol','project_type_from_the_registry', 'project_developer']


In [9]:
# Define function for concatenating text features
def concatenate_text_features(df, text_features):
    return [' '.join(str(row[feature]) for feature in text_features if pd.notnull(row[feature])) for _, row in df.iterrows()]


In [10]:
train_df_new["text_features"] = concatenate_text_features(train_df_new, text_features)
train_df_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_new["text_features"] = concatenate_text_features(train_df_new, text_features)


Unnamed: 0,project_name,methodology_or_protocol,region,voluntary_registry,project_type_from_the_registry,project_developer,arborwaproject,scope,type,text_features
0,YESIL HYDROELECTRIC POWER PLANT (HEPP),ACM0002,Western Asia,VCS,Energy industries (renewable/non-renewable sou...,Yeşilbas Elektrik Üretim AŞ,No,Renewable Energy,Hydropower,YESIL HYDROELECTRIC POWER PLANT (HEPP) ACM0002...
1,GS5047 VPA31 African Improved Cookstoves and C...,GS TPDDTEC v3.1,Sub-Saharan Africa,GOLD,Energy Efficiency - Domestic,Likano Project Development GmbH,No,Household & Community,Community Boreholes,GS5047 VPA31 African Improved Cookstoves and C...
2,50 MW (DCR) Nalgonda Solar PV Power Project by...,ACM0002 Grid-connected electricity generation ...,Southern Asia,GOLD,Solar Thermal - Electricity,Infinite Environmental Solutions LLP,No,Renewable Energy,Solar - Centralized,50 MW (DCR) Nalgonda Solar PV Power Project by...
3,DENIZLI WPP,ACM0002 Grid-connected electricity generation ...,Western Asia,GOLD,Wind,KORDA ENERJI RETIM PAZARLAMA ITHALAT VE IHRACA...,No,Renewable Energy,Wind,DENIZLI WPP ACM0002 Grid-connected electricity...
4,Wind Based Power Generation by Rajasthan Gum P...,ACM0002,Southern Asia,VCS,Energy industries (renewable/non-renewable sou...,Rajasthan Gum Private Limited,No,Renewable Energy,Wind,Wind Based Power Generation by Rajasthan Gum P...


In [11]:
## Splitting the train to train and validation sets
X_train_df = train_df_new.drop(columns=['type'])
Y_train_df = train_df_new['type']
X_train, X_val, Y_train, Y_val  = train_test_split(X_train_df,Y_train_df, test_size=0.1, stratify =Y_train_df,  random_state=42)
print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape )

(6433, 9) (715, 9) (6433,) (715,)


In [12]:
# Function to encode labels, handling unseen labels with a default category
def encode_labels(label_encoder, labels):
    encoded_labels = []
    for label in labels:
        if label in label_encoder.classes_:
            encoded_labels.append(label_encoder.transform([label])[0])
        else:
            encoded_labels.append(len(label_encoder.classes_))  # Example: len(label_encoder.classes_) represents the default category
    return encoded_labels

label_encoder = LabelEncoder()
label_encoder.fit(Y_train)
Y_train_encoded = np.asarray(encode_labels(label_encoder, Y_train))
Y_val_encoded = np.asarray(encode_labels(label_encoder, Y_val))

print(Y_train_encoded[:5])

[45  8 19 19 11]


In [13]:
import tensorflow as tf

class F5Score(tf.keras.metrics.Metric):
    def __init__(self, name='f5_score', **kwargs):
        super(F5Score, self).__init__(name=name, **kwargs)
        self.tp = self.add_weight(name='tp', initializer='zeros')
        self.fp = self.add_weight(name='fp', initializer='zeros')
        self.fn = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.argmax(y_pred, axis=1)
        
        y_true = tf.cast(y_true, tf.int32)
        y_pred = tf.cast(y_pred, tf.int32)

        tp = tf.reduce_sum(tf.cast(tf.equal(y_true, y_pred), tf.float32))
        fp = tf.reduce_sum(tf.cast(tf.not_equal(y_true, y_pred), tf.float32))
        fn = tf.reduce_sum(tf.cast(tf.not_equal(y_pred, y_true), tf.float32))

        self.tp.assign_add(tp)
        self.fp.assign_add(fp)
        self.fn.assign_add(fn)

    def result(self):
        precision = self.tp / (self.tp + self.fp + tf.keras.backend.epsilon())
        recall = self.tp / (self.tp + self.fn + tf.keras.backend.epsilon())
        return 5 * (precision * recall) / (4 * precision + recall + tf.keras.backend.epsilon())

    def reset_states(self):
        self.tp.assign(0)
        self.fp.assign(0)
        self.fn.assign(0)

### Classification with a fine tuned BERT model

In [14]:
from transformers import BertTokenizer, TFBertModel

from transformers import logging
logging.set_verbosity_error()

In [15]:
#make it easier to use a variety of BERT subword models
model_checkpoint = 'bert-base-cased'

In [16]:
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertModel.from_pretrained(model_checkpoint)

In [17]:
# tokenize the dataset, truncate at `max_length`,
# and pad with 0's when less than `max_length` and return a tf Tensor
max_length = 200
X_train_text_features = X_train["text_features"].astype(str).tolist()
Y_train_text_features = X_val["text_features"].astype(str).tolist()
train_encodings = bert_tokenizer(X_train_text_features, truncation=True, padding="max_length", max_length=max_length, return_tensors='tf')
valid_encodings = bert_tokenizer(Y_train_text_features, truncation=True, padding="max_length", max_length=max_length, return_tensors='tf')


In [18]:
train_encodings.input_ids[:1]

<tf.Tensor: shape=(1, 200), dtype=int32, numpy=
array([[  101, 23396, 26978, 26580,  1513,  5514,  9690,  2107,  7629,
         1568,  1477,  5514,  7519,   113, 17216,   120,  1664,   118,
        17216,  3509,   114,  4149,  1389,  2599,   144,  5970,  2599,
         1186,  1260, 13832,  1200,  9037,   156,   119,   138,   119,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [19]:
print(train_encodings.input_ids.shape)
print(valid_encodings.input_ids.shape)

(6433, 200)
(715, 200)


In [20]:
def create_bert_multiclass_model(checkpoint = model_checkpoint,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005):
    """
    Build a simple classification model with BERT. Use the Pooler Output for classification purposes.
    """
    bert_model = TFBertModel.from_pretrained(checkpoint)
    #bert_model.trainable = True

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    pooler_token = bert_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooler_token)

    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(num_classes, activation='softmax',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.sparse_categorical_crossentropy,
                                 metrics=['accuracy'])

    classification_model.summary()
    ### END YOUR CODE
    return classification_model

num_classes = len(label_encoder.classes_)
    

In [21]:
pooler_bert_model = create_bert_multiclass_model(checkpoint=model_checkpoint,
                                                 num_classes=num_classes)




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 200)]       0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 200)]        0           []                               
                                                                                                  
 token_type_ids_layer (InputLay  [(None, 200)]       0           []                               
 er)                                                                                              
                                                                                              

In [22]:
print(Y_train_encoded.shape)
print(Y_val_encoded.shape)

(6433,)
(715,)


In [23]:
pooler_bert_model_history = pooler_bert_model.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    Y_train_encoded,
    validation_data=([valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask], Y_val_encoded),
    batch_size=32,
    epochs=1
)

2024-06-30 19:46:00.818974: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [25]:
pooler_bert_model.save("../data/models/bert_acc_v1.h5")

In [None]:
score = pooler_bert_model.evaluate([valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
                                                  Y_val_encoded)


In [27]:
print('Val loss:', score[0])
print('Val accuracy:', score[1])

Val loss: 0.5179084539413452
Val accuracy: 0.8615384697914124
