In [None]:
! pip install -U accelerate
! pip install -U transformers


In [3]:
import pandas as pd

data = {
    'text': [
        "The physical system is formed by two main sub-networks: inside the SVT Analytics premises and inside any client premises.The first one is where the developers have their workstations to work. There is a server and a database for development purposes and a server and a database for testing. There’s also a bunch of cameras and a server for computing the headcount on those cameras. This server fills both the development and testing databases.The workstations are connected to the internet through a router behind a firewall, for security concerns. Inside the company premises there’s the same entry configuration.However, the only machine that can be accessed is the web server. The database is filled by both the web server and the computer vision system. The cameras send a live stream to the computer vision system.All the network connections are wired, except the surveillance cameras’ one, which is wireless. If it’s not feasible to have separate network connections between servers that don’t need to be connected, it’s possible to join them in a single network but the server should be in a separate one that has access to the Internet. Inside the SVT Analytics ideally only the workstations should have access to the Internet. The current status, however, is another one inside the SVT Analytics premises: the development server and database, and the computer vision developing system are all inside the single development workstation that we have right now. Our testing server and database, and the computer vision testing system are outside the premises, inside the InReality company premises. Combining systems into a single computer is not recommended, mainly for performance issues.",
        "The system uses synchronous communication between servers without any messaging components.",
        "Asynchronous messaging is used to ensure senders and receivers are decoupled.",
        "When a data recieving event occurs on the camera it is processed through AWS Lamdba, instead of provisioning a server to do so.",
        "The system uses Azure Functions to dynamically allocate resources and process incoming inputs."
    ],
    'decoupled_messaging': [0, 0, 1, 0, 0],  # 1 indicates adherence, 0 indicates non-adherence
    'serverless_solution': [0, 0, 0, 1, 1]  # Example for another practice
}

df = pd.DataFrame(data)
df.to_csv('multi_label_green_software_practice_dataset.csv', index=False)


In [4]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

# Load the dataset
df = pd.read_csv('multi_label_green_software_practice_dataset.csv')

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df[['decoupled_messaging', 'serverless_solution']].values.tolist(), test_size=0.2) #test_size=0.2 so it aims to put 20% of the dataset into the validation set

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")

# Convert labels to tensors
train_labels = torch.tensor(train_labels, dtype=torch.float32)
val_labels = torch.tensor(val_labels, dtype=torch.float32)

# Load the model and adjust the output layer for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define custom Trainer class to use Binary Cross Entropy Loss
class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Define custom dataset class
class GreenSoftwareDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = GreenSoftwareDataset(train_encodings, train_labels)
val_dataset = GreenSoftwareDataset(val_encodings, val_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    evaluation_strategy="epoch"
)

# Create Trainer instance
trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()
# After training is complete
trainer.save_model("./results")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6493,0.760926
2,0.6572,0.761237
3,0.6791,0.761841


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained('./results')

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_practices(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Perform inference
    outputs = model(**inputs)

    # Apply sigmoid activation to get probabilities
    probabilities = torch.sigmoid(outputs.logits)

    # Extract probabilities for each practice
    decoupled_messaging_prob = probabilities[0][0].item()
    serverless_solution_prob = probabilities[0][1].item()

    return decoupled_messaging_prob, serverless_solution_prob

# Example usage
text = "When the camera receives input data AWS Lambda is used to process the data instead of using a server to do so."
decoupled_messaging_prob, serverless_solution_prob = predict_practices(text)
print("Probability of decoupled messaging practice:", decoupled_messaging_prob)
print("Probability of serverless solution practice:", serverless_solution_prob)



Probability of decoupled messaging practice: 0.5970698595046997
Probability of serverless solution practice: 0.5288074612617493
