In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# Step 1: Read the training readings from a local CSV file
local_csv_path = '/content/training_dataset_more_detections.csv'
df = pd.read_csv(local_csv_path)

# Step 2: Keep two decimal places in each sensor column
for sensor_column in ('pH', 'TDS'):
    df[sensor_column] = df[sensor_column].round(2)

# Wrap the DataFrame in a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Step 3: Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 4: Tokenize the dataset
def tokenize_function(examples):
    """Render every reading in a batch as one sentence and tokenize the batch.

    Args:
        examples: Mapping with 'pH', 'TDS' and 'timestamp' entries, each an
            iterable holding one value per row of the batch.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered
        sentences, padded/truncated to 512 tokens.
    """
    readings = zip(examples['pH'], examples['TDS'], examples['timestamp'])
    texts = []
    for ph_value, tds_value, timestamp in readings:
        texts.append(f"pH: {ph_value:.2f}, TDS: {tds_value:.2f}, Timestamp: {timestamp}")
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)

# batched=True hands tokenize_function a dict of column lists (one entry per row in the batch)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 5: Copy the target column into the 'labels' field (tensor conversion happens in Step 6)
def convert_labels(examples):
    """Expose the 'dumping_detected' column under the key 'labels'.

    The mapping is updated in place and the same object is returned.
    """
    target_values = examples['dumping_detected']
    examples['labels'] = target_values
    return examples

# Add the 'labels' column to the tokenized dataset
tokenized_dataset = tokenized_dataset.map(convert_labels, batched=True)

# Step 6: Specify columns to be used by the Trainer (returned as torch tensors)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 7: Split dataset into training and testing
# A fixed seed keeps the 80/20 partition identical across kernel restarts, so
# validation numbers from different runs are measured on the same rows.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 8: Load the pre-trained model with a 2-class classification head.
# The classifier weights are not part of the checkpoint and are newly initialised on
# load, so seed torch first to keep repeated runs comparable.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 9: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Was warmup_steps=500, but the logged run ends at roughly 70 optimizer steps, so the
    # learning rate was still warming up when training stopped. A ratio scales the
    # warmup with the actual run length.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    save_total_limit=2,
    # NOTE(review): with warmup now completing, the full 1e-4 is actually reached;
    # confirm this peak value is intended for BERT fine-tuning.
    learning_rate=1e-4,
)

# Step 10: Initialize Trainer
def compute_metrics(eval_pred):
    """Summarise evaluation predictions as accuracy, precision and recall.

    Loss alone does not reveal a classifier that always answers with the majority
    class, so these are reported next to it on every evaluation.

    Args:
        eval_pred: Evaluation output exposing ``predictions`` (per-class logits)
            and ``label_ids`` (true class indices) as arrays.

    Returns:
        dict with 'accuracy', 'precision' and 'recall'; precision and recall
        refer to class 1 and fall back to 0.0 when their denominator is zero.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    labels = eval_pred.label_ids
    true_pos = int(((predicted == 1) & (labels == 1)).sum())
    false_pos = int(((predicted == 1) & (labels != 1)).sum())
    false_neg = int(((predicted != 1) & (labels == 1)).sum())
    return {
        'accuracy': float((predicted == labels).mean()),
        'precision': true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0,
        'recall': true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Save the fine-tuned model and tokenizer side by side so they can be reloaded together
output_dir = './fine_tuned_water_quality'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
10,0.676,0.631698
20,0.5713,0.559807
30,0.4761,0.54779
40,0.5518,0.530241
50,0.4855,0.558413
60,0.411,0.529826
70,0.485,0.48851


('./fine_tuned_water_quality/tokenizer_config.json',
 './fine_tuned_water_quality/special_tokens_map.json',
 './fine_tuned_water_quality/vocab.txt',
 './fine_tuned_water_quality/added_tokens.json')

In [None]:
# Run the Trainer's evaluation loop on its eval dataset and keep the metrics dict
results = trainer.evaluate()

# Show the metrics on a single line
print(f"Evaluation results: {results}")

Evaluation results: {'eval_loss': 0.3115279972553253, 'eval_runtime': 1.3629, 'eval_samples_per_second': 35.219, 'eval_steps_per_second': 4.402, 'epoch': 3.0}


In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_water_quality'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Load input data from a CSV file
input_csv_path = '/content/testing_dataset_more_detections.csv'
df = pd.read_csv(input_csv_path)

# Prepare the input texts from the CSV file.
# Build them column-wise instead of with df.apply(axis=1): row-wise apply hands each
# row over as a single-dtype Series, so in an all-numeric frame an integer timestamp is
# upcast to float and renders as "30.0" rather than "30" -- a different sentence from
# the one the model saw during fine-tuning.
input_texts = [
    f"pH: {ph:.2f}, TDS: {tds:.2f}, Timestamp: {ts}"
    for ph, tds, ts in zip(df['pH'], df['TDS'], df['timestamp'])
]

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to tokenize a single text
def tokenize_function(text):
    """Encode ``text`` with the module-level tokenizer.

    The encoding is padded/truncated to 512 tokens and requested as PyTorch
    tensors (``return_tensors="pt"``).
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=512, return_tensors="pt")
    return tokenizer(text, **encoding_options)

# Function to predict dumping detection for a single input text
def predict_dumping_detection_single(text):
    """Classify one input sentence and return the predicted class index.

    Uses the module-level ``model``, ``device`` and ``tokenize_function``. The
    model is switched to eval mode and run without gradient tracking.

    Args:
        text: A single input sentence (the result is read with ``.item()``,
            so exactly one prediction must come back).

    Returns:
        int: index of the class with the highest softmax probability.
    """
    model.eval()
    with torch.no_grad():
        # Move every encoded tensor to the device the model lives on
        encoded = {name: tensor.to(device) for name, tensor in tokenize_function(text).items()}
        class_probabilities = torch.softmax(model(**encoded).logits, dim=-1)
        return torch.argmax(class_probabilities, dim=-1).item()

# Classify every prepared sentence, one at a time
predictions = list(map(predict_dumping_detection_single, input_texts))

# Store the predictions next to the ground truth
df['predicted_dumping_detected'] = predictions

# Show the readings together with the true and the predicted dumping flags
print(df.loc[:, ['pH', 'TDS', 'timestamp', 'dumping_detected', 'predicted_dumping_detected']])


       pH     TDS  timestamp  dumping_detected  predicted_dumping_detected
0    7.25  308.48          0                 0                           0
1    6.92  300.79         30                 0                           0
2    7.21  271.28         60                 0                           0
3    7.12  273.68         90                 0                           0
4    8.75  393.05        120                 1                           0
..    ...     ...        ...               ...                         ...
115  8.79  375.01       3450                 1                           0
116  6.97  286.60       3480                 0                           0
117  7.06  323.43       3510                 0                           0
118  6.90  276.48       3540                 0                           0
119  7.27  309.75       3570                 0                           0

[120 rows x 5 columns]


In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_water_quality'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Load input data from a CSV file
input_csv_path = '/content/testing_dataset_more_detections.csv'
df = pd.read_csv(input_csv_path)

# Prepare the input texts from the CSV file.
# Iterate over the columns with zip rather than df.apply(axis=1): a row-wise apply
# yields each row as one Series with a single dtype, so when every column is numeric an
# integer timestamp becomes a float and is written as "30.0" instead of "30", which
# does not match the sentences used for fine-tuning.
input_texts = [
    f"pH: {ph:.2f}, TDS: {tds:.2f}, Timestamp: {ts}"
    for ph, tds, ts in zip(df['pH'], df['TDS'], df['timestamp'])
]

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to tokenize a single text
def tokenize_function(text):
    """Tokenize ``text`` into PyTorch tensors of fixed length 512.

    Delegates to the module-level ``tokenizer`` with max-length padding and
    truncation enabled.
    """
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

# Function to predict dumping detection for a single input text
def predict_dumping_detection_single(text):
    """Return the predicted class index for a single input sentence.

    Relies on the module-level ``model``, ``device`` and ``tokenize_function``.
    Inference runs in eval mode with gradients disabled.

    Args:
        text: One input sentence; the prediction is extracted with ``.item()``,
            which requires a single-element result.

    Returns:
        int: the class whose softmax probability is largest.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenize_function(text)
        # Tensors must live on the same device as the model
        on_device = {}
        for key, value in encoded.items():
            on_device[key] = value.to(device)
        result = model(**on_device)
        probabilities = torch.softmax(result.logits, dim=-1)
        best_class = torch.argmax(probabilities, dim=-1)
        return best_class.item()

# Iterate through the input texts and make predictions
predictions = [predict_dumping_detection_single(text) for text in input_texts]

# Add predictions to the DataFrame
df['predicted_dumping_detected'] = predictions

# Print the results with both predicted and true dumping detection values.
# The prediction column was missing from this selection, so only the ground truth
# was displayed even though the predictions had been computed.
print(df[['pH', 'TDS', 'timestamp', 'dumping_detected', 'predicted_dumping_detected']])

# Write the results (including the prediction column) to a CSV file
output_csv_path = './results.csv'
df.to_csv(output_csv_path, index=False)



       pH     TDS  timestamp  dumping_detected
0    7.25  308.48          0                 0
1    6.92  300.79         30                 0
2    7.21  271.28         60                 0
3    7.12  273.68         90                 0
4    8.75  393.05        120                 1
..    ...     ...        ...               ...
115  8.79  375.01       3450                 1
116  6.97  286.60       3480                 0
117  7.06  323.43       3510                 0
118  6.90  276.48       3540                 0
119  7.27  309.75       3570                 0

[120 rows x 4 columns]
