In [2]:
import torch
from transformers import BitsAndBytesConfig,AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from accelerate import Accelerator
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
#!pip install pyarrow --upgrade

In [4]:
#!pip install accelerate -U # upgrade to latest version

In [5]:
# Function to load data from text files and assign labels
def load_text_files(file_paths, label, encoding="utf-8"):
    """Read text files line by line and attach one label to every line.

    Args:
        file_paths: Iterable of paths to plain-text files, one example per line.
        label: Value assigned to every line read from ``file_paths``.
        encoding: Text encoding used to decode the files. Pinned explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. Each text is a line
        with surrounding whitespace stripped; blank lines are kept as empty
        strings so the number of examples matches the number of lines.
    """
    texts = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding=encoding) as file:
            # Stream the file instead of materialising it with readlines().
            texts.extend(line.strip() for line in file)
    # One label per collected line, so the two lists always stay aligned.
    labels = [label] * len(texts)
    return texts, labels

# Training shards. Lines read from the false-positive files are labelled 0,
# lines read from the true-positive files are labelled 1.
false_positive_train_files = [
    "bc_data/processed_data_false_positive_train_0.txt",
    "bc_data/processed_data_false_positive_train_1.txt",
    "bc_data/processed_data_false_positive_train_2.txt",
    "bc_data/processed_data_false_positive_train_3.txt"
]
true_positive_train_files = [
    "bc_data/processed_data_true_positive_train_0.txt",
    "bc_data/processed_data_true_positive_train_1.txt",
    "bc_data/processed_data_true_positive_train_2.txt",
    "bc_data/processed_data_true_positive_train_3.txt"
]

false_train_texts, false_train_labels = load_text_files(false_positive_train_files, label=0)
true_train_texts, true_train_labels = load_text_files(true_positive_train_files, label=1)

# Concatenate the two classes (false positives first) and hold out 20% of the
# examples for validation; the fixed random_state keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    false_train_texts + true_train_texts,
    false_train_labels + true_train_labels,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits as Hugging Face datasets with "text" and "label" columns.
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# Held-out test shards, labelled with the same 0/1 convention as above.
false_positive_test_files = ["bc_data/processed_data_false_positive_test_4.txt"]
true_positive_test_files = ["bc_data/processed_data_true_positive_test_4.txt"]

false_test_texts, false_test_labels = load_text_files(false_positive_test_files, label=0)
true_test_texts, true_test_labels = load_text_files(true_positive_test_files, label=1)

# Test examples keep file order: all label-0 lines, then all label-1 lines.
test_texts = [*false_test_texts, *true_test_texts]
test_labels = [*false_test_labels, *true_test_labels]

test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})



In [6]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType

# Define the tokenizer and model paths
model_name = "mistralai/Mistral-7B-v0.1"

# Upper bound (in tokens) used when truncating inputs. The tokenizer reports no
# predefined maximum for this model, so without an explicit value truncation is
# silently skipped.
# NOTE(review): 512 is a starting point, not a measured value; tune it to the data.
MAX_LENGTH = 512

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Reuse the end-of-sequence token for padding

# Load the model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    device_map="auto"  # Automatically distribute across GPUs
)

# Keep the model config in sync with the tokenizer: the classification head
# needs the pad id to locate the last real token of each padded sequence, and
# batches larger than one are rejected when it is unset.
model.config.pad_token_id = tokenizer.pad_token_id

# Enable gradient checkpointing to reduce memory usage. With the base weights
# frozen by LoRA, the embedding output must be marked as requiring grad,
# otherwise the checkpointed layers do not propagate gradients to the adapters.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# Use LoRA for efficient fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    inference_mode=False,  # We are in training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1  # Dropout rate
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)


def preprocess_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Inputs are truncated to ``MAX_LENGTH`` tokens. No padding is applied here:
    padding is left to batch collation (the Trainer pads each batch dynamically
    when it is given a tokenizer), which avoids padding every example to a
    fixed length.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` containing a
            ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)


# Apply the preprocessing function to datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set the training arguments
training_args = TrainingArguments(
    output_dir="results_Mistral",
    evaluation_strategy="steps",  # Evaluate every few steps
    eval_steps=100,
    per_device_train_batch_size=1,  # Adjust to memory constraints
    per_device_eval_batch_size=1,
    num_train_epochs=3,  # Modify based on needs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    dataloader_num_workers=4,
    report_to="none",
    fp16=True,  # Mixed-precision (fp16) training is enabled
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# # Save the model after training
# model.save_pretrained("./mistral_lora_no_fp16")

# # Perform inference on the test dataset
# predictions = trainer.predict(test_dataset)

# # Convert predictions to labels
# predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# # Print some test samples with their predictions
# for text, true_label, predicted_label in zip(test_dataset['text'][:5], test_dataset['label'][:5], predicted_labels[:5]):
#     print(f"Text: {text}\nTrue Label: {true_label}, Predicted Label: {predicted_label}\n")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/776810 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/194203 [00:00<?, ? examples/s]

Map:   0%|          | 0/59568 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
100,1.338,1.56839
200,1.2513,1.378005
300,1.5648,1.27468
400,1.3307,1.206921
500,1.2873,1.150325
600,0.8805,1.146395
700,0.9107,1.109214
800,1.3011,1.021644
900,1.9799,1.167618
1000,0.9206,0.965589




Step,Training Loss,Validation Loss
100,1.338,1.56839
200,1.2513,1.378005
300,1.5648,1.27468
400,1.3307,1.206921
500,1.2873,1.150325
600,0.8805,1.146395
700,0.9107,1.109214
800,1.3011,1.021644
900,1.9799,1.167618
1000,0.9206,0.965589




KeyboardInterrupt: 

In [None]:
#!pip install --upgrade transformers datasets peft bitsandbytes accelerate


In [None]:
#!pip install -U bitsandbytes

In [7]:
# Save the trained model
#model.save_pretrained("./mistral_binary_classifier")

# Run the current trainer's model over the tokenized test set.
predictions = trainer.predict(test_dataset)

# The predicted class is the index of the largest logit in each row.
test_logits = torch.tensor(predictions.predictions)
predicted_labels = test_logits.argmax(dim=1)

# Show the first ten test examples next to their gold and predicted labels.
preview = slice(None, 10)
for text, true_label, predicted_label in zip(
    test_texts[preview], test_labels[preview], predicted_labels[preview]
):
    print(f"Text: {text}\nTrue Label: {true_label}, Predicted Label: {predicted_label}\n")

Step,Training Loss,Validation Loss
100,1.338,1.56839
200,1.2513,1.378005
300,1.5648,1.27468
400,1.3307,1.206921
500,1.2873,1.150325
600,0.8805,1.146395
700,0.9107,1.109214
800,1.3011,1.021644
900,1.9799,1.167618
1000,0.9206,0.965589


Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 1

Text: Year: 2000-2003	Make: Volvo	Model: S40
True Label: 0, Predicted Label: 1

Text: Year: 2000-2004	Make: Volvo	Model: V40
True Label: 0, Predicted Label: 1

Text: Year: 2001	Make: Suzuki	Model: Grand Vitara
True Label: 0, Predicted Label: 1

Text: Year: 2002-2006	Make: Suzuki	Model: XL-7
True Label: 0, Predicted Label: 1



## Checkpoint 2000

In [12]:
# One pass over the aligned (text, gold label, prediction) triples, counting
# how many examples each class has and how many of them were predicted correctly.
total = 0
total_zero = total_One = 0
correct_zero = correct_one = 0
for text, true_label, predicted_label in zip(test_texts, test_labels, predicted_labels):
    total += 1
    if true_label == 0:
        total_zero += 1
        correct_zero += int(true_label == predicted_label)
    elif true_label == 1:
        total_One += 1
        correct_one += int(true_label == predicted_label)

# Report the example count, overall accuracy, and per-class accuracy (percent).
print(total)
print("Accuracy", (correct_zero + correct_one) * 100 / total)
print("Class 1 Accuracy", correct_one * 100 / total_One)
print("Class 0 accuracy", correct_zero * 100 / total_zero)

59568
Accuracy 71.64249261348375
Class 1 Accuracy 88.16351433801098
Class 0 accuracy 23.904668625530526


## Checkpoint 1500

In [13]:
# Reload a fresh base classification model to host a saved LoRA adapter.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    device_map="auto"  # Automatically distribute across GPUs
)

# Propagate the tokenizer's pad token (set to EOS when the tokenizer was
# created) to the reloaded model's config, so the classification head can
# locate the last real token in padded batches and batch sizes above one work.
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from peft import PeftModel

In [15]:
# Attach the adapter saved under the step-1500 checkpoint directory to the
# base model loaded above.
adapter_dir = "results_Mistral/checkpoint-1500"
model = PeftModel.from_pretrained(model, adapter_dir, torch_dtype=torch.float16)

In [16]:
# Rebuild the Trainer around the adapter-backed model, reusing the training
# arguments, datasets and tokenizer defined earlier in the notebook.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

In [17]:
# Save the trained model
#model.save_pretrained("./mistral_binary_classifier")

# Run the current trainer's model over the tokenized test set.
predictions = trainer.predict(test_dataset)

# The predicted class is the index of the largest logit in each row.
test_logits = torch.tensor(predictions.predictions)
predicted_labels = test_logits.argmax(dim=1)

# Show the first ten test examples next to their gold and predicted labels.
preview = slice(None, 10)
for text, true_label, predicted_label in zip(
    test_texts[preview], test_labels[preview], predicted_labels[preview]
):
    print(f"Text: {text}\nTrue Label: {true_label}, Predicted Label: {predicted_label}\n")

Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2015-2016	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2019	Make: Acura	Model: TLX
True Label: 0, Predicted Label: 0

Text: Year: 2000-2003	Make: Volvo	Model: S40
True Label: 0, Predicted Label: 1

Text: Year: 2000-2004	Make: Volvo	Model: V40
True Label: 0, Predicted Label: 1

Text: Year: 2001	Make: Suzuki	Model: Grand Vitara
True Label: 0, Predicted Label: 1

Text: Year: 2002-2006	Make: Suzuki	Model: XL-7
True Label: 0, Predicted Label: 1



In [18]:
# One pass over the aligned (text, gold label, prediction) triples, counting
# how many examples each class has and how many of them were predicted correctly.
total = 0
total_zero = total_One = 0
correct_zero = correct_one = 0
for text, true_label, predicted_label in zip(test_texts, test_labels, predicted_labels):
    total += 1
    if true_label == 0:
        total_zero += 1
        correct_zero += int(true_label == predicted_label)
    elif true_label == 1:
        total_One += 1
        correct_one += int(true_label == predicted_label)

# Report the example count, overall accuracy, and per-class accuracy (percent).
print(total)
print("Accuracy", (correct_zero + correct_one) * 100 / total)
print("Class 1 Accuracy", correct_one * 100 / total_One)
print("Class 0 accuracy", correct_zero * 100 / total_zero)

59568
Accuracy 67.01752618855761
Class 1 Accuracy 76.86936478882788
Class 0 accuracy 38.55044074436827


## Checkpoint 1000

In [20]:
# Reload a fresh base classification model to host the saved LoRA adapter.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    device_map="auto"  # Automatically distribute across GPUs
)

# Propagate the tokenizer's pad token (set to EOS when the tokenizer was
# created) to the reloaded model's config, so the classification head can
# locate the last real token in padded batches and batch sizes above one work.
model.config.pad_token_id = tokenizer.pad_token_id

# Attach the adapter saved under the step-1000 checkpoint directory.
model = PeftModel.from_pretrained(
            model,
            "results_Mistral/checkpoint-1000",
            torch_dtype=torch.float16,
        )

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Perform inference on the test dataset
predictions = trainer.predict(test_dataset)

# Convert predictions to labels: index of the largest logit in each row
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# One pass over the aligned (text, gold label, prediction) triples, counting
# how many examples each class has and how many of them were predicted correctly.
total = 0
total_zero = total_One = 0
correct_zero = correct_one = 0
for text, true_label, predicted_label in zip(test_texts, test_labels, predicted_labels):
    total += 1
    if true_label == 0:
        total_zero += 1
        correct_zero += int(true_label == predicted_label)
    elif true_label == 1:
        total_One += 1
        correct_one += int(true_label == predicted_label)

# Report the example count, overall accuracy, and per-class accuracy (percent).
print(total)
print("Accuracy", (correct_zero + correct_one) * 100 / total)
print("Class 1 Accuracy", correct_one * 100 / total_One)
print("Class 0 accuracy", correct_zero * 100 / total_zero)

59568
Accuracy 70.01745903840988
Class 1 Accuracy 85.67328768671051
Class 0 accuracy 24.779627815866796
