In [None]:
import pandas as pd

# Function to load text file into a list (excluding empty lines)
def load_text_file(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Every line is stripped of leading and trailing whitespace; lines that
    are empty after stripping are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The stripped, non-empty lines in file order.
    """
    lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            cleaned = raw_line.strip()
            if cleaned:
                lines.append(cleaned)
    return lines

# Locations of the positive, negative, and question feedback files
pos_file_path = '/content/Pos_feed_int.txt'
neg_file_path = '/content/Neg_feed_int.txt'
ques_file_path = '/content/ques_feed_int.txt'

# Read each file into its own list of non-empty lines
pos_list, neg_list, ques_list = (
    load_text_file(path)
    for path in (pos_file_path, neg_file_path, ques_file_path)
)

# Stack the texts in pos -> neg -> ques order and emit one label per text
all_lists = [*pos_list, *neg_list, *ques_list]
labels = [
    tag
    for tag, texts in (('pos', pos_list), ('neg', neg_list), ('ques', ques_list))
    for _ in texts
]

# Assemble the two-column frame ('text', 'label')
data = {'text': all_lists, 'label': labels}
df = pd.DataFrame(data)

# Show the frame as the cell output
df

Unnamed: 0,text,label
0,Thank you for the thorough explanation; it has...,pos
1,"I appreciate your prompt and helpful response,...",pos
2,Your assistance has been invaluable in resolvi...,pos
3,This guidance is exceptionally helpful and pre...,pos
4,I'm grateful for your expertise; your insights...,pos
...,...,...
1914,Describe the principles of container orchestra...,ques
1915,What strategies can be employed for effective ...,ques
1916,Discuss the principles of explainable AI (XAI)...,ques
1917,Explore the applications of robotics process a...,ques


In [None]:
# Sanity check: list the distinct values present in the 'label' column.
df['label'].unique()

array(['pos', 'neg', 'ques'], dtype=object)

In [None]:
# Convert 'label' to pandas' categorical dtype; the `.cat.codes` accessor used
# further down to derive integer class ids requires a categorical column.
df['label'] = df['label'].astype('category')

In [None]:
# Re-inspect the distinct labels now that the column is categorical.
df['label'].unique()

['pos', 'neg', 'ques']
Categories (3, object): ['neg', 'pos', 'ques']

In [None]:
# Count the rows per label to check how balanced the classes are.
df['label'].value_counts()

ques    904
pos     515
neg     500
Name: label, dtype: int64

In [None]:
# Down-sample the over-represented 'ques' class to roughly the size of the others
is_ques = df['label'] == 'ques'
ques_sampled = df.loc[is_ques].sample(n=510, random_state=42)  # Adjust the 'n' value as needed

# Keep every non-'ques' row and append the sampled 'ques' rows after them
df = pd.concat([df.loc[~is_ques], ques_sampled])

# Show the rebalanced frame as the cell output
df

Unnamed: 0,text,label
0,Thank you for the thorough explanation; it has...,pos
1,"I appreciate your prompt and helpful response,...",pos
2,Your assistance has been invaluable in resolvi...,pos
3,This guidance is exceptionally helpful and pre...,pos
4,I'm grateful for your expertise; your insights...,pos
...,...,...
1122,Explain the principles of intrusion detection ...,ques
1894,Describe the principles of low-power design in...,ques
1074,Discuss the importance of ergonomics in produc...,ques
1680,Discuss the role of quantum machine learning i...,ques


In [None]:
# Confirm the per-label row counts after down-sampling the 'ques' rows.
df['label'].value_counts()

pos     515
ques    510
neg     500
Name: label, dtype: int64

In [None]:
# Store each row's integer category code in a new 'encoded_label' column;
# this is the column the stratified split and the datasets read later.
df['encoded_label'] = df['label'].cat.codes

In [None]:
# Display the frame to inspect the newly added 'encoded_label' column.
df

Unnamed: 0,text,label,encoded_label
0,Thank you for the thorough explanation; it has...,pos,1
1,"I appreciate your prompt and helpful response,...",pos,1
2,Your assistance has been invaluable in resolvi...,pos,1
3,This guidance is exceptionally helpful and pre...,pos,1
4,I'm grateful for your expertise; your insights...,pos,1
...,...,...,...
1122,Explain the principles of intrusion detection ...,ques,2
1894,Describe the principles of low-power design in...,ques,2
1074,Discuss the importance of ergonomics in produc...,ques,2
1680,Discuss the role of quantum machine learning i...,ques,2


In [None]:
# Row counts per integer class code.
df['encoded_label'].value_counts()

1    515
2    510
0    500
Name: encoded_label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Expects df to carry 'text', 'label', and 'encoded_label' columns

# Target share of the data for each split
train_ratio = 0.7
test_ratio = 0.2
val_ratio = 0.1

# First cut: separate the training rows, stratifying on the class code
holdout_share = 1 - train_ratio
train_df, temp_df = train_test_split(
    df,
    test_size=holdout_share,
    stratify=df['encoded_label'],
    random_state=42,
)

# Second cut: divide the held-out rows into test and validation
val_share_of_holdout = val_ratio / (test_ratio + val_ratio)
test_df, val_df = train_test_split(
    temp_df,
    test_size=val_share_of_holdout,
    stratify=temp_df['encoded_label'],
    random_state=42,
)

# Report the per-class row counts of every split
for heading, split_df in (
    ("Train class distribution:", train_df),
    ("\nTest class distribution:", test_df),
    ("\nVal class distribution:", val_df),
):
    print(heading)
    print(split_df['encoded_label'].value_counts())


Train class distribution:
1    360
2    357
0    350
Name: encoded_label, dtype: int64

Test class distribution:
1    103
2    102
0    100
Name: encoded_label, dtype: int64

Val class distribution:
1    52
2    51
0    50
Name: encoded_label, dtype: int64


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load and preprocess the data
# (Assuming you've already split the data into train_df, test_df, and val_df)

# Encode labels first so the classifier head and its label names come from a
# single mapping.
label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(df['label'])

# train_df / test_df / val_df were split earlier and already carry an
# 'encoded_label' column. The encoder kept for inference must agree with those
# codes, otherwise decoded predictions would be silently mislabeled.
if 'encoded_label' in df.columns and not (df['encoded_label'].to_numpy() == encoded).all():
    raise ValueError(
        "LabelEncoder codes disagree with the existing 'encoded_label' column "
        "used to build the train/test/val splits"
    )
df['encoded_label'] = encoded

# Class id <-> label name mappings, stored in the model config so the saved
# checkpoint describes its own classes. Plain int/str keep the config
# JSON-serializable.
id2label = {int(idx): str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Tokenizer and model initialization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize and prepare input data
class CustomDataset(Dataset):
    """Dataset pairing each raw text with its label.

    No tokenization happens here; items come back as plain dicts.
    """

    def __init__(self, texts, labels):
        """Keep references to the parallel `texts` and `labels` sequences."""
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples, taken from the length of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at `idx` as ``{'text': ..., 'label': ...}``."""
        return dict(text=self.texts[idx], label=self.labels[idx])

def collate_fn(batch):
    """Turn a list of ``{'text', 'label'}`` items into one model-ready batch.

    The texts are tokenized together with the module-level `tokenizer`
    (padded, truncated, returned as PyTorch tensors) and the labels are
    packed into a tensor.

    Args:
        batch: List of dicts, each with a 'text' and a 'label' entry.

    Returns:
        dict: 'input_ids', 'attention_mask', and 'labels' tensors.
    """
    texts, labels = [], []
    for item in batch:
        texts.append(item['text'])
        labels.append(item['label'])

    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(labels),
    }

# Wrap each split's texts and integer labels in a CustomDataset
train_dataset, test_dataset, val_dataset = (
    CustomDataset(frame['text'].tolist(), frame['encoded_label'].tolist())
    for frame in (train_df, test_df, val_df)
)

# Batch the datasets; only the training data is shuffled
loader_options = dict(batch_size=8, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Training parameters
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is
# deprecated upstream. eps=1e-6 and weight_decay=0.0 reproduce that class's
# defaults (torch defaults are 1e-8 and 0.01), so optimisation is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

def _collect_predictions(model, loader, device, desc):
    """Run `model` over `loader` in eval mode and gather predictions.

    Shared by the per-epoch validation pass and the final test pass so both
    are evaluated by exactly the same code.

    Args:
        model: Classifier returning an output object with a `logits` attribute.
        loader: DataLoader yielding dicts of tensors that include a 'labels' key.
        device: Device the model lives on; inputs are moved there.
        desc: Text shown on the tqdm progress bar.

    Returns:
        tuple[list, list]: Predicted class ids and true class ids, in loader order.
    """
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits

            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels are only compared on the CPU, so they are not moved to the device.
            targets.extend(batch['labels'].cpu().numpy())
    return preds, targets


# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    val_preds, val_labels = _collect_predictions(
        model, val_loader, device, f'Validation {epoch + 1}/{num_epochs}'
    )
    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_loader)}, Validation Accuracy: {accuracy}')

# Testing
test_preds, test_labels = _collect_predictions(model, test_loader, device, 'Testing')
test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {test_accuracy}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 134/134 [00:07<00:00, 18.22it/s]
Validation 1/3: 100%|██████████| 20/20 [00:00<00:00, 76.44it/s]


Epoch 1/3, Train Loss: 0.34039023838270066, Validation Accuracy: 1.0


Epoch 2/3: 100%|██████████| 134/134 [00:05<00:00, 23.07it/s]
Validation 2/3: 100%|██████████| 20/20 [00:00<00:00, 95.74it/s]


Epoch 2/3, Train Loss: 0.02572011385363207, Validation Accuracy: 1.0


Epoch 3/3: 100%|██████████| 134/134 [00:06<00:00, 21.41it/s]
Validation 3/3: 100%|██████████| 20/20 [00:00<00:00, 98.81it/s]


Epoch 3/3, Train Loss: 0.01070969463534542, Validation Accuracy: 1.0


Testing: 100%|██████████| 39/39 [00:00<00:00, 96.06it/s]

Test Accuracy: 1.0





In [None]:
import joblib

# Save the trained model and tokenizer into one directory
model_path = 'distilbert_classifier'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# Keep the label encoder inside the model directory so it travels with the
# checkpoint: archiving or copying `model_path` then includes it. Without the
# encoder, predicted class ids cannot be decoded back to label names.
label_encoder_path = f'{model_path}/label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_path)

['label_encoder.pkl']

In [None]:
# Load the saved model
# The model and tokenizer are read back from the directory at `model_path`.
loaded_model = DistilBertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = DistilBertTokenizer.from_pretrained(model_path)
# NOTE(review): joblib files are pickle-based, so only load encoder files
# that this notebook itself produced.
loaded_label_encoder = joblib.load(label_encoder_path)

In [None]:
from transformers import pipeline

# Function to classify input text
def classify_text(text, model, tokenizer, label_encoder):
    """Predict the label of a single text with a sequence-classification model.

    The model is switched to evaluation mode and the tokenized inputs are
    moved to the model's device, so the function works both for a freshly
    trained model (possibly on GPU and in train mode) and for a reloaded one.

    Args:
        text: Input string to classify.
        model: ``torch.nn.Module`` that accepts the tokenizer output as
            keyword arguments and returns an object with a ``logits`` attribute.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', padding=True, truncation=True)``
            that returns a mapping of tensors.
        label_encoder: Object whose ``inverse_transform`` maps a list of class
            ids back to labels.

    Returns:
        The decoded label of the highest-scoring class.
    """
    # Inference must run in eval mode; a model left in train mode keeps
    # dropout active and gives non-deterministic predictions.
    model.eval()

    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Move the inputs to wherever the model's weights live (CPU or GPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    # Decode the predicted label using the supplied label_encoder
    decoded_label = label_encoder.inverse_transform([predicted_class])[0]

    return decoded_label

In [None]:
# Example usage
# The sample text is a pasted Python traceback rather than ordinary feedback;
# it is classified with the reloaded model, tokenizer, and label encoder.
input_text = """---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-26-63fbef0fd172> in <cell line: 6>()
      4 tokenizer.save_pretrained(model_path)
      5 label_encoder_path = 'label_encoder.pkl'
----> 6 label_encoder.save(label_encoder_path)

AttributeError: 'LabelEncoder' object has no attribute 'save'"""
predicted_label = classify_text(input_text, loaded_model, loaded_tokenizer, loaded_label_encoder)

print(f"Predicted Label: {predicted_label}")

Predicted Label: ques


In [None]:
# Recursively archive the saved model directory into /content/pnq.zip.
!zip -r /content/pnq.zip /content/distilbert_classifier

  adding: content/distilbert_classifier/ (stored 0%)
  adding: content/distilbert_classifier/model.safetensors (deflated 8%)
  adding: content/distilbert_classifier/special_tokens_map.json (deflated 42%)
  adding: content/distilbert_classifier/config.json (deflated 49%)
  adding: content/distilbert_classifier/tokenizer_config.json (deflated 75%)
  adding: content/distilbert_classifier/vocab.txt (deflated 53%)


In [None]:
from google.colab import files
# Ask Colab to send the zip archive from the runtime to the local machine.
files.download("/content/pnq.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>