In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): the .docx inputs loaded later are opened via relative paths rather
# than through this mount point — confirm the mount is still required.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [17]:
!pip install transformers torch pandas scikit-learn joblib tqdm python-docx



In [18]:
from docx import Document

def load_text_from_word(file_path, skip_empty=True):
    """Return the paragraphs of a Word (.docx) document as a list of strings.

    Args:
        file_path: Location of the .docx file, passed straight to ``Document``.
        skip_empty: When True (the default), paragraphs that are empty or contain
            only whitespace are left out. Pass False to get exactly one entry per
            paragraph, blanks included.

    Returns:
        list[str]: Paragraph texts in document order.
    """
    document = Document(file_path)
    paragraphs = [paragraph.text for paragraph in document.paragraphs]
    if skip_empty:
        # Blank spacer paragraphs would otherwise turn into empty-string samples
        # that still receive a class label further down the notebook.
        paragraphs = [text for text in paragraphs if text.strip()]
    return paragraphs

# Read both corpora (technical first, then non-technical) with the same loader.
technical_text, non_technical_text = map(
    load_text_from_word,
    ("technical.docx", "non-technical.docx"),
)

In [19]:
import pandas as pd

# Tag every paragraph of each corpus with its class name ...
technical_df = pd.DataFrame({'text': technical_text}).assign(label='technical')
non_technical_df = pd.DataFrame({'text': non_technical_text}).assign(label='non_technical')

# ... then stack the two frames into one dataset with a fresh 0..n-1 index.
df = pd.concat(
    [technical_df, non_technical_df],
    ignore_index=True,
)

In [20]:
# Preview the combined dataset (the rendered output elides the middle rows).
df

Unnamed: 0,text,label
0,How can I optimize SQL queries for improved da...,technical
1,How can I set up continuous integration and co...,technical
2,What is the role of a load balancer in a distr...,technical
3,"How do neural networks work, and how can I des...",technical
4,What are the main components of a microservice...,technical
...,...,...
3793,Reflecting on a moment when you encouraged som...,non_technical
3794,Talking about your favorite type of social jus...,non_technical
3795,Sharing your perspective on the importance of ...,non_technical
3796,Discussing a lesson you learned from a diverse...,non_technical


In [21]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels, then keep the integer codes next to them
# in a new 'encoded_label' column.
label_encoder = LabelEncoder().fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

In [22]:
from sklearn.model_selection import train_test_split

# First cut: 80% of the rows for training, 20% held out.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['encoded_label'],
    random_state=42,
)
# Second cut: halve the held-out rows into test and validation sets.
# Both cuts are stratified on the encoded label and use the same seed.
test_df, val_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['encoded_label'],
    random_state=42,
)

In [23]:
# Class distribution of the training split.
train_df['label'].value_counts()

technical        1838
non_technical    1200
Name: label, dtype: int64

In [24]:
# Class distribution of the test split.
test_df['label'].value_counts()

technical        230
non_technical    150
Name: label, dtype: int64

In [25]:
# Class distribution of the validation split.
val_df['label'].value_counts()

technical        230
non_technical    150
Name: label, dtype: int64

In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load and preprocess the data
# (Assuming you've already split the data into train_df, test_df, and val_df)

# Load the pretrained DistilBERT tokenizer together with a sequence-classification
# model whose output size equals the number of distinct encoded labels.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=df['encoded_label'].nunique(dropna=False),
)

# Encode labels
# NOTE(review): this repeats the fit done in the earlier label-encoding cell. The model
# construction just above already reads df['encoded_label'], so that earlier cell must
# have run before this one. Presumably the mapping produced here is identical — confirm,
# because the encoder fitted on these lines is the object that gets pickled later.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Tokenize and prepare input data
class CustomDataset(Dataset):
    """Pair raw text samples with their labels for use with a ``DataLoader``.

    No tokenization happens here: items are handed out exactly as stored, and
    turning them into tensors is left to the loader's collate function.
    """

    def __init__(self, texts, labels):
        """Store two parallel sequences of samples and labels.

        Args:
            texts: Indexable sequence of input texts.
            labels: Indexable sequence of labels, aligned with ``texts``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail at construction time instead of with an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``{'text': ..., 'label': ...}``."""
        return {'text': self.texts[idx], 'label': self.labels[idx]}

def collate_fn(batch):
    """Turn a list of ``{'text', 'label'}`` samples into one batch of tensors.

    The texts are tokenized together with the module-level ``tokenizer`` (padded
    and truncated, returned as PyTorch tensors) and the labels are packed into a
    single tensor.

    Returns:
        dict with the keys 'input_ids', 'attention_mask' and 'labels'.
    """
    texts, labels = [], []
    for sample in batch:
        texts.append(sample['text'])
        labels.append(sample['label'])

    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(labels),
    }

# Wrap each already-split frame (train, test, validation) in a CustomDataset.
train_dataset, test_dataset, val_dataset = (
    CustomDataset(frame['text'].tolist(), frame['encoded_label'].tolist())
    for frame in (train_df, test_df, val_df)
)

# Batched loaders: only the training data is shuffled; the evaluation loaders
# keep their order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader, val_loader = (
    DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
    for dataset in (test_dataset, val_dataset)
)

# Training parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Seed torch's RNG so that batch shuffling and dropout are repeatable between runs
# (same value as the random_state used for the data split).
torch.manual_seed(42)

# torch.optim.AdamW is used in place of transformers.AdamW, which the transformers
# library deprecated in favour of the PyTorch optimizer. eps and weight_decay are
# passed explicitly with the defaults of the transformers implementation so the
# optimisation settings stay what they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

def collect_predictions(model, loader, device, desc):
    """Run ``model`` over ``loader`` in eval mode and gather its predictions.

    Shared by the per-epoch validation pass and the final test pass.

    Args:
        model: Model called with the batch's non-label tensors as keyword
            arguments; its output must expose ``logits``.
        loader: Iterable of batches shaped like the output of ``collate_fn``
            (a dict of tensors that includes a 'labels' entry).
        device: Device the input tensors are moved to before the forward pass.
        desc: Caption for the tqdm progress bar.

    Returns:
        tuple[list, list]: predicted class ids and reference labels, in batch order.
    """
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only compared on the host, so they are not sent to the device.
            references.extend(batch['labels'].tolist())
    return predictions, references


# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation after every epoch
    val_preds, val_labels = collect_predictions(
        model, val_loader, device, f'Validation {epoch + 1}/{num_epochs}'
    )
    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_loader)}, Validation Accuracy: {accuracy}')

# Testing: one pass over the held-out test split once training has finished
test_preds, test_labels = collect_predictions(model, test_loader, device, 'Testing')
test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {test_accuracy}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 380/380 [00:26<00:00, 14.48it/s]
Validation 1/3: 100%|██████████| 48/48 [00:01<00:00, 30.30it/s]


Epoch 1/3, Train Loss: 0.09294130688864051, Validation Accuracy: 1.0


Epoch 2/3: 100%|██████████| 380/380 [00:20<00:00, 18.92it/s]
Validation 2/3: 100%|██████████| 48/48 [00:00<00:00, 92.07it/s]


Epoch 2/3, Train Loss: 0.0023305020442134458, Validation Accuracy: 1.0


Epoch 3/3: 100%|██████████| 380/380 [00:18<00:00, 21.09it/s]
Validation 3/3: 100%|██████████| 48/48 [00:00<00:00, 92.22it/s]


Epoch 3/3, Train Loss: 0.0009336068500966863, Validation Accuracy: 1.0


Testing: 100%|██████████| 48/48 [00:00<00:00, 93.80it/s]

Test Accuracy: 1.0





In [27]:
import joblib
# Persist everything inference needs: the model and the tokenizer are written into
# one shared directory, the fitted label encoder into a separate pickle file.
model_path, label_encoder_path = 'distilbert_classifier', 'label_encoder.pkl'

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
joblib.dump(label_encoder, label_encoder_path)

['label_encoder.pkl']

In [28]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import joblib

# Load the saved model
# Model and tokenizer are read back from the directory they were saved to; the label
# encoder comes from its own pickle file.
loaded_model = DistilBertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = DistilBertTokenizer.from_pretrained(model_path)
# joblib.load unpickles the file, so only point it at files from a trusted source.
loaded_label_encoder = joblib.load(label_encoder_path)

In [29]:
from transformers import pipeline

# Function to classify input text
def classify_intent(text, model, tokenizer, label_encoder):
    """Predict the label of a single piece of text.

    Args:
        text: Input string to classify.
        model: Classification model. It is called with the tokenizer output as
            keyword arguments and must return an object exposing ``logits``.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch tensors.
        label_encoder: Object whose ``inverse_transform`` maps class ids back to
            label names.

    Returns:
        The decoded label of the highest-scoring class.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Place the inputs on the device that holds the model's weights; otherwise a
    # model living on the GPU would be fed CPU tensors.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Perform inference in eval mode so dropout cannot perturb the prediction,
    # then hand the model back in whatever mode the caller left it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Get predicted label
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # Decode the predicted class id using the supplied label_encoder
    return label_encoder.inverse_transform([predicted_class])[0]

In [39]:
# Example usage: classify one sentence with the artefacts reloaded from disk.
input_text = "How to kill someone using technology?"
predicted_label = classify_intent(
    text=input_text,
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    label_encoder=loaded_label_encoder,
)

print(f"Predicted Label: {predicted_label}")

Predicted Label: non_technical


In [31]:
# Bundle the saved model directory into a single archive for download.
# Only that directory is archived; the label-encoder pickle is not part of the zip.
!zip -r /content/tech_nontech.zip /content/distilbert_classifier/

updating: content/distilbert_classifier/ (stored 0%)
updating: content/distilbert_classifier/model.safetensors (deflated 8%)
updating: content/distilbert_classifier/special_tokens_map.json (deflated 42%)
updating: content/distilbert_classifier/config.json (deflated 46%)
updating: content/distilbert_classifier/tokenizer_config.json (deflated 75%)
updating: content/distilbert_classifier/vocab.txt (deflated 53%)


In [32]:
from google.colab import files
# Download the zipped model directory and, separately, the label-encoder pickle.
files.download("/content/tech_nontech.zip")
files.download("/content/label_encoder.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>