In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
pip install torch



In [None]:
# Step 1: Load and Preprocess the Data (same as before)
import pandas as pd
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch

# Read the raw tweets into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Tweets.csv')

# Define functions for preprocessing (same as before)
def clean_text(text):
    """Strip tweet noise from ``text`` and return it lower-cased.

    Removed, in this order:
      1. URLs: ``http`` followed by any run of non-whitespace characters.
      2. Mentions: ``@`` followed by word characters.
      3. Hashtags: ``#`` followed by word characters (the tag word is dropped too).
      4. Every remaining character that is not an ASCII letter or whitespace.

    Whitespace is left as-is, so gaps left by removed spans are not collapsed.
    """
    # Order matters: URLs, mentions and hashtags must be removed while their
    # punctuation is still present, before the final letters-only filter.
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'[^a-zA-Z\s]'):
        text = re.sub(pattern, '', text)
    return text.lower()

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    Leading and trailing whitespace produce no empty tokens; an empty or
    all-whitespace string yields an empty list.
    """
    tokens = text.split()
    return tokens

def join_tokens(tokens):
    """Join an iterable of string tokens into one string separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# Clean each tweet, split it into tokens, then re-join the tokens so that
# 'clean_text' ends up with single-space-separated words (the split/join
# round trip collapses the whitespace gaps left behind by clean_text).
df['clean_text'] = df['text'].map(clean_text)
df['tokens'] = df['clean_text'].map(tokenize)
df['clean_text'] = df['tokens'].map(join_tokens)

# Step 2: Split Data into Training and Testing Sets (same as before)
from sklearn.model_selection import train_test_split

# Features are the cleaned tweet strings; targets are the sentiment labels.
X, y = df['clean_text'], df['airline_sentiment']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Load Pre-trained Transformer Model (DistilBERT)
# Seed torch before building the model: the classification head
# (pre_classifier / classifier) is not in the checkpoint and is randomly
# initialised, so without a seed every run starts from different weights.
torch.manual_seed(42)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# num_labels=3 matches the three sentiment classes (negative / neutral / positive).
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Step 4: Tokenize and Encode Text Data
# Each split is encoded in one call: sequences are truncated to the
# tokenizer's maximum length and padded to the longest sequence in that
# split, and the result is returned as PyTorch tensors.
X_train_encodings = tokenizer(
    X_train.tolist(),
    truncation=True,
    padding=True,
    return_tensors='pt',
)
X_test_encodings = tokenizer(
    X_test.tolist(),
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Step 5: Fine-tune the Model
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the deprecated class's defaults so the
# update rule is unchanged (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}

# Series.map() yields NaN for any label missing from label_map; casting NaN
# to torch.long would silently produce garbage class ids, so fail loudly.
y_train_ids = y_train.map(label_map)
if y_train_ids.isna().any():
    unknown_labels = y_train[y_train_ids.isna()].unique().tolist()
    raise ValueError(f'Unmapped sentiment labels in y_train: {unknown_labels}')
y_train_tensor = torch.tensor(y_train_ids.values, dtype=torch.long)

# Define batch size
batch_size = 8

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

# Train the model (loop over epochs and batches)
for epoch in range(1):  # Adjust as needed
    # Walk the training set in contiguous slices of batch_size rows; the last
    # slice may be shorter.
    for i in range(0, len(y_train_tensor), batch_size):
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids=X_train_encodings['input_ids'][i:i+batch_size],
                        attention_mask=X_train_encodings['attention_mask'][i:i+batch_size],
                        labels=y_train_tensor[i:i+batch_size])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Step 6: Evaluate the Model
model.eval()  # disable dropout for deterministic predictions
y_test_tensor = torch.tensor(y_test.map(label_map).values, dtype=torch.long)

# Run inference in mini-batches: pushing the whole test set through the model
# in a single forward pass makes peak memory grow with the test-set size.
eval_batch_size = 32
batch_logits = []
with torch.no_grad():
    for start in range(0, len(y_test_tensor), eval_batch_size):
        stop = start + eval_batch_size
        outputs = model(input_ids=X_test_encodings['input_ids'][start:stop],
                        attention_mask=X_test_encodings['attention_mask'][start:stop])
        batch_logits.append(outputs.logits)

# Stitch the per-batch results back together so `logits` and `predictions`
# cover the full test set, in the original row order.
logits = torch.cat(batch_logits, dim=0)
predictions = torch.argmax(logits, dim=1)

# Fraction of test tweets whose predicted class id equals the true class id.
accuracy = (predictions == y_test_tensor).sum().item() / len(y_test_tensor)
print(f'Accuracy: {accuracy:.2f}')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.84
