In [1]:
!pip install transformers
!pip install torch




In [2]:
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch

In [3]:
file_path = 'merged_data.csv'
df = pd.read_csv(file_path)

In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kkysh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kkysh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
df['Text'] = df['Text'].fillna('')

In [8]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [9]:
df['Text'] = df['Text'].apply(preprocess_text)

In [10]:

for index, row in df.head(10).iterrows():
    print(f"Original Text: {row['Text']}")
    print(f"Preprocessed Text: {preprocess_text(row['Text'])}\n")


Original Text: usual would tear around live room play toy one look minion sent practic cataton megan plan got dress earlier seen movi almost mistak consid littl young pg cartoon older cousin along brother mason often expos thing older like think surround adult older kid one reason good talker age good boy said bare acknowledg babi blue remain focus televis movi almost megan knew better slip bedroom finish get readi time look mason face grate look noth like father platinum blond hair blue eye complet build take father megan diminut 5 3 davi 6 1 two hundr pound alreadi regist chart height weight accord pediatrician seen mason twice day born day came home hospit interest pictur email megan sent profession footbal career rise davi want shackl respons babi want spend time field parti hour night paid child support megan threaten wage garnish dread day mason old enough ask father never want anyth world hurt knew reject father would sigh step dress slid hip around get zipper way caus huff puff

In [17]:
df_shuffled = df.sample(frac=1, random_state=42)
subset_df = df_shuffled.head(5000)

print(subset_df)


                                                    Text  Label
26541  procedur perform dddr perman insert screw righ...    6.0
8588   respond 1 13 assist engin state madhya pradesh...    5.0
54173  struction detect input word match mask corresp...    7.0
55043  frontier commun corpor satellit oper dish netw...    2.0
53437  rnce signal speed ref speed limit speed ref sp...    7.0
...                                                  ...    ...
3767   sleep well ever sinc arriv feel new take grant...    3.0
1118   servant happi wo sleep zayn neither sabotag co...    3.0
21546  kerala forest act 1961 regul preserv forest fo...    5.0
16809  hindu execut direct wife sell properti utilis ...    5.0
7950   respond 1 obtain mortgag decre rs one rao raja...    5.0

[5000 rows x 2 columns]


In [18]:
subset_df.to_csv('merged_data_subset.csv', index=False)

In [21]:
label_distribution = subset_df['Label'].value_counts()

print("Label Distribution:")
print(label_distribution)


Label Distribution:
Label
5.0    1309
1.0    1096
4.0     983
3.0     483
6.0     390
2.0     348
7.0     250
8.0     141
Name: count, dtype: int64


In [33]:
df['Label'] = df['Label'] - 1
train_df, val_df = train_test_split(subset_df, test_size=0.2, random_state=42)

In [34]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=8)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
train_encodings = tokenizer(list(train_df['Text']), truncation=True, padding=True, max_length=256, return_tensors='pt')
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_df['Label'].values, dtype=torch.long)  # Assuming labels are integers
)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

  torch.tensor(train_encodings['input_ids']),
  torch.tensor(train_encodings['attention_mask']),


In [36]:
val_encodings = tokenizer(list(val_df['Text']), truncation=True, padding=True, max_length=256, return_tensors='pt')
val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_df['Label'].values, dtype=torch.long)  # Assuming labels are integers
)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

  torch.tensor(val_encodings['input_ids']),
  torch.tensor(val_encodings['attention_mask']),


In [37]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)



In [31]:
num_epochs = 3  # Adjust as needed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

[2. 7. 4. 5. 0. 3. 6. 1.]


In [38]:
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=list(categories.values()))
    
    print(f"Epoch {epoch + 1}, Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

Epoch 1 Training:   0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1 Training:   3%|▎         | 14/500 [06:26<3:43:39, 27.61s/it]


IndexError: Target 8 is out of bounds.