In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sector/sector_unfiltered.csv


In [2]:
!pip install transformers --upgrade



In [3]:
!pip install peft datasets evaluate



In [4]:
!pip install accelerate



In [5]:
pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [6]:
df = pd.read_csv("/kaggle/input/sector/sector_unfiltered.csv")
df.head()

Unnamed: 0,id,text,vecID,summary_article,label,Comments,title,Words Per Article
0,57749,", /PRNewswire/ -- Boston Scientific (NYSE: ) h...",{'$numberLong': '8674496742022912'},This is the second consecutive year Boston Sci...,['Medical Technology'],[],,390
1,57750,"NEW YORK, Dec 22 (Reuters) - Nestle SA said on...",{'$numberLong': '8088413574956013'},”Bakus said Nestle had prioritized making the ...,['Food & Beverages Retail'],[],,390
2,57751,"BERLIN/PARIS (Reuters) - Eat misshapen veg, wa...",{'$numberLong': '8045220543830533'},“It is far easier to get a consumer to switch ...,['Consumer Services'],[],,1025
3,57752,Gift ArticleSharePope Francis has been called ...,{'$numberLong': '8011134654570867'},"“Sadly, even human rights can be used as a jus...",['None matched'],['this article is based on topic religion'],,1406
4,57753,"Quick! What do Italian jackets, British beef,...",{'$numberLong': '895805351371523'},"Quick! What do Italian jackets, British beef,...",['Forestry & Paper'],[],,527


In [7]:
df=df[['text','label']]

In [8]:
df.head()

Unnamed: 0,text,label
0,", /PRNewswire/ -- Boston Scientific (NYSE: ) h...",['Medical Technology']
1,"NEW YORK, Dec 22 (Reuters) - Nestle SA said on...",['Food & Beverages Retail']
2,"BERLIN/PARIS (Reuters) - Eat misshapen veg, wa...",['Consumer Services']
3,Gift ArticleSharePope Francis has been called ...,['None matched']
4,"Quick! What do Italian jackets, British beef,...",['Forestry & Paper']


In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
df['label']=label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,text,label
0,", /PRNewswire/ -- Boston Scientific (NYSE: ) h...",27
1,"NEW YORK, Dec 22 (Reuters) - Nestle SA said on...",14
2,"BERLIN/PARIS (Reuters) - Eat misshapen veg, wa...",12
3,Gift ArticleSharePope Francis has been called ...,29
4,"Quick! What do Italian jackets, British beef,...",16


In [10]:
df=pd.DataFrame(df)

In [11]:
from datasets import Dataset
dataset=Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 6096
})

In [12]:
dataset.info.features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [13]:
dataset=dataset.train_test_split(test_size=0.3,shuffle=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4267
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1829
    })
})

In [14]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification,DataCollatorWithPadding,TrainingArguments,Trainer

2024-03-12 07:11:25.343707: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 07:11:25.343761: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 07:11:25.345191: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
model_id='roberta-base'
model=AutoModelForSequenceClassification.from_pretrained(model_id,num_labels=len(df['label'].unique()))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
tokenizer=AutoTokenizer.from_pretrained(model_id)

In [17]:
from peft import LoraConfig, get_peft_model, TaskType
lora_config=LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

In [18]:
peft_model=get_peft_model(model,lora_config)


In [19]:
peft_model.print_trainable_parameters()

trainable params: 1,800,231 || all params: 126,475,854 || trainable%: 1.4233792009026482


In [20]:
def preprocess(examples):
    tokenized=tokenizer(examples['text'],truncation=True,padding=True)
    return tokenized

In [21]:
tokenized_dataset=dataset.map(preprocess,batched=True,batch_size=None,remove_columns=['text'])
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 4267
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1829
    })
})

In [22]:
train_dataset=tokenized_dataset['train']
eval_dataset=tokenized_dataset['test'].shard(num_shards=2,index=0)
test_dataset=tokenized_dataset['test'].shard(num_shards=2,index=1)

In [23]:
print(train_dataset)
print('-----------')
print(test_dataset)
print('-----------')
print(eval_dataset)

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 4267
})
-----------
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 914
})
-----------
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 915
})


In [24]:
data_collector=DataCollatorWithPadding(tokenizer=tokenizer,return_tensors='pt')

In [25]:
import evaluate

metric=evaluate.load('accuracy')

In [26]:
def compute_metrics(eval_pred):
    logits,labels=eval_pred
    predictions=np.argmax(logits,axis=-1)
    return metric._compute(predictions=predictions,references=labels)

In [37]:
training_args=TrainingArguments(output_dir="Lora__trainer",
                                evaluation_strategy="epoch"
                                ,num_train_epochs=3,
                               )#load_best_model_at_end=True

In [38]:
trainer=Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collector
)

In [39]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1458,1.13487,0.699454
2,1.0262,1.093673,0.693989
3,0.9752,1.086299,0.692896


TrainOutput(global_step=1602, training_loss=1.0437852047504705, metrics={'train_runtime': 592.1796, 'train_samples_per_second': 21.617, 'train_steps_per_second': 2.705, 'total_flos': 3439997018155008.0, 'train_loss': 1.0437852047504705, 'epoch': 3.0})

In [40]:
test_predictions=trainer.predict(test_dataset)

In [41]:
preds=np.argmax(test_predictions.predictions,axis=-1)

In [42]:
metric.compute(predictions=preds, references=test_predictions.label_ids)

{'accuracy': 0.6903719912472648}

In [43]:

from sklearn.metrics import classification_report

In [44]:
y_true =test_predictions.label_ids
y_pred=preds

In [45]:
y_true=label_encoder.inverse_transform(y_true)
y_pred=label_encoder.inverse_transform(y_pred)

In [46]:

print(classification_report(y_true, y_pred))

                                     precision    recall  f1-score   support

             ['Air Transportation']       0.85      0.94      0.89        18
             ['Alternative Energy']       0.50      0.45      0.48        22
             ['Apparel & Textiles']       0.91      0.90      0.91        59
                    ['Automobiles']       0.75      0.67      0.71         9
                      ['Beverages']       1.00      0.50      0.67         4
['Biotechnology & Pharmaceuticals']       0.60      0.76      0.67        33
                ['Capital Markets']       0.65      0.65      0.65        54
                      ['Chemicals']       0.00      0.00      0.00         3
                           ['Coal']       1.00      0.73      0.84        11
         ['Construction Materials']       0.00      0.00      0.00         2
['Consumer Discretionary Products']       0.67      0.69      0.68        29
          ['Consumer Goods Retail']       0.36      0.31      0.33        1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
