In [1]:
!pip install tf-keras



In [3]:
!pip install torch



In [7]:
from datasets import load_dataset

In [11]:
dataset = load_dataset(
    'csv', 
    data_files={'train': 'train.tsv', 'val': 'val.tsv','test': 'test.tsv'},
    delimiter='\t'
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 3130
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3132
    })
})

In [22]:
##Natural Language Processing
##Generate the Tokenizer and Data Collator

In [20]:
from transformers import DistilBertTokenizer, DataCollatorWithPadding


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


sample_texts = ["This is a sample sentence.", "This is another example sentence."]
tokenized_inputs = tokenizer(sample_texts, padding=True, truncation=True, max_length=15)

print(tokenized_inputs)

{'input_ids': [[101, 2023, 2003, 1037, 7099, 6251, 1012, 102], [101, 2023, 2003, 2178, 2742, 6251, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [24]:
##Tokenize the dataset

In [26]:
from datasets import load_dataset
from transformers import DistilBertTokenizer

In [28]:
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, max_length=15, padding='max_length')


tokenized_datasets = dataset.map(tokenize_function, batched=True)


print(tokenized_datasets['train'][0])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3130 [00:00<?, ? examples/s]

Map:   0%|          | 0/3132 [00:00<?, ? examples/s]

{'text': 'both sides need to calm the fuck down or we are heading for dark times .', 'label': 1, 'input_ids': [101, 2119, 3903, 2342, 2000, 5475, 1996, 6616, 2091, 2030, 2057, 2024, 5825, 2005, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [32]:
#Define the model

In [30]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
print(model)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [38]:
##Define the training arguments

In [81]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',           
    evaluation_strategy='epoch',       
    per_device_train_batch_size=128,  
    per_device_eval_batch_size=128,    
    )
print(training_args)


TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [40]:
##Load the metrics

In [48]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [50]:
import evaluate

accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics(p):
    logits, labels = p
    predictions = logits.argmax(axis=-1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [54]:
## Initialize the Trainer

In [60]:
from transformers import Trainer

In [62]:
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=tokenized_datasets['train'],         
    eval_dataset=tokenized_datasets['val'],            
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.510538,0.85016,0.856771,0.840895,0.848758
2,No log,0.315119,0.886901,0.939086,0.827476,0.879755
3,No log,0.281275,0.894249,0.924931,0.858147,0.890288


TrainOutput(global_step=24, training_loss=0.44787728786468506, metrics={'train_runtime': 168.2467, 'train_samples_per_second': 17.831, 'train_steps_per_second': 0.143, 'total_flos': 11642642460000.0, 'train_loss': 0.44787728786468506, 'epoch': 3.0})

In [64]:
#Evaluate the model

In [66]:
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.2812749445438385, 'eval_accuracy': 0.8942492012779553, 'eval_precision': 0.9249311294765841, 'eval_recall': 0.85814696485623, 'eval_f1': 0.890288365926417, 'eval_runtime': 26.0323, 'eval_samples_per_second': 120.235, 'eval_steps_per_second': 0.96, 'epoch': 3.0}


In [69]:
test_results = trainer.predict(tokenized_datasets['test'])

logits = test_results.predictions
true_labels = test_results.label_ids
metrics = test_results.metrics

print("Test Metrics:", metrics)

Test Metrics: {'test_loss': 0.2834629416465759, 'test_accuracy': 0.894955300127714, 'test_precision': 0.9316120027913468, 'test_recall': 0.8524904214559387, 'test_f1': 0.8902967655885295, 'test_runtime': 26.5044, 'test_samples_per_second': 118.169, 'test_steps_per_second': 0.943}


In [71]:
#Extract the predictions (class 0 or 1) from the logits

In [73]:
import numpy as np

predicted_classes = np.argmax(logits, axis=-1)
print("Predicted Classes (sample):", predicted_classes[:10])
print("True Labels (sample):", true_labels[:10])


Predicted Classes (sample): [0 1 0 1 1 1 1 1 1 1]
True Labels (sample): [1 1 1 1 1 1 1 1 1 1]


In [77]:
from sklearn.metrics import classification_report, confusion_matrix

report = classification_report(true_labels, predicted_classes, target_names=['Class 0', 'Class 1'])
print("Classification Report:\n", report)

conf_matrix = confusion_matrix(true_labels, predicted_classes)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

     Class 0       0.86      0.94      0.90      1566
     Class 1       0.93      0.85      0.89      1566

    accuracy                           0.89      3132
   macro avg       0.90      0.89      0.89      3132
weighted avg       0.90      0.89      0.89      3132

Confusion Matrix:
 [[1468   98]
 [ 231 1335]]


In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [85]:
pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    MultinomialNB()
)

In [87]:
pipeline.fit(dataset['train']['text'], dataset['train']['label'])
from sklearn.metrics import classification_report
pipeline.predict(dataset['train']['text'])
print(classification_report(dataset['train']['text'],dataset['train']['text']))

                                                                                                                   precision    recall  f1-score   support

                                                                           " * * where the fuck do you live ? * *       1.00      1.00      1.00         1
                                                              " i 'm drunk as shit , i 'm soooo hammered dude " ?       1.00      1.00      1.00         1
                                                            ( gives a whole new meaning to save you doesnt it ! )       1.00      1.00      1.00         1
                                                                                * * they hit the fucking pentagon       1.00      1.00      1.00         1
                                                       * how to embed fonts in fucking microsoft word documents .       1.00      1.00      1.00         2
                                                                     