In [2]:
# 1. Install the transformers library
!pip install transformers -q

# 2. Import the tools
from transformers import pipeline

# 3. Download a more stable AI model
# Model: "Hate-speech-CNERG/bengali-abusive-MuRIL"
# This model uses MuRIL (Multilingual Representations for Indian Languages) by Google
print("Downloading the AI brain... (this make take 30-60 seconds)")
classifier = pipeline("text-classification", model="Hate-speech-CNERG/bengali-abusive-MuRIL")

# 4. Let's test it!
# LABEL_1 = Abusive/Hate
# LABEL_0 = Normal

# Test Case 1: Hate Speech
text_1 = "তুই একটা চোর এবং প্রতারক"  # (You are a thief and a fraud)
result_1 = classifier(text_1)

# Test Case 2: Normal Speech
text_2 = "আমি বাংলাদেশকে ভালোবাসি"   # (I love Bangladesh)
result_2 = classifier(text_2)

print(f"\nSentence: {text_1}")
print(f"Prediction: {result_1} (LABEL_1 means Abusive)")

print(f"\nSentence: {text_2}")
print(f"Prediction: {result_2} (LABEL_0 means Normal)")

Downloading the AI brain... (this make take 30-60 seconds)


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/950M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu



Sentence: তুই একটা চোর এবং প্রতারক
Prediction: [{'label': 'LABEL_1', 'score': 0.5720322132110596}] (LABEL_1 means Abusive)

Sentence: আমি বাংলাদেশকে ভালোবাসি
Prediction: [{'label': 'LABEL_0', 'score': 0.9787150621414185}] (LABEL_0 means Normal)


In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. CREATE A MINI DATASET
data = [
    {"text": "পদ্মা সেতুতে মানুষের মাথা লাগবে", "label": 1},      # Fake
    {"text": "করোনা ভাইরাসে টমেটো খেলে রোগ সারে", "label": 1},    # Fake
    {"text": "আমেরিকা বাংলাদেশের ভিসা বন্ধ করে দিয়েছে", "label": 1}, # Fake
    {"text": "আগামীকাল থেকে স্কুল কলেজ বন্ধ ঘোষণা", "label": 1},    # Fake
    {"text": "ইলিশ মাছের দাম কমেছে", "label": 0},                # Real
    {"text": "আজ ঢাকায় বৃষ্টি হতে পারে", "label": 0},               # Real
    {"text": "বাংলাদেশ ক্রিকেট দল জিতেছে", "label": 0},             # Real
    {"text": "প্রধানমন্ত্রী আজ ভাষণ দেবেন", "label": 0}              # Real
]

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# 2. PREPARE THE BRAIN
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- THE FIX IS HERE ---
def tokenize_function(examples):
    # We force every sentence to be exactly 128 tokens long
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
# -----------------------

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. SET UP THE TEACHER
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Small batch size for Colab
    report_to="none"                # Stops the "wandb" popup you saw earlier
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# 4. START TRAINING
print("Started Training...")
trainer.train()
print("Training Complete! You have successfully fine-tuned a Transformer model.")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training...


Step,Training Loss


Training Complete! You have successfully fine-tuned a Transformer model.


In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. LOAD THE REAL DATA
# We read the file you just uploaded
print("Loading dataset...")
try:
    df = pd.read_csv("data.csv")
    print(f"Success! Loaded {len(df)} news articles.")
except:
    print("ERROR: I cannot find 'data.csv'. Did you upload it and rename it?")

# 2. CLEAN THE DATA (Crucial Step)
# The dataset might have different column names. We standardize them.
# We look for columns like "content" or "headline" and "label"
if "content" in df.columns:
    df = df.rename(columns={"content": "text"})
elif "headline" in df.columns:
    df = df.rename(columns={"headline": "text"})

# Filter to ensure we only have text and labels
df = df[["text", "label"]]
df = df.dropna() # Remove empty rows

# Take a smaller sample for speed (Optional - remove this line to train on EVERYTHING)
# For now, let's train on 2,000 random articles to verify it works quickly.
df = df.sample(2000)

dataset = Dataset.from_pandas(df)

# 3. SPLIT DATA (Train vs Test)
# We keep 20% of data hidden to test the AI later
dataset = dataset.train_test_split(test_size=0.2)

# 4. PREPARE THE BRAIN
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5. SET UP THE TEACHER
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,              # 2 loops over the data
    per_device_train_batch_size=8,   # Process 8 articles at a time
    evaluation_strategy="epoch",     # Test the model after every loop
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# 6. START REAL TRAINING
print("Starting training on Real Data...")
trainer.train()

Loading dataset...
ERROR: I cannot find 'data.csv'. Did you upload it and rename it?


ValueError: Cannot take a larger sample than population when 'replace=False'

In [6]:
import os
print("Current files in directory:")
print(os.listdir("."))

Current files in directory:
['.config', 'data.csv', 'wandb', 'results', 'sample_data']


In [7]:
import pandas as pd

# 1. Read the file
print("Reading data.csv...")
try:
    df = pd.read_csv("data.csv")
    print(f"✅ Loaded successfully! Total rows: {len(df)}")

    # 2. Show the column names
    print("\nColumn Names found:", list(df.columns))

    # 3. Show the first few rows
    print("\nFirst 5 rows:")
    print(df.head())

except Exception as e:
    print("❌ Error reading the file. Details:")
    print(e)
    print("\nTIP: Did you rename a .ZIP file to .csv? That won't work. You must unzip it first.")

Reading data.csv...
❌ Error reading the file. Details:
Error tokenizing data. C error: EOF inside string starting at row 7428

TIP: Did you rename a .ZIP file to .csv? That won't work. You must unzip it first.


In [8]:
import pandas as pd

print("Attempting to fix and load data.csv...")

try:
    # 1. We use engine='python' which is more forgiving with quotes
    # 2. on_bad_lines='skip' tells it to ignore the broken row 7428
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")

    print(f"✅ Success! We skipped the broken lines. Loaded {len(df)} rows.")

    # Show us the columns so we can finally start training
    print("\nColumn Names found:", list(df.columns))
    print("\nFirst 5 rows:")
    print(df.head())

except Exception as e:
    print("❌ Still failed. Let's try one more trick.")
    print(e)

Attempting to fix and load data.csv...
✅ Success! We skipped the broken lines. Loaded 9513 rows.

Column Names found: ['articleID', 'domain', 'date', 'category', 'headline', 'content', 'label']

First 5 rows:
   articleID          domain                 date   category  \
0          1  jagonews24.com  2018-09-19 17:48:18  Education   
1          2  jagonews24.com  2018-09-19 17:48:19   National   
2          3  jagonews24.com  2018-09-19 17:48:20   National   
3          4  jagonews24.com  2018-09-19 17:48:21      Crime   
4          5  jagonews24.com  2018-09-19 17:48:21   National   

                                            headline  \
0   হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত, ৬ জনকে শোকজ   
1    মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারিশ   
2  প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...   
3  মেডিয়েশনই মামলাজট নিরসনের পথ : বিচারপতি আহমেদ ...   
4         টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু   

                                             content  label  
0  গত ১

In [9]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import sys

# 1. LOAD THE DATA (With the fix you found)
print("Loading data...")
try:
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")
    print(f"✅ Loaded {len(df)} rows.")
except:
    print("❌ Error: Could not find data.csv")
    sys.exit()

# 2. PREPARE THE DATA
# We use the 'content' column (body of the news) and rename it to 'text' for the AI
df = df.rename(columns={"content": "text"})
df = df[["text", "label"]]
df = df.dropna()

# For speed, we train on 2,000 random articles.
# (For your final paper, you can remove this line to use all 9,000)
if len(df) > 2000:
    df = df.sample(2000)

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2) # 80% for training, 20% for testing

# 3. PREPARE THE BRAIN (BanglaBERT)
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    # Truncate to 128 tokens to keep it fast
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("Tokenizing data... (Converting text to numbers)")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. START TRAINING
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,              # Loop over data 2 times
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",     # Check accuracy after every loop
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

print("\n🚀 Starting Training... (This will take 5-10 minutes)")
trainer.train()

# 5. SHOW RESULTS
print("\n🎉 Training Complete!")
results = trainer.evaluate()
print(f"Final Accuracy Loss: {results['eval_loss']:.4f}")
print("(Lower loss is better. If it's under 0.5, your model is learning!)")

Loading data...


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



❌ Error: Could not find data.csv
Traceback (most recent call last):
  File "/tmp/ipython-input-2637374051.py", line 9, in <cell line: 0>
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 626, in _read
    return parser.read(nrows)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1923, in read
    ) = self._engine.read(  # type: ignore[attr-defined]
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/python_parser.py", line 252, in read
    content = self._get_lines(rows)
              ^^^^^^^^^^

TypeError: object of type 'NoneType' has no len()

In [10]:
import os

file_path = "data.csv"

if os.path.exists(file_path):
    size = os.path.getsize(file_path)
    print(f"File: {file_path}")
    print(f"Size: {size} bytes")

    if size == 0:
        print("❌ CRITICAL ERROR: The file is empty!")
        print("SOLUTION: Delete 'data.csv' from the sidebar and upload it again.")
    else:
        print("✅ File size looks okay. We can try reading it again.")
else:
    print("❌ File not found.")

File: data.csv
Size: 76546048 bytes
✅ File size looks okay. We can try reading it again.


In [11]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. LOAD DATA (Safer Method)
print("Loading data...")
try:
    # Try default UTF-8 first
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")
except:
    print("⚠️ Standard load failed. Trying alternate encoding...")
    # Fallback for weird file formats
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip", encoding="latin-1")

print(f"✅ Loaded {len(df)} rows.")

# 2. PREPARE DATA
# Rename 'content' to 'text' if needed
if "content" in df.columns:
    df = df.rename(columns={"content": "text"})
elif "headline" in df.columns:
    df = df.rename(columns={"headline": "text"})

# Ensure we have the right columns
df = df[["text", "label"]]
df = df.dropna()

# SAMPLE DATA (To make it fast for you right now)
if len(df) > 2000:
    df = df.sample(2000)

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

# 3. PREPARE THE BRAIN
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. TRAIN
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

print("\n🚀 Starting Training...")
trainer.train()

Loading data...
⚠️ Standard load failed. Trying alternate encoding...
✅ Loaded 15836 rows.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [12]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. LOAD DATA (We use the method that worked for you just now)
print("Loading data...")
try:
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")
except:
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip", encoding="latin-1")

print(f"✅ Loaded {len(df)} rows.")

# 2. PREPARE DATA
if "content" in df.columns:
    df = df.rename(columns={"content": "text"})
elif "headline" in df.columns:
    df = df.rename(columns={"headline": "text"})

df = df[["text", "label"]]
df = df.dropna()

# Sample 2000 rows for speed
if len(df) > 2000:
    df = df.sample(2000)

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

# 3. PREPARE BRAIN
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("Tokenizing...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. TRAIN
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    eval_strategy="epoch",  # <--- FIXED HERE
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

print("\n🚀 Starting Training... (Watch the Loss go down!)")
trainer.train()

Loading data...
✅ Loaded 17412 rows.
Tokenizing...


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Starting Training... (Watch the Loss go down!)




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. LOAD DATA (We use the method that worked for you just now)
print("Loading data...")
try:
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip")
except:
    df = pd.read_csv("data.csv", engine="python", on_bad_lines="skip", encoding="latin-1")

print(f"✅ Loaded {len(df)} rows.")

# 2. PREPARE DATA
if "content" in df.columns:
    df = df.rename(columns={"content": "text"})
elif "headline" in df.columns:
    df = df.rename(columns={"headline": "text"})

df = df[["text", "label"]]
df = df.dropna()

# Sample 2000 rows for speed
if len(df) > 2000:
    df = df.sample(2000)

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

# 3. PREPARE BRAIN
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("Tokenizing...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. TRAIN
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    eval_strategy="epoch",  # <--- FIXED HERE
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

print("\n🚀 Starting Training... (Watch the Loss go down!)")
trainer.train()

Loading data...
✅ Loaded 8856 rows.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Tokenizing...


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Starting Training... (Watch the Loss go down!)


Epoch,Training Loss,Validation Loss
1,No log,1.7e-05
2,No log,1.2e-05


TrainOutput(global_step=400, training_loss=0.001264352947473526, metrics={'train_runtime': 107.8138, 'train_samples_per_second': 29.681, 'train_steps_per_second': 3.71, 'total_flos': 210488844288000.0, 'train_loss': 0.001264352947473526, 'epoch': 2.0})

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Get predictions from the model
print("Testing model on 400 unseen news articles...")
predictions = trainer.predict(tokenized_datasets["test"])

# 2. Convert the 'logits' (raw scores) into '0' or '1'
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

# 3. Calculate the metrics
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
acc = accuracy_score(labels, preds)

print(f"\n🏆 FINAL RESULTS FOR YOUR PAPER 🏆")
print(f"-----------------------------------")
print(f"Accuracy:  {acc * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall:    {recall * 100:.2f}%")
print(f"F1 Score:  {f1 * 100:.2f}%")
print(f"-----------------------------------")

Testing model on 400 unseen news articles...



🏆 FINAL RESULTS FOR YOUR PAPER 🏆
-----------------------------------
Accuracy:  100.00%
Precision: 100.00%
Recall:    100.00%
F1 Score:  100.00%
-----------------------------------
