In [2]:
!pip install datasets



In [3]:
# Import necessary libraries
import pandas as pd


# Location of the zipped movie-plots CSV inside the Colab runtime.
dataset_path = '/content/Wikipedia.zip'

import sys

import datasets
from datasets import load_dataset, Dataset
from transformers import pipeline

# Load the dataset. pandas infers zip compression from the ".zip" extension,
# so the archive is read directly; it does not need to be on sys.path
# (that list is only used to locate importable Python modules).
df = pd.read_csv(dataset_path)


In [4]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')

In [5]:
# Preview the first rows of the frame as a quick sanity check of the load.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [6]:
# Display the 'Plot' column, the free-text field used by every task below.
df['Plot']

0        A bartender is working at a saloon, serving dr...
1        The moon, painted with a smiling face hangs ov...
2        The film, just over a minute long, is composed...
3        Lasting just 61 seconds and consisting of two ...
4        The earliest known adaptation of the classic f...
                               ...                        
34881    The film begins in 1919, just after World War ...
34882    Two musicians, Salih and Gürkan, described the...
34883    Zafer, a sailor living with his mother Döndü i...
34884    The film centres around a young woman named Am...
34885    The writer Orhan Şahin returns to İstanbul aft...
Name: Plot, Length: 34886, dtype: object

# **Text Summarization**

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained T5 checkpoint together with its matching tokenizer.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Encode the plot at row label 2, prefixed with the "summarize: " task prompt
# and truncated to at most 512 tokens.
inputs = tokenizer(
    "summarize: " + df.loc[2, 'Plot'],
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Generate the summary token ids with 4-beam search, bounded to 40-150 tokens.
summary_ids = model.generate(
    inputs['input_ids'],
    num_beams=4,
    early_stopping=True,
    length_penalty=2.0,
    min_length=40,
    max_length=150,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Decode the first generated sequence back to text, dropping special tokens.
# NOTE: this relies on `tokenizer` still being the T5 tokenizer; later cells
# rebind the same name, so run this cell before them.
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

in the first shot, a girl sits at the base of an altar or tomb. at the center of the altar, a viewing portal displays portraits of three presidents. in the second shot, an assassin kneels feet of Lady justice.


# **Named Entity Recognition (NER)**

In [9]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Tokenizer and weights are both loaded from the same NER checkpoint.
ner_checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

# Build the token-classification pipeline and tag the plot at row label 0.
ner_pipeline = pipeline("ner", tokenizer=tokenizer, model=model)
ner_results = ner_pipeline(df.loc[0, 'Plot'])
print(ner_results)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-MISC', 'score': 0.99806994, 'index': 22, 'word': 'Irish', 'start': 98, 'end': 103}, {'entity': 'I-PER', 'score': 0.9987496, 'index': 30, 'word': 'Carrie', 'start': 128, 'end': 134}, {'entity': 'I-PER', 'score': 0.9994307, 'index': 31, 'word': 'Nation', 'start': 135, 'end': 141}, {'entity': 'I-MISC', 'score': 0.99821067, 'index': 41, 'word': 'Irish', 'start': 191, 'end': 196}, {'entity': 'I-PER', 'score': 0.9993765, 'index': 92, 'word': 'Nation', 'start': 419, 'end': 425}]


# **Text Classification**

In [10]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from datasets import Dataset

# Encode genres: map every distinct genre string to an integer class id.
# (`idx` instead of `id` so the builtin is not shadowed.)
genre_labels = {genre: idx for idx, genre in enumerate(df['Genre'].unique())}
df['genre_id'] = df['Genre'].map(genre_labels)

# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every downstream result, reproducible across re-runs.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(genre_labels))

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["Plot"], padding="max_length", truncation=True)

# tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
# tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# #Define training arguments
# training_args = TrainingArguments(
#                   output_dir="./results",
#                   learning_rate=2e-5,
#                   per_device_train_batch_size=4,
#                   per_device_eval_batch_size=4,
#                   num_train_epochs=3,
#                   weight_decay=0.01,
#                   evaluation_strategy="epoch",
#                 )

# #Define the Trainer
# trainer = Trainer(
#                     model=model,
#                     args=training_args,
#                     train_dataset=tokenized_train_dataset,
#                     eval_dataset=tokenized_test_dataset,
#                   )

# #Train the model
# trainer.train()


In [11]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classifier loaded later.
model_ckpt = "distilbert-base-uncased"
# Rebinds `tokenizer`, replacing the NER tokenizer assigned in an earlier cell.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenization function
def tokenize(batch):
    """Encode the "Plot" texts of a batch with the module-level ``tokenizer``.

    Args:
        batch: Mapping whose "Plot" entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts, produced with padding and
        truncation both enabled.
    """
    plot_texts = batch["Plot"]
    encoded = tokenizer(plot_texts, truncation=True, padding=True)
    return encoded

# Tokenize both splits. batch_size=None hands each split to `tokenize` as one
# single batch, so padding=True pads a whole split to one common length.
tokenized_train_dataset, tokenized_test_dataset = [
    split.map(tokenize, batched=True, batch_size=None)
    for split in (train_dataset, test_dataset)
]

Map:   0%|          | 0/31397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3489 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoModelForSequenceClassification

# Size the classification head to the number of distinct genres.
num_labels = len(genre_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
#update the accelerate package
!pip install accelerate -U

!pip install transformers -U




In [14]:
# Updated Tokenization function that also includes the labels
def tokenize_and_include_labels(batch):
    """Tokenize the "Plot" texts of a batch and attach the class labels.

    Every example is padded to the tokenizer's maximum length rather than to
    the longest example of the current batch. ``Dataset.map`` feeds this
    function in chunks, so per-batch padding could give different chunks
    different lengths, which a collator that simply stacks examples cannot
    batch together.

    Args:
        batch: Mapping with a "Plot" entry (texts) and a "genre_id" entry
            (integer labels aligned with the texts).

    Returns:
        The tokenizer output with an added "labels" entry copied from
        ``batch["genre_id"]``.
    """
    tokenized_inputs = tokenizer(batch["Plot"], padding="max_length", truncation=True)
    # 'genre_id' is the column holding the target labels.
    tokenized_inputs['labels'] = batch['genre_id']
    return tokenized_inputs

# Re-tokenize both splits with the label-aware function; this replaces the
# label-free tokenized datasets built earlier.
tokenized_train_dataset, tokenized_test_dataset = [
    split.map(tokenize_and_include_labels, batched=True)
    for split in (train_dataset, test_dataset)
]


Map:   0%|          | 0/31397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3489 [00:00<?, ? examples/s]

In [15]:
from transformers import TrainingArguments

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# This notebook upgrades transformers without pinning a version, so pick
# whichever keyword the installed TrainingArguments actually accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,              # Total number of training epochs
    per_device_train_batch_size=30,   # Batch size per device during training
    per_device_eval_batch_size=30,    # Batch size for evaluation
    warmup_steps=10,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=5,
    save_strategy="epoch",           # Save model after each epoch
    load_best_model_at_end=True,     # Load the best model at the end of training
    **{_eval_strategy_arg: "epoch"},  # Evaluate after each epoch
)

In [16]:
from transformers import Trainer

# Wire the model, the training configuration and both tokenized splits together.
# The undefined, unused `DataLoaderConfiguration(...)` assignment was dropped:
# the name is never imported, so it raised NameError on a fresh run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=None,  # You can define a function to compute metrics, for instance, accuracy.
)


In [None]:
# Fine-tune the classifier with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Evaluate the trained model. The Trainer was created with compute_metrics=None,
# so no custom metrics are computed here.
trainer.evaluate()