In [None]:
# Make the user's Google Drive reachable from this Colab runtime so the
# notebook can read its input file and write its output file there.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

# Fix the random seed so any stochastic torch operation is repeatable
RANDOM_SEED = 1729
torch.manual_seed(RANDOM_SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model/tokenizer name or path (Hugging Face Hub id or local directory)
model_name_or_path = 'turingmachine/hupd-t5-small'

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# NOTE(review): T5 tokenizers normally ship with their own pad token; this
# override is kept from the original notebook -- confirm it is intended.
tokenizer.pad_token = tokenizer.eos_token

# Model
# AutoModelWithLMHead is deprecated; AutoModelForSeq2SeqLM is the documented
# replacement for encoder-decoder checkpoints (the checkpoint name indicates
# a T5 model -- TODO confirm against the model card).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model.to(device)

# Load the Excel file
# Only the first MAX_ROWS data rows are parsed, instead of reading the whole
# workbook and discarding everything after them.
INPUT_FILE = '/content/drive/MyDrive/All_Patent.xlsx'
MAX_ROWS = 50
df = pd.read_excel(INPUT_FILE, nrows=MAX_ROWS)

# Extract the desired columns
columns_to_keep = ['Filename', 'Abstract', 'Claims']  # Add other column names here
df_subset = df[columns_to_keep]

# Abstract text of each row; this is the only column passed to the model
abstracts = df['Abstract']

# Generated text, one entry per abstract and in the same order
generated_texts = []

# Generate text for each abstract
for abstract in abstracts:
    # Empty Excel cells are loaded as NaN (a float), which the tokenizer
    # rejects. Record an empty summary instead of crashing so that the list
    # stays aligned with the DataFrame rows.
    if not isinstance(abstract, str) or not abstract.strip():
        generated_texts.append('')
        continue

    inputs = tokenizer(abstract, return_tensors="pt").to(device)
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=512,
            num_return_sequences=1,
            early_stopping=True,
        )
    # A single sequence is returned per input, so decode the first one
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_texts.append(generated_text)

# Add the generated text to the DataFrame
# df_subset was sliced out of df, so assigning a column to it in place
# triggers pandas' SettingWithCopyWarning; .assign builds a new frame with
# the extra column instead.
df_subset = df_subset.assign(Abstract_Summary=generated_texts)

# Save the data frame to a new Excel file (row index is not written)
output_file = '/content/drive/MyDrive/abstract_summary_code.xlsx'
df_subset.to_excel(output_file, index=False)