In [66]:
import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration

In [2]:
# specify data path and change directory
data_path = 'Data'
os.chdir(data_path)

In [3]:
# *** Note: added an "Introduction" line to S1387700313001822.txt to 
# make reading in data easier. This file was the only one to not 
# include Introduction line ***
# *** Note: switched "Highlights" and "Abstract" paragraph 
# locations to make reading in data easier for S016816561300552X.txt, 
# S1161030113001950.txt, and S1750583613004192.txt

In [55]:
# separate titles, abstracts, and texts
titles = []
abstracts = []
texts = []

for filename in os.listdir():
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            title = lines[0].strip()
            titles.append(title)
            
            abstract_text = ''
            intro_text = ''
            capturing_abstract = False
            capturing_intro = False
            
            # Loop through lines to extract abstracts and texts
            for line in lines:
                if 'Abstract' in line:
                    capturing_abstract = True
                elif 'Introduction' in line:
                    capturing_intro = True
                
                if capturing_abstract and not capturing_intro:
                    abstract_text += line.strip() + ' '
                elif capturing_intro:
                    intro_text += line.strip() + ' '
            
            abstracts.append(abstract_text)
            texts.append(intro_text)

In [62]:
print("Titles:\n", titles)
print("Abstracts:\n", abstracts)
print("Texts:\n", texts)

Titles:
 ['Sylow p-groups of polynomial permutations on the integers mod pn', 'The transterminator ion flow at Venus at solar minimum', 'The modelling of the toughening of epoxy polymers via silica nanoparticles: The effects of volume fraction and particle size', 'Flow structure and near-field dispersion in arrays of building-like obstacles', 'A memory access model for highly-threaded many-core architectures', 'Investigating the feasibility of scale up and automation of human induced pluripotent stem cells cultured in aggregates in feeder free conditions', 'Phosphorus levels in croplands of the European Union with implications for P fertilizer use', 'Chirality delivery through multiple and helical H-bonding from chiral coordination complex to its supramolecular architecture', 'The Sleipner storage site: Capillary flow modeling of a layered CO2 plume requires fractured shale barriers within the Utsira Formation', 'Combined analysis of sMRI and fMRI imaging data provides accurate disease

In [72]:
# put data into pandas DataFrame
data = pd.DataFrame({"title": titles,
                     "abstract": abstracts,
                     "text": texts})
data

Unnamed: 0,title,abstract,text
0,Sylow p-groups of polynomial permutations on t...,Abstract We enumerate and describe the Sylow p...,Introduction Fix a prime p and let n∈N. Every ...
1,The transterminator ion flow at Venus at solar...,Abstract The transterminator ion flow in the V...,Introduction The nightside ionosphere of Venus...
2,The modelling of the toughening of epoxy polym...,Abstract Silica nanoparticles possessing three...,Introduction Epoxy polymers are widely used in...
3,Flow structure and near-field dispersion in ar...,Abstract Dispersion in the near-field region o...,Introduction Understanding dispersion processe...
4,A memory access model for highly-threaded many...,"Abstract A number of highly-threaded, many-cor...","Introduction Highly-threaded, many-core device..."
5,Investigating the feasibility of scale up and ...,Abstract The transfer of a laboratory process ...,Introduction Human induced pluripotent stem ce...
6,Phosphorus levels in croplands of the European...,Abstract In the frame of the Land Use/Land Cov...,Introduction Soil represents a temporary reser...
7,Chirality delivery through multiple and helica...,Abstract The path of the chirality delivery in...,Introduction The chirality delivery is a growi...
8,The Sleipner storage site: Capillary flow mode...,Abstract To prevent ocean acidification and mi...,Introduction Climate change and ocean acidific...
9,Combined analysis of sMRI and fMRI imaging dat...,"Abstract In this research, we developed a robu...",Introduction It has been estimated that approx...


In [73]:
# initialize tokenizer & BART model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [74]:
# Function to generate summaries
def generate_summary(row):
    input_text = f"{row['abstract']} {row['text']}"  # Combine abstract and text for summarization
    tokenized_input = tokenizer([input_text], max_length=1024, return_tensors='pt')
    summary_ids = model.generate(tokenized_input['input_ids'], num_beams=4, max_length=150, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [75]:
# Apply the function to each row in the DataFrame to generate summaries
data['summary'] = data.apply(generate_summary, axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [76]:
print(data)

                                               title  \
0  Sylow p-groups of polynomial permutations on t...   
1  The transterminator ion flow at Venus at solar...   
2  The modelling of the toughening of epoxy polym...   
3  Flow structure and near-field dispersion in ar...   
4  A memory access model for highly-threaded many...   
5  Investigating the feasibility of scale up and ...   
6  Phosphorus levels in croplands of the European...   
7  Chirality delivery through multiple and helica...   
8  The Sleipner storage site: Capillary flow mode...   
9  Combined analysis of sMRI and fMRI imaging dat...   

                                            abstract  \
0  Abstract We enumerate and describe the Sylow p...   
1  Abstract The transterminator ion flow in the V...   
2  Abstract Silica nanoparticles possessing three...   
3  Abstract Dispersion in the near-field region o...   
4  Abstract A number of highly-threaded, many-cor...   
5  Abstract The transfer of a laboratory proces