In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
from datasets import load_dataset
import pandas as pd

# Load the acronym_identification dataset
dataset = load_dataset("acronym_identification")

# Convert the train, validation, and test datasets to pandas DataFrames
train_df = pd.DataFrame(dataset['train'])
validation_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

# Add a column to identify the split
train_df['split'] = 'train'
validation_df['split'] = 'validation'


# Concatenate the DataFrames
combined_df = pd.concat([train_df, validation_df], ignore_index=True)

# Save the combined DataFrame as a CSV file
combined_df.to_csv('combined.csv', index=False)

print("Combined CSV file has been saved successfully!")


Downloading builder script:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/167k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/149k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14006 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1717 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1750 [00:00<?, ? examples/s]

Combined CSV file has been saved successfully!


In [15]:
data = combined_df

# Drop the 'split' column
data = data.drop('split', axis=1)

# Display first few rows of the data
data.head()

Unnamed: 0,id,tokens,labels
0,TR-0,"[What, is, here, called, controlled, natural, ...","[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, ..."
1,TR-1,"[In, this, paper, ,, we, introduce, CoQA, ,, a...","[4, 4, 4, 4, 4, 4, 1, 4, 4, 0, 2, 2, 4, 4, 4, ..."
2,TR-2,"[in, :, Proceedings, of, SODA, 98, ,, Ninth, A...","[4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, ..."
3,TR-3,"[But, is, the, existing, HPC, I, /, O, stack, ...","[4, 4, 4, 4, 1, 3, 3, 3, 4, 4, 4, 4, 4, 1, 4, ..."
4,TR-4,"[For, a, fixed, and, known, ,, we, apply, a, s...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 2, 4, 1, 4, ..."


In [16]:
def extract_acronyms_v2(tokens, labels):
    """
    Extract acronyms and their expansions based on labels and rule provided.

    Parameters:
    - tokens: list of tokenized words in a sentence
    - labels: list of labels corresponding to tokens

    Returns:
    - acronyms_with_expansions: list of strings containing "acronym: expansion"
    - is_followed_by_acronym_list: list containing 1 if acronym is followed by expansion else 0
    """
    acronyms_with_expansions = []
    is_followed_by_acronym_list = []
    expansion = []
    acronym = None

    for i in range(len(tokens)):
        if labels[i] == 0:  # Beginning of a full form
            expansion.append(tokens[i])
        elif labels[i] == 2:  # Continuing part of full form
            expansion.append(tokens[i])
        elif labels[i] == 1:  # Acronym
            acronym = tokens[i]
            if expansion:  # If there is an expansion before the acronym
                acronyms_with_expansions.append(f"{acronym}: {' '.join(expansion)}")
                is_followed_by_acronym_list.append(1)
                expansion = []  # reset expansion
                acronym = None  # reset acronym
            else:  # No expansion before acronym
                acronyms_with_expansions.append(acronym)
                is_followed_by_acronym_list.append(0)
                acronym = None  # reset acronym
        elif labels[i] == 3:  # Acronym without full form
            acronym = tokens[i]
            acronyms_with_expansions.append(acronym)
            is_followed_by_acronym_list.append(0)
            acronym = None  # reset acronym

    return acronyms_with_expansions, is_followed_by_acronym_list


# Apply the modified extraction function to the dataset without re-evaluating 'tokens' and 'labels' columns
data['acronyms_after_expansions'], data['is_followed_by_acronym'] = zip(*data.apply(lambda row: extract_acronyms_v2(row['tokens'], row['labels']), axis=1))

data.head()


Unnamed: 0,id,tokens,labels,acronyms_after_expansions,is_followed_by_acronym
0,TR-0,"[What, is, here, called, controlled, natural, ...","[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, ...",[CNL: controlled natural language],[1]
1,TR-1,"[In, this, paper, ,, we, introduce, CoQA, ,, a...","[4, 4, 4, 4, 4, 4, 1, 4, 4, 0, 2, 2, 4, 4, 4, ...",[CoQA],[0]
2,TR-2,"[in, :, Proceedings, of, SODA, 98, ,, Ninth, A...","[4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, ...","[SODA, ACM, SIAM, CA, USA]","[0, 0, 0, 0, 0]"
3,TR-3,"[But, is, the, existing, HPC, I, /, O, stack, ...","[4, 4, 4, 4, 1, 3, 3, 3, 4, 4, 4, 4, 4, 1, 4, ...","[HPC, I, /, O, NVM]","[0, 0, 0, 0, 0]"
4,TR-4,"[For, a, fixed, and, known, ,, we, apply, a, s...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 2, 4, 1, 4, ...",[SCA: successive convex approximation],[1]


In [18]:
def extract_acronyms_and_expansions_v2(row):
    tokens = row['tokens']
    labels = row['labels']

    i = 0
    acronyms_with_expansions = []
    is_expansion_followed = 0

    while i < len(tokens):
        if labels[i] == 1:  # Beginning of an acronym
            acronym = tokens[i]
            expansion_found = False

            # Look for possible expansion after the acronym
            for j in range(i+1, len(tokens)):
                if labels[j] == 0:  # Beginning of an expansion
                    expansion = []
                    while j < len(tokens) and (labels[j] == 0 or labels[j] == 2):  # Extract the full expansion
                        expansion.append(tokens[j])
                        j += 1
                    acronyms_with_expansions.append((acronym, ' '.join(expansion)))
                    is_expansion_followed = 1
                    expansion_found = True
                    break

            if not expansion_found:  # If no expansion is found for the acronym
                acronyms_with_expansions.append(acronym)

            i += 1
        elif labels[i] == 3:  # Acronym without expansion
            acronyms_with_expansions.append(tokens[i])
            i += 1
        else:
            i += 1

    return acronyms_with_expansions, is_expansion_followed

# Apply the updated function to extract acronyms and expansions and update the dataframe
data['acronyms_with_expansions'], data['is_expansion_followed'] = zip(*data.apply(extract_acronyms_and_expansions_v2, axis=1))

# Re-define the formatting function
def format_acronyms_with_expansions(acronyms_with_expansions):
    formatted_list = []
    for item in acronyms_with_expansions:
        if isinstance(item, tuple):  # if the item has an expansion
            formatted_list.append(f"{item[0]}: {item[1]}")
        else:  # if the item is just an acronym without expansion
            formatted_list.append(item)
    return formatted_list

# Apply the formatting function to the acronyms_with_expansions column
data['acronyms_with_expansions'] = data['acronyms_with_expansions'].apply(format_acronyms_with_expansions)

# Display the first few rows of the updated dataframe
data.head()


Unnamed: 0,id,tokens,labels,acronyms_after_expansions,is_followed_by_acronym,acronyms_with_expansions,is_expansion_followed
0,TR-0,"[What, is, here, called, controlled, natural, ...","[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, ...",[CNL: controlled natural language],[1],[CNL],0
1,TR-1,"[In, this, paper, ,, we, introduce, CoQA, ,, a...","[4, 4, 4, 4, 4, 4, 1, 4, 4, 0, 2, 2, 4, 4, 4, ...",[CoQA],[0],[CoQA: Conversational Question Answering],1
2,TR-2,"[in, :, Proceedings, of, SODA, 98, ,, Ninth, A...","[4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, ...","[SODA, ACM, SIAM, CA, USA]","[0, 0, 0, 0, 0]","[SODA, ACM, SIAM, CA, USA]",0
3,TR-3,"[But, is, the, existing, HPC, I, /, O, stack, ...","[4, 4, 4, 4, 1, 3, 3, 3, 4, 4, 4, 4, 4, 1, 4, ...","[HPC, I, /, O, NVM]","[0, 0, 0, 0, 0]","[HPC, I, /, O, NVM]",0
4,TR-4,"[For, a, fixed, and, known, ,, we, apply, a, s...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 2, 4, 1, 4, ...",[SCA: successive convex approximation],[1],[SCA],0


In [10]:
data.head()

Unnamed: 0,id,tokens,labels,acronyms_with_expansions,is_followed_by_acronym
0,TR-0,"[What, is, here, called, controlled, natural, ...","[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, ...",[CNL: controlled natural language],[1]
1,TR-1,"[In, this, paper, ,, we, introduce, CoQA, ,, a...","[4, 4, 4, 4, 4, 4, 1, 4, 4, 0, 2, 2, 4, 4, 4, ...",[CoQA],[0]
2,TR-2,"[in, :, Proceedings, of, SODA, 98, ,, Ninth, A...","[4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, ...","[SODA, ACM, SIAM, CA, USA]","[0, 0, 0, 0, 0]"
3,TR-3,"[But, is, the, existing, HPC, I, /, O, stack, ...","[4, 4, 4, 4, 1, 3, 3, 3, 4, 4, 4, 4, 4, 1, 4, ...","[HPC, I, /, O, NVM]","[0, 0, 0, 0, 0]"
4,TR-4,"[For, a, fixed, and, known, ,, we, apply, a, s...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 2, 4, 1, 4, ...",[SCA: successive convex approximation],[1]


In [8]:
data.to_csv('expansion_acronym.csv', index=False)

In [9]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/combined_acronyms.csv')

# Display the first few rows of the dataset to understand its structure
df.head()


FileNotFoundError: ignored

In [None]:
import ast

def detokenize(token_list):
    """
    Convert a list of tokens into a normal sentence.
    """
    # Convert string representation of list to actual list
    tokens = ast.literal_eval(token_list)

    # Handle punctuation and special cases to join tokens
    sentence = ''.join([' ' + i if not i.startswith(("'", '"', "(", "[", "{", "“", "-", "’")) and i not in [",", ".", ":", ";", "!", "?", "]", "}", ")", "”", "’", "“"] else i for i in tokens]).strip()
    return sentence

# Apply detokenize function to the 'tokens' column
df['tokens'] = df['tokens'].apply(detokenize)

# Display the first few rows with detokenized sentences
df.head()


In [None]:
# combning 10 sentenceses

import re

def extract_acronyms(acronym_str):
    """Extract acronyms from a string formatted as a list."""
    # Using regular expressions to extract all elements within single quotes
    return re.findall(r"'(.*?)'", acronym_str)

def combine_rows_alternative(data):
    combined_data = []
    for i in range(0, len(data), 10):
        combined_text = ' '.join(data['tokens'][i:i+10])
        combined_acronyms = []
        for acronyms in data['acronyms_with_expansions'][i:i+10]:
            combined_acronyms.extend(extract_acronyms(acronyms))
        combined_data.append([combined_text, combined_acronyms])
    return pd.DataFrame(combined_data, columns=['Combined Tokens', 'Combined Acronyms'])

# Apply the alternative function to the dataset
combined_data_alternative = combine_rows_alternative(df)

combined_data_alternative.head()


In [None]:
combined_data_alternative.to_csv('falcon_full_data1.csv', index=False)

In [None]:
# Add the specified text to the beginning of each sentence in the "Combined Tokens" column
prefix_text = "Find both the full acronym with its expansion and, if unavailable, the acronym itself in the following sentence, The recognition should be case-insensitive: "
combined_data_alternative['Combined Tokens'] = prefix_text + '"' + combined_data_alternative['Combined Tokens'] + '"'

# Display the updated dataset
combined_data_alternative.head()


In [None]:
combined_data_alternative.to_csv('falcon_full_data2.csv', index=False)