In [1]:
import spacy

# Load SpaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Input sentence
sentence = "The professor, who was extremely knowledgeable, provided a thorough lecture on text simplification."

# Process the sentence
doc = nlp(sentence)

# Print tokens, parts of speech, and dependencies
for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Dependency: {token.dep_}")

Token: The, POS: DET, Dependency: det
Token: professor, POS: NOUN, Dependency: nsubj
Token: ,, POS: PUNCT, Dependency: punct
Token: who, POS: PRON, Dependency: nsubj
Token: was, POS: AUX, Dependency: relcl
Token: extremely, POS: ADV, Dependency: advmod
Token: knowledgeable, POS: ADJ, Dependency: acomp
Token: ,, POS: PUNCT, Dependency: punct
Token: provided, POS: VERB, Dependency: ROOT
Token: a, POS: DET, Dependency: det
Token: thorough, POS: ADJ, Dependency: amod
Token: lecture, POS: NOUN, Dependency: dobj
Token: on, POS: ADP, Dependency: prep
Token: text, POS: NOUN, Dependency: compound
Token: simplification, POS: NOUN, Dependency: pobj
Token: ., POS: PUNCT, Dependency: punct


In [12]:
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
# Function to get a simpler synonym using WordNet
def get_simpler_synonym(word):
    synonyms = wordnet.synsets(word)
    if not synonyms:
        return word  # Return original word if no synonyms found
    
    # Choose a synonym with a high frequency (more common words are generally simpler)
    for syn in synonyms:
        for lemma in syn.lemmas():
            if lemma.name() != word:  # Avoid identical synonyms
                return lemma.name().replace("_", " ")
    
    return word

# Test with a sample sentence
parg = "One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes. The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals. Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs. MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome. Over 50 of known microRNAs are located within introns of coding genes. Given that currently about half of the genes in mouse have been knocked out, we investigated the possibility that intronic microRNAs may have been coincidentally deleted or disrupted in some of these mouse models. We searched published murine knockout studies and gene trap embryonic stem cell line databases for cases where a microRNA was located within or near the manipulated genomic loci, finding almost 200 cases where microRNA expression may have been disrupted along with another gene. Our results draw attention to the need for careful planning in future knockout studies to minimize the unintentional disruption of microRNAs. These data also raise the possibility that many knockout studies may need to be reexamined to determine if loss of a microRNA contributes to the phenotypic consequences attributed to loss of a proteinencoding gene. In the mouse, stable disruption of a gene is typically accomplished using gene trap mutagenesis or targeted homologous recombination. We wish to communicate the overlooked possibility of unintentionally disrupting microRNA (miRNA) genes along with a targeted gene. Because miRNAs play key roles in many cellular processes, the unintended ablation of these species may have significant consequences that complicate the interpretation of gene knockout studies. Given that many miRNAs are located within introns of longer coding transcripts, we reasoned that a gene trap disrupting a host gene could also alter miRNA expression in one of two ways. The trapping cassette could either ablate miRNA expression with a terminal polyadenylation sequence or overexpress an miRNA via an internal promoter. To determine the potential extent of these unintended changes in miRNA expression, we compared the genomic position all mouse gene traps listed in the International Gene Trap Consortium. The boundaries of the deleted loci were bioinformatically verified for each study. Our analysis of the IGTC database revealed 98 annotated or candidate miRNAs potentially misregulated in 420 gene trap cell lines , there were also numerous studies describing the deletion of regions immediately upstream , or in the promoter of the host gene (4 cases). MiRNAs have been shown to be transcribed in conjunction with a host transcript or from an independent promoter. Therefore, the disruption of host promoters or of regions adjacent to miRNAs may compromise promoter andor enhancer sites for these miRNAs. While 71 of the studies in our analysis were published prior to the expansion of the miRNA field in 2002, the fact that 90 were published since may indicate that miRNAs in targeted loci continue to be overlooked. To avoid inadvertent doubleknockout scenarios, we wish to alert investigators to consider noncoding elements in the locus to be deleted. Because not all noncoding elements have been annotated, it may be preferable to employ methods that minimize the deletion of endogenous DNA. We also wish to raise the interesting possibility that a number of studies may need to be reevaluated to dissociate the consequences of ablating an miRNA from the consequences of ablating the targeted gene."
complex_sentences = sent_tokenize(parg)

# Simplify each word in each sentence
simplified_sentence = []
for sentence in complex_sentences:
    for word in word_tokenize(sentence):
        simplified_word = get_simpler_synonym(word)
        simplified_sentence.append(simplified_word)

print("Original sentence:", parg)
print("Simplified sentence:", " ".join(simplified_sentence))
print(len(parg))
print(len(" ".join(simplified_sentence)))

Original sentence: One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes. The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals. Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs. MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome. Over 50 of known microRNAs are located within introns of coding genes. Given that currently about half of the genes in mouse have been knocked out, we investigated the possibility that intronic microRNAs may have been coincidentally deleted or disrupted in so

In [16]:
import re

# Function to split a complex sentence into simpler sentences
def split_complex_sentence(sentence):
    # Split based on commas or conjunctions (simple rule-based approach)
    parts = re.split(r',| and | but | which | who ', sentence)
    return [part.strip() + "." for part in parts if part]

# Test with a complex sentence
complex_sentence = "The professor, who was extremely knowledgeable, provided a thorough lecture on text simplification, which was very helpful."

simpler_sentences = split_complex_sentence(parg)
print("Simpler Sentences:")
for sent in simpler_sentences:
    print(sent)

Simpler Sentences:
One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes. The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi.
Evans.
Smithies for their pioneering work in targeted recombination mutagenesis in mammals. Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs. MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome. Over 50 of known microRNAs are located within introns of coding genes. Given that currently about half of the genes in mouse have been knocked out.
we investigated the possibility that intronic microRNAs may have been coincidentally deleted or disrupted in some o

In [19]:
import textstat

# Simplified sentence
simplified_text = "The professor was knowledgeable. He gave a lecture on text simplification."

# Calculate Flesch-Kincaid readability score
readability_score = textstat.flesch_kincaid_grade(simplified_text)
print(f"Readability score (Flesch-Kincaid grade level): {readability_score}")


Readability score (Flesch-Kincaid grade level): 7.8


In [24]:
from transformers import pipeline
from nltk.corpus import wordnet

# Load a pre-trained fill-mask model (e.g., BERT)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Function to generate simpler synonyms using BERT and WordNet
def generate_simplifications(sentence, target_word):
    # Replace the target word with BERT's mask token
    masked_sentence = sentence.replace(target_word, "[MASK]")
    
    # Get predictions for the masked word
    predictions = fill_mask(masked_sentence)
    
    # Filter predictions to ensure they're simpler synonyms
    simpler_words = []
    for pred in predictions:
        predicted_word = pred["token_str"]
        # Check if the predicted word is simpler using WordNet frequency
        if wordnet.synsets(predicted_word) and len(predicted_word) < len(target_word):
            simpler_words.append(predicted_word)
    
    return simpler_words

# Example sentence and target word
sentence = "The professor delivered an elaborate presentation on the subject."
target_word = "professor"

# Get simplifications
simplifications = generate_simplifications(sentence, target_word)
print(f"Original word: {target_word}")
print(f"Suggested simplifications: {simplifications}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Original word: professor
Suggested simplifications: ['king', 'governor', 'speaker', 'queen']


In [39]:
import spacy
import textstat
import nltk
from nltk.corpus import wordnet
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from wordfreq import word_frequency

def is_complex_word(word, threshold=0.0001):
    """
    Determines if a word is complex based on its frequency.
    
    Args:
        word (str): The word to check.
        threshold (float): Minimum frequency threshold. Lower = rarer = complex.
    
    Returns:
        bool: True if the word is complex, False otherwise.
    """
    word = word.lower()  # Ensure case-insensitivity
    frequency = word_frequency(word, 'en')  # Get word frequency
    return frequency < threshold  # Complex if frequency is below threshold

# Load NLP and similarity models
nlp = spacy.load("en_core_web_sm")
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Function to calculate semantic similarity
def semantic_similarity(original, simplified):
    embeddings = sentence_model.encode([original, simplified], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return similarity.item()

# Function to simplify words while preserving meaning
def simplify_word(word, context):
    masked_sentence = context.replace(word, "[MASK]")
    predictions = fill_mask(masked_sentence)
    
    
    try:
        # Get predictions for the masked word
        predictions = fill_mask(masked_sentence)
        
        # Ensure predictions are in the expected list of dictionaries format
        if not isinstance(predictions, list):
            return word  # Return original word if predictions are not as expected
        
        # Iterate through predictions to find simpler synonyms
        for pred in predictions:
            simplified_word = pred["token_str"]
            if wordnet.synsets(simplified_word) and len(simplified_word) < len(word):
                # Check semantic similarity to preserve meaning
                similarity = semantic_similarity(context, context.replace(word, simplified_word))
                if similarity > 0.9:  # Threshold for meaning preservation
                    return simplified_word
    except Exception as e:
        print(f"Error processing word '{word}' in context: {e}")
        
    return word

# Function to split sentences
def split_sentence(sentence):
    doc = nlp(sentence)
    return [sent.text for sent in doc.sents]

# Function to simplify an article
def simplify_article(article):
    simplified_article = []
    doc = nlp(article)
    
    for sentence in doc.sents:
        print(sentence)
        words = sentence.text.split()
        simplified_words = [simplify_word(word, sentence.text) if is_complex_word(word) else word for word in words]
        simplified_sentence = " ".join(simplified_words)
        simplified_sentences = split_sentence(simplified_sentence)
        simplified_article.extend(simplified_sentences)
    
    return " ".join(simplified_article)

# Example article
article = """
The professor delivered an elaborate and detailed lecture on text simplification, 
which was extremely beneficial for the students. They found the topic intriguing 
and intellectually stimulating, although the complexity was challenging.
"""

# Simplify the article
simplified_output = simplify_article(parg)
print("Simplified Article:\n", simplified_output)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes.
Error processing word 'gene' in context: list indices must be integers or slices, not str
Error processing word 'gene' in context: list indices must be integers or slices, not str
Error processing word 'gene' in context: list indices must be integers or slices, not str
The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals.
Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs.
MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome.
Over 50 

In [None]:
from datasets import load_dataset

ds1 = load_dataset("bogdancazan/wikilarge-text-simplification")
ds2 = load_dataset("rahular/simple-wikipedia")
ds3 = load_dataset("dongqi-me/SciNews")


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Load the pre-trained T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Tokenizing function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Tokenize the dataset (Assume 'text' is the field in your dataset)
tokenized_datasets = ds2.map(tokenize_function, batched=True)

# Prepare for training
train_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(1000)])  # Example 1000 samples
val_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(1000, 1200)])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer
)

# Train the model
trainer.train()


In [None]:
def simplify_text(text):
    # Encode input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # Generate simplified text
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], max_length=150, num_beams=4, early_stopping=True)

    # Decode and return the simplified text
    simplified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return simplified_text

# Example usage:
scientific_text = """Scientists have recently discovered the existence of gravitational waves, which are ripples in spacetime...
    These waves are produced by accelerating massive objects such as black holes or neutron stars. The detection of these waves has revolutionized
    our understanding of the universe."""

simplified_text = simplify_text(scientific_text)
print(simplified_text)


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization")
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""
)