In [None]:
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
val = pd.read_csv("valid.csv")

In [None]:
import nltk
nltk.download("wordnet")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Lexical Substitution using NLTK:

In [None]:
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag

def simplify_text_lexical(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)

    simplified_tokens = []
    for token, pos in tagged_tokens:
        if pos.startswith('NN') or pos.startswith('VB') or pos.startswith('JJ'):
            # Get the first synonym from WordNet
            synonyms = wordnet.synsets(token)
            if synonyms:
                simplified_tokens.append(synonyms[0].lemmas()[0].name())
            else:
                simplified_tokens.append(token)
        else:
            simplified_tokens.append(token)

    simplified_text = ' '.join(simplified_tokens)
    return simplified_text


test['predicted'] = test['0'].apply( lambda x : simplify_text_lexical(x))

## Sentence Splitting:

In [None]:
import re

def split_sentences(text):
    # Use a simple regex to split sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return sentences


test['predicted'] = test['predicted'].apply( lambda x : split_sentences(x)  )

In [None]:
for i in range(0,len(test)) :
  t = test.iloc[i,-1]
  temp = ""
  for j in t :
    temp = temp + j + ". "
  test.iloc[i,-1] = temp

## Paraphrasing:

In [None]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import random

def paraphrase_sentence(sentence):
    tokens = word_tokenize(sentence)
    tagged_tokens = pos_tag(tokens)

    lemmatizer = WordNetLemmatizer()
    paraphrased_tokens = []

    for token, pos in tagged_tokens:
        if pos.startswith('NN') or pos.startswith('VB') or pos.startswith('JJ'):
            synonyms = wordnet.synsets(token)
            if synonyms:
                # Choose a random synonym
                paraphrased_tokens.append(synonyms[0].lemmas()[0].name())
            else:
                paraphrased_tokens.append(lemmatizer.lemmatize(token))
        else:
            paraphrased_tokens.append(token)

    paraphrased_sentence = ' '.join(paraphrased_tokens)
    return paraphrased_sentence



test['predicted'] = test['predicted'].apply( lambda x : paraphrase_sentence(x)  )

## Removing Redundancy:

In [None]:
def remove_redundancy(text):
    # Remove repeated words
    words = text.split()
    unique_words = list(set(words))
    simplified_text = ' '.join(unique_words)
    return simplified_text

# Example usage:
# original_text = "The cat cat jumped over the lazy lazy dog."
# simplified_text = remove_redundancy(original_text)
# print(simplified_text)

test['predicted'] = test['predicted'].apply( lambda x : remove_redundancy(x)  )

## Active Voice:

In [None]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def convert_to_active_voice(sentence):
    tokens = word_tokenize(sentence)
    tagged_tokens = pos_tag(tokens)

    lemmatizer = WordNetLemmatizer()
    active_tokens = []

    for token, pos in tagged_tokens:
        if pos.startswith('V'):
            # Convert to base form (lemma) for verbs
            active_tokens.append(lemmatizer.lemmatize(token, pos='v'))
        else:
            active_tokens.append(token)

    active_sentence = ' '.join(active_tokens)
    return active_sentence



test['predicted'] = test['predicted'].apply( lambda x : convert_to_active_voice(x)  )

## Sentence Simplification:

In [None]:
from nltk import sent_tokenize

def simplify_sentence_structure(text):
    sentences = sent_tokenize(text)
    simplified_sentences = []

    for sentence in sentences:
        # Simplify sentence structure (e.g., remove subordinate clauses)
        simplified_sentences.append(sentence)

    simplified_text = ' '.join(simplified_sentences)
    return simplified_text



test['predicted'] = test['predicted'].apply( lambda x : simplify_sentence_structure(x)  )

## results

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0-

In [None]:
test.head(2)

Unnamed: 0.1,Unnamed: 0,0,1,predicted
0,0,One side of the armed conflicts is composed ma...,One side of the armed conflicts is made of Sud...,"militia One Abbala beryllium a and military , ..."
1,1,"Jeddah is the principal gateway to Mecca , Isl...","Jeddah is the main gateway to Mecca, Islam's h...","Muslim be Islam beryllium able to which , thei..."


In [None]:
import evaluate
predictions = test.iloc[:,3]
references = test.iloc[:,2]
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.012491474773396492, 'precisions': [0.5353764661369655, 0.024966974900924702, 0.0024961863819165163, 0.0007297139521307647], 'brevity_penalty': 1.0, 'length_ratio': 1.0253459200827622, 'translation_length': 7929, 'reference_length': 7733}


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=25fd6fd8839110e2b73ec35365a67ec18d5aa548c36b7071bf417e4133118bfc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
predictions = test.iloc[:,3]
rouge = evaluate.load('rouge')
references = test.iloc[:,2]
results = rouge.compute(predictions=predictions,references=references)
print(results)

{'rouge1': 0.6038831496822792, 'rouge2': 0.04144462363848284, 'rougeL': 0.27670112041377415, 'rougeLsum': 0.27637069613107146}


In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
from evaluate import load
sari = evaluate.load("sari")
sources= test.iloc[:,1]
predictions= test.iloc[:,3]
references= [[i] for i in test.iloc[:,2] ]
sari_score = sari.compute(sources=sources, predictions=predictions, references=references)
sari_score

{'sari': 20.077839123915567}