In [1]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import transformers

import warnings
warnings.filterwarnings('ignore')

#### Please carefully runs the cell to preprocess the datasets

In [2]:
def set_seed(seed):
    """
    Sets seed for all relevant libraries
    Args:
        seed (int): seed value for all modules
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [3]:
set_seed(10)

## Loading the Pre-trained  Model 

In [4]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel, transformers.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# for other model
model_class, tokenizer_class, pretrained_weights = (transformers.AutoModelForSeq2SeqLM,transformers.AutoTokenizer, "humarin/chatgpt_paraphraser_on_T5_base")

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [17]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

In [18]:
# Function to remove the "_versions" suffix from the filename
def remove_versions_suffix(filename):
    return filename.replace('_versions', '')

In [29]:
# def generate_scenarios(path1, output_directory):

#     os.makedirs(output_directory, exist_ok=True)

#     files1 = os.listdir(path1) 
    
#     csv_files1 = [file for file in files1 if file.endswith('.csv')]
    
#     for file1 in csv_files1:
#         df = pd.read_csv(os.path.join(path1,file1))

#         scenarios_list = df['Scenario'].to_numpy()
#         generated_scenarios = []
#         for scenario in scenarios_list:
#             generated_scenarios += paraphrase(scenario, num_return_sequences=10, max_length=512, num_beams=20, num_beam_groups=20)
        
#         df_generated = pd.DataFrame(generated_scenarios,  columns=['Scenario'])
#         df_generated.to_csv(os.path.join(output_directory,remove_versions_suffix(file1)),index = False)


In [19]:
def generate_scenarios(path1, output_directory, train_size = 32):
    
    os.makedirs(output_directory, exist_ok=True)

    files1 = os.listdir(path1) 
    
    csv_files1 = [file for file in files1 if file.endswith('.csv')]
    
    for file1 in csv_files1:
        df = pd.read_csv(os.path.join(path1,file1))

        scenarios_list = df['Scenario'].to_numpy()
        generated_scenarios = []
        for scenario in scenarios_list[:train_size]:
            generated_scenarios += paraphrase(scenario, num_return_sequences=10, max_length=512, num_beams=20, num_beam_groups=20)
            # generated_scenarios +=[scenario] 
        df_generated = pd.DataFrame(generated_scenarios,  columns=['Scenario'])
        df_generated.to_csv(os.path.join(output_directory,remove_versions_suffix(file1)),index = False)

In [20]:
scenarios_versions_path = "./scenarios_versions_csv_format"

scenarios_generated= "./scenarios_generated/paragraph_wise_2"

test = './experiment/test/'

generate_scenarios(scenarios_versions_path, scenarios_generated, train_size = 28)



In [9]:
df = pd.read_csv("./scenarios_versions_csv_format/scenario1.csv")

In [12]:
df['Scenario'].to_numpy()[:1]

array(['According to satellite data, an enemy ship is detected about 250 miles southwest of Chennai. It needs to be intercepted and interrogated. If the adversary acts aggressively, the use of SSMs may be required. The mission needs to be carried out within 36 hours. The fleet has 18 hours to deploy an appropriate vessel. The vessel should maintain a speed of 22 knots and have an endurance of no less than 9 days (including necessary supplies and fuel).'],
      dtype=object)

In [13]:
paraphrase_paragraph = paraphrase(df['Scenario'].to_numpy()[:1], num_return_sequences=10, max_length=512, num_beams=10, num_beam_groups=10)

In [14]:
paraphrase_paragraph

['According to satellite data, an adversary vessel is detected approximately 250 miles southwest of Chennai and requires interceptions and interrogations. If the enemy acts aggressively, SSMs may be deployed. The mission must be completed within 36 hours, and the fleet has 18 hours to deploy an appropriate vessel with a speed of 22 knots and endurance of at least 9 days (including fuel).',
 'Satellite data indicates that an adversary ship is located approximately 250 miles southwest of Chennai and requires interceptions and interrogations. If the enemy acts aggressively, SSMs may be deployed. The mission must be completed within 36 hours, and the fleet has 18 hours to deploy an appropriate vessel with a speed of 22 knots and endurance of at least 9 days (including fuel).',
 'An enemy vessel has been detected approximately 250 miles southwest of Chennai, according to satellite data. To intercept and question the vessel, SSMs may be necessary if the enemy acts aggressively; the mission s

In [22]:
paragraph = ["The fleet needs to schedule a plan for refit of ships and submarines present in the command in the upcoming annual refit conference. Two operational dry docks are available in the yard, no more than three ships/submarines should come for refit at a time. MoM are to be released with prioritizing ship/submarine within a week.  Repair workshops at the time of ship’s refit need to be fully operational and available with more than 80% manpower strength. Spares required during overhaul of equipment may arrive two months before the commencement of the refit plan."]

In [23]:
paraphrase_paragraph = paraphrase(paragraph, num_return_sequences=10, max_length=512, num_beams=10, num_beam_groups=10)

In [24]:
paraphrase_paragraph 

['The fleet must plan ahead with the upcoming annual refit conference for ships and submarines. This includes having two operational dry docks in the yard, as well as a requirement to minimize the number of ships/submarines arriving at once.',
 'A plan for refit of ships and submarines present in the command should be scheduled by the fleet at the annual refits conference, with two operational dry docks available. The majority of vessels should not arrive simultaneously, and the MoM should prioritize ship/submarine within a week. Repair workshops must be fully operational and available with over 80% manpower strength. Spares required during equipment overhauls may arrive two months before the start of the reffit plan.',
 'It is recommended that the fleet plan for refit of ships and submarines present in the command, which can be done during the annual refits conference. The yard should have two operational dry docks, and no more than three ships/submarines arriving at a time.',
 'To en

In [52]:
paraphrase_paragraph[10:] 

['This is to ensure the ship can be sailed during the annual maintenance period when various types of equipment are being used for "maintenance", as per guidelines set by The Post. To adhere to these new timelines, the quality of work done by workshop must not exceed 10% and the vessel should remain seaworthy within 48 hours of notice (for example, in case of an unplanned accident).',
 'He added: \'The vessel is mandated to sail during the annual maintenance period, with various instruments used for scheduled maintenance; in order that the ship be seaworthy within 48 hours of notice, [the] workshop has been instructed not "expects more than 10%" (invalidity) of work under the new promulgated timelines).',
 'To ensure compliance with the new timelines, all shipwork must take place within 48 hours. The ship is now expected to sail during an annual maintenance period when various equipment is being used to maintain it.',
 'As per the instructions, he has instructed that his ship should be

In [31]:
paraphrase_paragraph2 = paraphrase(paragraph, num_return_sequences=10, max_length=512, num_beams=20, num_beam_groups=20)

In [32]:
paraphrase_paragraph2

['The fleet must plan ahead for the refit of ships and submarines currently in the command, with two operational dry docks available.',
 'To ensure a smooth refit, the fleet must plan ahead for ships and submarines currently in the command, with two operational dry docks available.',
 "The next year's annual refit conference should determine the schedule for recharging ships and submarines in the fleet, with two dry docks available to ship/submarine crews. The MoM will be released within a week prior to the first scheduled reffit, and the repair workshop must be fully operational and available with over 80% manpower strength. Spares needed during equipment overhaul may arrive two months before the refits plan is finalized.",
 'Refreshment for ships and submarines currently in command should be scheduled by fleet during annual refit conference, with two operational dry docks available in the yard and up to three ship/submarine simultaneously.',
 'According to the upcoming annual refit c

## Metric to get the best generated 
- cosine similarity
- Jaccard similarity
- levenshtein_similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



# Calculate embedding using Distiil_BERT Model
embeddings = [1] # changr this with BERT embedding

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

print("Cosine Similarity Matrix:")
print(similarity_matrix)

In [37]:
from nltk.tokenize import word_tokenize
import nltk

# Ensure you have the required NLTK data files
nltk.download('punkt')

# Function to calculate Jaccard similarity
def jaccard_similarity(paragraph1, paragraph2):
    tokens1 = set(word_tokenize(paragraph1.lower()))
    tokens2 = set(word_tokenize(paragraph2.lower()))
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    similarity = len(intersection) / len(union)
    return similarity

# Calculate Jaccard similarity
similarity1 = jaccard_similarity(paragraph[0], paraphrase_paragraph[0])


print(f"Jaccard similarity between Paragraph 1 and 2: {similarity1}")

Jaccard similarity between Paragraph 1 and 2: 0.2753623188405797


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/balbirsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance
import nltk

nltk.download('punkt')

# Function to calculate word-by-word Levenshtein distance
def levenshtein_similarity(paragraph1, paragraph2):
    tokens1 = word_tokenize(paragraph1.lower())
    tokens2 = word_tokenize(paragraph2.lower())
    return edit_distance(tokens1, tokens2)

distance1 = levenshtein_similarity(paragraph[0], paraphrase_paragraph[0])

print(f"Levenshtein distance between Paragraph 1 and 2: {distance1}")

Levenshtein distance between Paragraph 1 and 2: 83


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/balbirsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
