## Importing the necessary model and the tokeniser from the pretrained saved models

In [9]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BartForConditionalGeneration, BartTokenizer,MBartForConditionalGeneration, MBart50TokenizerFast
import torch

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading the Roberta model for classification
roberta_model_name = "roberta_model"
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_model_name)
roberta_model_name2 = "roberta_model2"
roberta_model2 = RobertaForSequenceClassification.from_pretrained(roberta_model_name2)


# Loading the fine-tuned BART model for summarization
bart_model_name = "fine_tuned_bart_model_eng_to_engSummary"
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

# Loading the mBART model for translation to indic languages
translation_model_name = "mbart_eng_to_hi_bn_ta"
translation_model = MBartForConditionalGeneration.from_pretrained(translation_model_name)
translator_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Defining function which classifies the sentence as "bad" or "Neutral" using Roberta Classifer


In [10]:
def classify_sentence(sentence):
    inputs = roberta_tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True)
    outputs = roberta_model(**inputs)
    logits = outputs.logits
    probabilities = logits.softmax(dim=1)
    label_id = torch.argmax(probabilities, dim=1).item()
    label = "bad" if label_id == 0 else "neutral"
    return label


def get_bad_sentence_confidence(sentence):
    inputs = roberta_tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    
    # Move inputs to the appropriate device (GPU if available)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    with torch.no_grad():
        outputs = roberta_model2(**inputs)
    logits = outputs.logits
    
    probabilities = torch.softmax(logits, dim=1)
    bad_probability = probabilities[:, 0].item()
    
    return bad_probability

# N-gram approach for risk level scoring


In [11]:
df = pd.read_csv("TOSDR_labeled_with_summaries.csv")
text_data = df['Text'].tolist()

tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

feature_names = tfidf_vectorizer.get_feature_names_out()
top_ngrams_indices = tfidf_matrix.sum(axis=0).argsort()[0, -50:]


top_ngrams_indices_array = np.array(top_ngrams_indices)


tfidf_scores = tfidf_matrix.sum(axis=0)
top_ngrams_tfidf_scores = tfidf_scores[0, top_ngrams_indices]
sum_of_weights = top_ngrams_tfidf_scores.sum()



temp = 0
lst_features_names = feature_names.tolist()
# print(top_ngrams_indices_array[0])
top_feature_names = []
for i in top_ngrams_indices_array[0]:
    top_feature_names.append(lst_features_names[i])



def count_and_sum_matching_ngrams(input_sentence):
    input_tokens = input_sentence.split()
    bigram_count = trigram_count = 0
    total_weight = 0

    for i in range(len(input_tokens)):
        if i < len(input_tokens) - 1:
            bigram = ' '.join([input_tokens[i], input_tokens[i + 1]])
        if i < len(input_tokens) - 2:
            trigram = ' '.join([bigram, input_tokens[i + 2]])


        if i < len(input_tokens) - 1 and bigram in top_feature_names:
            bigram_count += 1
            total_weight += tfidf_matrix[:, tfidf_vectorizer.vocabulary_[bigram]].sum()
        if i < len(input_tokens) - 2 and trigram in top_feature_names:
            trigram_count += 1
            total_weight += tfidf_matrix[:, tfidf_vectorizer.vocabulary_[trigram]].sum()

    return total_weight

# Defining functions which generates english summaries for each Terms and Service using Sequence-to-Sequence model BART


In [12]:
def generate_summary(text, max_length=100):
    inputs = bart_tokenizer(text, max_length=max_length, return_tensors="pt", truncation=True)
    input_ids = inputs.input_ids.to(device)

    summary_ids = bart_model.generate(input_ids, max_length=150, min_length=10, length_penalty=2.0, num_beams=5, early_stopping=True)
    
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    
    return summary

def summarize_sentences(sentences):
    bad_sentences = [sentence for sentence in sentences if classify_sentence(sentence) == "bad"]
    if not bad_sentences:
        print("No Suspicious TOS found")
        return [], []
    summaries = []
    risk_level = []
    for bad_sentence in bad_sentences:
        risk_level.append(get_bad_sentence_confidence(bad_sentence)*0.75+ ((count_and_sum_matching_ngrams(bad_sentence))/sum_of_weights)*0.25)
    for bad_sentence in bad_sentences:
        summaries.append(generate_summary(bad_sentence))
    return summaries, risk_level



# Defining function which generates translation in Indic languages, Hindi, Bengali, Tamil, using facebook/mbart-large-50-many-to-many-mmt given input is summarised Terms of Service in english


In [13]:
def translate_text(to_summarise):
    resultant_summarisations = []
    for english_text in to_summarise:
        translator_tokenizer.src_lang = "en_XX"
        encoded_en = translator_tokenizer(english_text, return_tensors="pt")
        generated_tokens1 = translation_model.generate(
            **encoded_en,
            forced_bos_token_id=translator_tokenizer.lang_code_to_id["hi_IN"]
        )
        hindi_translation = translator_tokenizer.batch_decode(generated_tokens1, skip_special_tokens=True)

        generated_tokens2 = translation_model.generate(
            **encoded_en,
            forced_bos_token_id=translator_tokenizer.lang_code_to_id["bn_IN"]
        )
        bengali_translation = translator_tokenizer.batch_decode(generated_tokens2, skip_special_tokens=True)

        generated_tokens3 = translation_model.generate(
            **encoded_en,
            forced_bos_token_id=translator_tokenizer.lang_code_to_id["ta_IN"]
        )
        tamil_translation = translator_tokenizer.batch_decode(generated_tokens3, skip_special_tokens=True)

        resultant_summarisations.append([hindi_translation[0],bengali_translation[0],tamil_translation[0]])
    return resultant_summarisations

# Function call to execute the multi-model pipeline

In [14]:
def process_multi_sentence_input(sentences):
    summaries , risk_level = summarize_sentences(sentences)
    return summaries, risk_level


multi_sentence_input = [
    "Please note that if you request the erasure of your personal information: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.For example, if we suspend an Airbnb Account for fraud or safety reasons, we may retain certain information from that Airbnb Account to prevent that Member from opening a new Airbnb Account in the future.We may retain and use your personal information to the extent necessary to comply with our legal obligations.For example, Airbnb and Airbnb Payments may keep some of your information for tax, legal reporting and auditing obligations.",
    "When signing up for and using our service, we will collect the following information about you:Information that you provide when creating an Account to become a User.This information includes, but is not limited to:Information, such as your name and address, that uniquely identifies you as a natural or legal person",
    "We have the right to monitor, terminate, suspend, or delete any User Account at any time for any reason, or no reason.It is our policy not to comment on any reasons for termination and we have no obligation to provide you with a reason for termination.",
    "Our computer systems and third party hosting provider systems are currently based in the United States and may be located in other countries, so your personal data will be processed by us in the U.S.and other countries where data protection and privacy regulations may not offer the same level of protection as in other parts of the world, such as the European Union.If you create a user account with the Site as a visitor from outside the United States, by using the Site you agree to this Privacy Policy and you consent to the transfer of all such information to the United States, which may not offer an equivalent level of protection of that required in the European Union or certain other countries, and to the processing of that information as described in this Privacy Policy.",
    "We may suspend or terminate your rights to use the Parsec Properties (including your Account) at any time for any reason at our sole discretion, including for any use of the Parsec Properties in violation of these Terms.Upon termination of your rights under these Terms, your Account and right to access and use the Parsec Properties will terminate immediately.",
    "You agree that you will not remove obscure or alter any proprietary rights notices including copyright and trademark notices that may be affixed to or contained within the sdk.",
    "Please note that if you request the erasure of your personal information: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.For example, if we suspend an Airbnb Account for fraud or safety reasons, we may retain certain information from that Airbnb Account to prevent that Member from opening a new Airbnb Account in the future.We may retain and use your personal information to the extent necessary to comply with our legal obligations.For example, Airbnb and Airbnb Payments may keep some of your information for tax, legal reporting and auditing obligations."
    ]


print("Input TOS \n\n")
for i,sentence in enumerate(multi_sentence_input):
    print(i+1,": ",sentence,"\n")

Input TOS 


1 :  Please note that if you request the erasure of your personal information: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.For example, if we suspend an Airbnb Account for fraud or safety reasons, we may retain certain information from that Airbnb Account to prevent that Member from opening a new Airbnb Account in the future.We may retain and use your personal information to the extent necessary to comply with our legal obligations.For example, Airbnb and Airbnb Payments may keep some of your information for tax, legal reporting and auditing obligations. 

2 :  When signing up for and using our service, we will collect the following information about you:Information that you provide when creating an Account to become a User.This information includes, but is not limited to:Information, such as your name and address, that uniquely identifies you as a natural or

# Generated English summaries of suspicious Terms of Serives

In [15]:
output_summaries, risk_level = process_multi_sentence_input(multi_sentence_input)

print("Summarised TOS in English \n\n")
for i, (summarised_tos, risk) in enumerate(zip(output_summaries, risk_level)):
    print(f"{i+1}: {summarised_tos}")
    print(f"   Risk level: {risk}\n")


Summarised TOS in English 


1: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.
   Risk level: 0.2869627187799826

2: When signing up for and using our service, we collect the following information about you:
   Risk level: 0.23361974289011686

3:  can terminate, terminate, suspend, or delete any user account at any time for any reason, or no reason, with or without notice.
   Risk level: 0.3640334959475047

4: If you create a user account with the Site as a visitor from outside the United States, by using the Site you agree to this Privacy Policy and the Site's services, you give up your right to participate in class action lawsuits.
   Risk level: 0.3249689004556118

5: We may suspend or terminate your rights to use the Parsec Properties (including your Account) at any time for any reason at our sole discretion, including for violations of these Terms.
   Risk level: 0.379

# Generated suspicious Terms of Service summaries in Indic Languages

In [16]:
translations = translate_text(output_summaries)


for i,translation in enumerate(translations):
    print("Translation ",i+1,":")
    print("Hindi:", translation[0])
    print("Bengali:", translation[1])
    print("Tamil:", translation[2])
    print("\n")

Translation  1 :
Hindi: हम आप के कुछ व्यक्तिगत सूचनाओं को अपने वैध व्यापारिक हितों के लिए, जैसे धोखाधड़ी की पहचान और रोकथाम और सुरक्षा बढ़ाने के लिए आवश्यक रूप से सुरक्षित रख सकते हैं।
Bengali: আমরা কিছুক আপনার ব ্ যক ্ তিগত তথ ্ য যদি আমাদের সত ্ যিকার অর ্ থনীতির ক ্ ষেত ্ রে সংরক ্ ষণের দরকার হয়, যেমন চুরির সনাক ্ ত করা ও নিয়ন ্ ত ্ রণ করা এবং নিরাপত ্ তার উন ্ নয়ন ।
Tamil: fraud detection and prevention and enhancing safety போன்ற நமது சட்டபூர்வமான வர்த்தக நலன்களுக்காக தேவைப்படும் வகையில், நாங்கள் உங்களது சில தனிப்பட்ட தகவல்களைப் பாதுகாக்க முடியும்.


Translation  2 :
Hindi: हमारी सेवा के लिए साइन-अप करने और उपयोग करने पर, हम आप के बारे में निम्नलिखित जानकारी इकट्ठा करते हैंः
Bengali: যখন আমরা আমাদের পরিষেবার জন ্ য স ্ বাক ্ স করি এবং ব ্ যবহার করি, আমরা আপনাদের সম ্ পর ্ কে निम्न তথ ্ য সংগ ্ রহ করি:
Tamil: எங்களது சேவையை கையெழுத்திடுவதும், பயன்படுத்துவதும் போது, நாம் உங்களைப் பற்றிய பின்வரும் தகவல்கள் சேகரிக்கிறோம்:


Translation  3 :
Hindi: किसी भी कारण से या बिना किसी सूचना 