# Install libraries

In [None]:
!pip install tensorflow==1.13.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install keras==2.2.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-m166899c
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-m166899c


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data loader

In [None]:
# Maximum length of comment
max_len = 256
# Dimension of embedding vector
embedding_dim = 100
# Max feature
max_feature = 10000

In [None]:
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/NLP File/tsd_train.csv")

In [None]:
from ast import literal_eval

text_data = data['text'].values
spans = data['spans'].apply(literal_eval)
lbl = [1 if len(s) > 0 else 0 for s in spans]

In [None]:
# Token level 

from nltk.tokenize import TweetTokenizer
import numpy as np

tknzr2 = TweetTokenizer()

def custom_tokenizer(text_data):
    """Split one text into tokens using the shared ``TweetTokenizer`` instance ``tknzr2``."""
    tokens = tknzr2.tokenize(text_data)
    return tokens

def retrieve_word_from_span(lst_span, text):
    """Return the substrings of ``text`` covered by runs of consecutive offsets.

    Args:
        lst_span: ascending character offsets into ``text``. Consecutive
            offsets (each one exactly 1 greater than the previous) form one run.
        text: the string the offsets refer to.

    Returns:
        A list with one substring per run, in order of appearance. An empty
        ``lst_span`` yields an empty list.
    """
    if len(lst_span) == 0:
        return []

    word = []
    run_start = previous = lst_span[0]
    for offset in lst_span[1:]:
        if offset != previous + 1:
            # Gap found: close the current run (end offset is inclusive).
            word.append(text[run_start:previous + 1])
            run_start = offset
        previous = offset

    # Always close the final run. The earlier loop-based version skipped it
    # when it consisted of a single offset (including a one-offset input).
    word.append(text[run_start:previous + 1])
    return word

def span_retrived(text_data, spans):
    """Extract the span substrings for every text.

    ``spans[i]`` is passed together with ``text_data[i]`` to
    ``retrieve_word_from_span``; the results are returned as a list with one
    entry per text.
    """
    return [
        retrieve_word_from_span(spans[idx], text_data[idx])
        for idx in range(len(text_data))
    ]

def span_convert(text_data, spans):
    """Build token-level 0/1 labels for every text.

    Args:
        text_data: indexable collection of texts.
        spans: indexable collection where ``spans[i]`` holds the character
            offsets for ``text_data[i]``.

    Returns:
        ``(token_labels, lst_seq)`` where ``token_labels[i]`` is the list of
        span substrings of text ``i`` and ``lst_seq[i]`` is an integer array
        with one entry per token of text ``i`` (1 if the token also appears
        among the tokens of any span substring, else 0).

    Note:
        Matching is by token value, not by position: every occurrence of a
        token that appears in some span substring is labelled 1.
    """
    token_labels = [
        retrieve_word_from_span(spans[i], text_data[i])
        for i in range(len(text_data))
    ]

    lst_seq = []
    for i in range(len(text_data)):
        tokens = custom_tokenizer(text_data[i])

        # Tokenise the span substrings once per text instead of once per
        # token; a set makes the membership test constant-time.
        flagged_tokens = set()
        for fragment in token_labels[i]:
            flagged_tokens.update(custom_tokenizer(fragment))

        seq = np.array([1 if tok in flagged_tokens else 0 for tok in tokens], dtype=int)
        lst_seq.append(seq)

    return (token_labels, lst_seq)

In [None]:
from copy import deepcopy

# convert data
data['token'], data['seq'] = span_convert(text_data, spans)

train = deepcopy(data)

# Word embedding

In [None]:
# Vocabulary (in file order) and word -> vector lookup read from the GloVe file.
all_words = []
embeddings_dictionary = dict()

# `with` guarantees the file is closed even if parsing a line raises.
with open(r"/content/drive/MyDrive/NLP File/glove.6B.100d.txt", encoding="utf8") as glove_file:
    for line in glove_file:
        # Each line is "<word> <v1> <v2> ...", separated by whitespace.
        records = line.split()
        word = records[0]
        all_words.append(word)
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
words = all_words
num_words = len(words)

# word -> index lookup. Vocabulary words start at index 2 because indices
# 1 and 0 are reserved for the "UNK" and "PAD" entries added just below.
word_to_index = dict(zip(words, range(2, num_words + 2)))
word_to_index["UNK"] = 1
word_to_index["PAD"] = 0

# Reverse lookup: index -> word
idx2word = dict((index, word) for word, index in word_to_index.items())

In [None]:
# import json
# with open('/content/drive/MyDrive/Colab Notebooks/word2index.json', 'w') as fp:
#     json.dump(word_to_index, fp)

# with open('/content/drive/MyDrive/Colab Notebooks/idx2word.json', 'w') as fp:
#     json.dump(idx2word, fp)

In [None]:
# Row i holds the vector of the word whose index is i. Indices start at
# 0 ("PAD") and 1 ("UNK") and the vocabulary words follow from 2, so the
# largest index is num_words + 1: the matrix needs num_words + 2 rows, which
# is also the input_dim given to the Embedding layer later in this notebook.
embedding_matrix = np.zeros((num_words + 2, embedding_dim))

for word, index in word_to_index.items():
  # Only indices up to max_feature receive a vector; later rows stay all-zero.
  if index > max_feature:
    continue

  embedding_vector = embeddings_dictionary.get(word)

  if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector
  else:
      # No pretrained vector for this entry: fall back to a random one.
      embedding_matrix[index] = np.random.randn(embedding_dim)

In [None]:
y = train['seq']
x = train['text']

In [None]:
# train test
from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size = 0.1)

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# from nltk.tokenize import word_tokenize

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.initializers import Constant
# from nltk.corpus import stopwords
# import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
tknzr2 = TweetTokenizer()
def custom_tokenizer(text_data):
    """Lower-case the text, then tokenise it with the shared ``tknzr2`` tokenizer."""
    return tknzr2.tokenize(text_data.lower())

In [None]:
def encoding(X, y, isTest = True):
    """Turn texts (and optionally their token labels) into padded arrays.

    Args:
        X: iterable of raw texts.
        y: per-text label sequences; only used when ``isTest`` is true.
        isTest: when true the labels are padded and one-hot encoded with two
            classes; when false no labels are produced. (Despite the name,
            true is the mode that encodes labels.)

    Returns:
        ``(X, y)`` where ``X`` is the index matrix produced by
        ``pad_sequences`` with ``maxlen=max_len`` and ``y`` is either the
        one-hot label array or ``None``.
    """
    unk_index = word_to_index["UNK"]
    pad_index = word_to_index["PAD"]

    # dict.get replaces the former bare `except:` so that only a missing
    # vocabulary entry falls back to UNK; any other error now surfaces.
    sequences = [
        [word_to_index.get(w.lower(), unk_index) for w in custom_tokenizer(t)]
        for t in X
    ]

    X = pad_sequences(maxlen = max_len, sequences = sequences, padding = "post", value = pad_index)

    if isTest:
        # The PAD index (0) doubles as the padding label.
        y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=pad_index)
        y = to_categorical(y, num_classes=2)
    else:
        y = None

    return (X,y)

In [None]:
x1, y1 = encoding(x_train, y_train)

In [None]:
x2, y2 = encoding(x_dev, y_dev)

In [None]:
# `x_train` and `y` are pandas Series that keep the original row labels,
# while `x1` is the array returned by encoding(), ordered like `x_train`
# after the shuffled split. Translate the row label into a position before
# indexing `x1`, otherwise a different sample is shown.
sample_id = 6194
sample_pos = x_train.index.get_loc(sample_id)

print(custom_tokenizer(x_train[sample_id]))
print(x_train[sample_id])
print(x1[sample_pos])
print(y[sample_id])

['you', 'must', 'be', 'so', 'dumb', 'that', 'it', 'bypasses', 'you', 'what', 'these', 'immigrants', 'want', '.', 'yes', ',', 'land', 'of', 'the', '"', 'free', '"', '.', 'all', 'the', 'freebies', 'from', 'the', 'government', 'they', 'can', 'get', ',', 'because', 'they', "don't", 'know', 'english', 'well', '&', 'need', 'an', 'interpreter', '&', 'still', 'pretend', 'not', 'to', 'understand', ',', 'they', 'get', 'benefits', 'that', 'you', '&', 'i', "aren't", 'entitled', 'to', 'unless', 'we', 'work', '.', 'go', 'through', 'tsa', 'at', 'the', 'airport', ',', 'they', 'have', 'the', 'average', 'guy', 'almost', 'strip', '&', 'take', 'their', 'belts', 'off', ',', 'almost', 'having', 'to', 'pull', 'down', 'their', 'pants', '.', 'people', 'from', 'other', 'countries', 'wearing', 'hijabs', ',', 'chdors', ',', 'burkas', ',', 'niqabs', ',', 'and', 'other', 'head', 'wraps', ',', 'never', 'have', 'to', 'take', 'them', 'off', '.']
You must be so dumb that it bypasses you what these immigrants want.  YES

In [None]:
from keras.layers import Layer
import keras.backend as K
# Add attention layer to the deep learning network
class attention(Layer):
    """Attention pooling over axis 1 of a 3-D input.

    A score is computed per step as ``tanh(x . W + b)``, the scores are
    soft-maxed, and the input is summed over axis 1 weighted by them. The
    result therefore drops axis 1 of the input.
    """

    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        # One weight per input feature and one bias per position on axis 1
        # (so the layer is tied to a fixed length on that axis).
        self.W=self.add_weight(name='attention_weight', shape=(input_shape[-1],1),
                               initializer='random_normal', trainable=True)
        self.b=self.add_weight(name='attention_bias', shape=(input_shape[1],1),
                               initializer='zeros', trainable=True)
        super(attention, self).build(input_shape)

    def call(self,x):
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x,self.W)+self.b)
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)
        # Compute the weights
        alpha = K.softmax(e)
        # Restore the trailing axis so the weights broadcast against x
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

    def compute_output_shape(self, input_shape):
        # call() sums over axis 1, so the output keeps only the batch and
        # feature axes. Without this override the layer would report the
        # unchanged input shape to downstream layers.
        return (input_shape[0], input_shape[-1])

In [None]:
# BiLSTM - CRF 
from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Dropout, GlobalMaxPool1D, Input
from keras.models import Model
from keras_contrib.layers import CRF

import warnings
warnings.filterwarnings("ignore")

# Network: embedding -> BiLSTM -> per-step dense -> CRF with 2 tags.
inp = Input(shape = (max_len,))

embedded = Embedding(
    input_dim=num_words+2,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_len,
    trainable=True,
)(inp)

recurrent = LSTM(units = max_len, return_sequences=True, recurrent_dropout=0.1)
encoded = Bidirectional(recurrent)(embedded)

# Disabled attention variant, kept for reference:
# attention_layer = attention()(model)
# x = GlobalMaxPool1D()(attention_layer) # reduce dimensionality

projected = TimeDistributed(Dense(max_len, activation="relu"))(encoded)

# x = Dropout(0.1)(attention_layer)
# x = Dense(200, activation = "sigmoid")(x)

crf = CRF(2)
out = crf(projected)

model = Model(inp, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=['accuracy'])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 100)          40000200  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256, 512)          731136    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 256, 256)          131328    
_________________________________________________________________
crf_1 (CRF)                  (None, 256, 2)            522       
Total params: 40,863,186
Trainable params: 40,863,186
Non-trainable params: 0
_________________________________________________________________


## Prediction

In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-er7g9oln
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-er7g9oln


In [None]:
def test_prediction(sentence_input):
  """Return the tokens of ``sentence_input`` that the model tags with class 1.

  The sentence is tokenised, mapped to vocabulary indices (unknown tokens use
  the "UNK" index), padded with 0 up to ``max_len`` and passed to
  ``model.predict``. Only positions that belong to real tokens are reported,
  and the tokens themselves are returned, so padding is never listed and an
  out-of-vocabulary token is shown as written rather than as "UNK".

  Args:
      sentence_input: raw text to analyse.

  Returns:
      List of tokens (in sentence order) whose predicted class is 1.
  """
  # Truncate so the input never exceeds the max_len positions the network
  # was built for; previously a longer sentence produced an oversized vector.
  tokens = custom_tokenizer(sentence_input)[:max_len]

  # dict.get replaces the former bare `except:` around the lookup.
  unk_index = word_to_index["UNK"]
  sent = [word_to_index.get(w.lower(), unk_index) for w in tokens]

  result = sent + [0] * (max_len - len(sent))

  test_vector = np.array(result).reshape(1,-1)
  y_pred = model.predict(test_vector)
  y_pred = np.argmax(y_pred, axis=-1)

  # zip stops at the last real token, which skips the padded tail.
  return [tok for tok, tag in zip(tokens, y_pred[0]) if tag == 1]

In [None]:
from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

from nltk.tokenize import TweetTokenizer
import json
import ast

# These files are written with json.dump (see the commented-out export cell
# earlier in the notebook), so read them back with the json module rather than
# ast.literal_eval. The handle is named `fp` so it does not overwrite the
# `data` DataFrame loaded earlier.
with open("/content/drive/MyDrive/Colab Notebooks/word2index.json", "r") as fp:
    word_to_index = json.load(fp)

with open("/content/drive/MyDrive/Colab Notebooks/idx2word.json", "r") as fp:
    # JSON object keys are always strings; restore the integer indices so
    # lookups by integer index work again.
    idx2word = {int(index): word for index, word in json.load(fp).items()}
  
def custom_tokenizer(text_data):
    """Lower-case the text, then tokenise it with the shared ``tknzr2`` tokenizer."""
    return tknzr2.tokenize(text_data.lower())

max_len = 256

model = load_model('/content/drive/MyDrive/Colab Notebooks/new-weights', custom_objects={'CRF':CRF,'crf_loss':crf_loss,'crf_accuracy':crf_accuracy})

def test_prediction(sentence_input):
  """Return the tokens of ``sentence_input`` that the model tags with class 1.

  The sentence is tokenised, mapped to vocabulary indices (unknown tokens use
  the "UNK" index), padded with 0 up to ``max_len`` and passed to
  ``model.predict``. Only positions that belong to real tokens are reported,
  and the tokens themselves are returned, so padding is never listed and an
  out-of-vocabulary token is shown as written rather than as "UNK".

  Args:
      sentence_input: raw text to analyse.

  Returns:
      List of tokens (in sentence order) whose predicted class is 1.
  """
  # Truncate so the input never exceeds the max_len positions the network
  # was built for; previously a longer sentence produced an oversized vector.
  tokens = custom_tokenizer(sentence_input)[:max_len]

  # dict.get replaces the former bare `except:` around the lookup.
  unk_index = word_to_index["UNK"]
  sent = [word_to_index.get(w.lower(), unk_index) for w in tokens]

  result = sent + [0] * (max_len - len(sent))

  test_vector = np.array(result).reshape(1,-1)
  y_pred = model.predict(test_vector)
  y_pred = np.argmax(y_pred, axis=-1)

  # zip stops at the last real token, which skips the padded tail.
  return [tok for tok, tag in zip(tokens, y_pred[0]) if tag == 1]

In [None]:
test_prediction("You are so disgusting and stupid")

[]

In [None]:
test_prediction("Fucking asshole, please die")

[]

In [None]:
test_prediction("What the fuck are you doing you morons!")

[]

In [None]:
test_prediction("You stupid fuck")

[]

In [None]:
test_prediction("What the fuck is going on here")

[]

In [None]:
test_prediction("You black men are slaves to superior whites!!")

[]

In [None]:
test_prediction("You fucking moron")

[]

In [None]:
test_prediction("What the fuck are you doing you moron")

# Pipeline

In [None]:
import pandas as pd
import numpy as np
!pip install stanza
import stanza
from sklearn.feature_extraction.text import TfidfVectorizer
import re
!pip install bs4
from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
!pip install sparse_dot_topn 
import sparse_dot_topn.sparse_dot_topn as ct
!pip install contractions
import contractions
#!pip uninstall transformersy
!pip install --no-cache-dir transformers sentencepiece
from transformers import pipeline
import re
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import itertools
import copy
from transformers import AutoTokenizer, AutoModelForMaskedLM
!pip install -U sentence-transformers
!pip install detoxify
!pip install --upgrade git+https://github.com/flairNLP/flair.git
import pandas as pd
from detoxify import Detoxify
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from flair.embeddings import FlairEmbeddings


In [None]:
# One entry per line of the file, with the trailing newline removed.
with open('bad_word.txt') as f:
    bad_words = [line.rstrip("\n") for line in f]

In [None]:
df = pd.read_csv('labeled_comments.csv',  error_bad_lines=False)
df.head()

In [None]:
# Binary label: 1 when the toxicity score is above 0.5, otherwise 0.
label = [1 if score > 0.5 else 0 for score in df.toxicity_score]
df['label'] = label
df.head()

In [None]:
# get only sentence 1 for good sentences
for i in range(len(df)):
  if df.label[i]==0:
    df['comments'][i] = df['comments'][i].split(".")[0]

In [None]:
def text_prepare(text):
    """Clean one comment and return it lower-cased.

    Steps, in order: remove URLs, strip HTML markup, remove emoji, expand
    contractions, remove e-mail addresses, replace every character that is not
    an ASCII letter or a space with a space, collapse runs of spaces, and
    lower-case the result.

    Args:
        text: raw comment string.

    Returns:
        The cleaned string.
    """
    # removing urls
    text = re.sub(r'http\S+', '', text)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    text = BeautifulSoup(text, "html.parser").get_text()
    # removing emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = contractions.fix(text)
    # removing email: match the whole address. The previous pattern accepted a
    # single character before "@" and stopped at the first dot after it, so
    # fragments of the address were left in the text.
    text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*', "", text)
    # Keeping only alphabets
    text = re.sub('[^a-zA-Z ]', ' ', str(text))
    text = re.sub(' +', ' ', text)
    return text.lower()

In [None]:
# Data cleaning
df['comments']= df['comments'].apply(lambda x:text_prepare(x))

In [None]:
# getting good sentences
good_sentences = df[df.label==0][:2500].reset_index().drop(['index', 'toxicity_classification', 'toxicity_score', 'label'], axis=1)

In [None]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')

In [None]:
# NOTE(review): MUDESApp is not imported anywhere in the visible notebook;
# presumably it comes from the `mudes` package - confirm and add the import.
app = MUDESApp("en-large", use_cuda=False)

def replace_BW(sent):
  """Replace every span reported by ``app.predict_toxic_spans`` with 'BW'.

  Spans are applied from last to first so earlier offsets stay valid while the
  string is being edited. Each span is read as ``span[0]`` (start) and
  ``span[1]`` (end, treated as inclusive).
  """
  toxic_spans = app.predict_toxic_spans(sent, spans=True)
  for span in reversed(toxic_spans):
    start, end = span[0], span[1]
    sent = sent[:start] + 'BW' + sent[end + 1:]
  return sent

In [None]:
def retrive_pos_tagging_good(balanced_dataset):
  """Tag every comment with the global ``nlp`` pipeline.

  Returns:
      ``(pos, pos_dict)``: for each comment, the list of ``xpos`` tags in word
      order, and a dict mapping each tag to the list of word texts carrying it.
  """
  pos, pos_dict = [], []
  for row in range(len(balanced_dataset)):
    doc = nlp(balanced_dataset.comments[row])
    tags = []
    words_by_tag = {}
    for sentence in doc.sentences:
      for word in sentence.words:
        tags.append(word.xpos)
        # setdefault creates the list the first time a tag is seen.
        words_by_tag.setdefault(word.xpos, []).append(word.text)
    pos.append(tags)
    pos_dict.append(words_by_tag)
  return pos, pos_dict

In [None]:
def retrive_pos_tagging_bad(balanced_dataset):
  """Mask toxic spans with 'BW', then tag the masked comments.

  Side effect: adds a ``new_comments`` column (the masked text) to
  ``balanced_dataset``.

  Returns:
      ``(pos, pos_dict)``: per comment, the tag sequence in which a word whose
      text is 'BW' is recorded as the tag 'BW', and a dict mapping each
      ``xpos`` tag to the word texts carrying it ('BW' words are not added to
      the dict).
  """
  balanced_dataset['new_comments'] = [
    replace_BW(balanced_dataset.comments[row])
    for row in range(len(balanced_dataset))
  ]

  pos, pos_dict = [], []
  for row in range(len(balanced_dataset)):
    doc = nlp(balanced_dataset.new_comments[row])
    tags = []
    words_by_tag = {}
    for sentence in doc.sentences:
      for word in sentence.words:
        if word.text == 'BW':
          tags.append('BW')
          continue
        tags.append(word.xpos)
        words_by_tag.setdefault(word.xpos, []).append(word.text)
    pos.append(tags)
    pos_dict.append(words_by_tag)
  return pos, pos_dict

In [None]:
pos, pos_dict = retrive_pos_tagging_good(good_sentences)
good_sentences['pos'] = pos
good_sentences['pos'] = good_sentences['pos'].str.join(" ")
good_sentences['pos_dict'] = pos_dict

In [None]:
def get_tfidf_vectorizer(good_sentences, bad_sentences):
  """Fit a TF-IDF vectorizer on the ``pos`` column of both frames.

  Args:
      good_sentences: DataFrame with a ``pos`` column of space-joined tags.
      bad_sentences: DataFrame with a ``pos`` column of space-joined tags.

  Returns:
      ``(tfidf_vectorizer, tf_idf_matrix_good)``: the fitted vectorizer and
      the transform of ``good_sentences['pos']``.
  """
  balanced_dataset = pd.concat([good_sentences, bad_sentences], axis=0, ignore_index=True)
  # Raw string: '\S' in a plain literal is an invalid escape sequence.
  tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df=0.9, min_df=5, token_pattern=r'(\S+)')
  tfidf_vectorizer.fit(balanced_dataset['pos'])
  tf_idf_matrix_good = tfidf_vectorizer.transform(good_sentences['pos'])
  return tfidf_vectorizer, tf_idf_matrix_good

In [None]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Return, for each row of ``A``, the columns of ``B`` kept by ``ct.sparse_dot_topn``.

    Args:
        A: sparse matrix (anything exposing ``tocsr()``), shape ``(M, K)``.
        B: sparse matrix (anything exposing ``tocsr()``), shape ``(K, N)``.
        ntop: passed to ``ct.sparse_dot_topn``; also sizes the output buffers
            at ``M * ntop`` entries.
        lower_bound: passed to ``ct.sparse_dot_topn`` unchanged.
            NOTE(review): presumably the minimum score for an entry to be
            kept - confirm against the sparse_dot_topn documentation.

    Returns:
        A list with one entry per row of the result matrix; each entry is the
        ``np.argwhere`` output (an array of shape ``(k, 1)``) holding the
        column indices whose value is greater than 0.
    """
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    # Upper bound on stored results: ntop entries for each of the M rows.
    nnz_max = M*ntop
 
    # Output buffers in CSR layout; they are handed to sparse_dot_topn and then
    # used to build the result matrix, so the call presumably fills them in place.
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    # The result is converted to a dense (M, N) array before the row scan below.
    matches = csr_matrix((data,indices,indptr),shape=(M,N)).toarray()
    ans =[]
    for i in range(len(matches)):
      a = matches[i]
      # Column positions of this row whose stored value is strictly positive.
      ans.append(np.argwhere(a>0))
    return ans

In [None]:
def find_similar_good_sentences(bad_sentence, good_sentences, ans):
  """Collect the first bad sentence and its matched good sentences in one dict.

  Args:
      bad_sentence: frame with ``comments``, ``pos`` and ``pos_dict``; the
          entry labelled 0 of each column is used.
      good_sentences: frame with the same three columns.
      ans: iterable of matches; ``match[0]`` selects the good sentence.

  Returns:
      Dict with keys 'bad', 'good', 'bad_pos', 'good_pos', 'bad_pos_map' and
      'good_pos_map'. Every value is a one-element list; the 'good*' entries
      hold a list with one item per match.
  """
  bad_text = bad_sentence.comments[0]
  bad_tags = bad_sentence.pos[0]
  bad_tag_map = bad_sentence.pos_dict[0]

  matched = [
    (
      good_sentences.comments[match[0]],
      good_sentences.pos.iloc[match[0]],
      good_sentences.pos_dict[match[0]],
    )
    for match in ans
  ]

  return {
    'bad': [bad_text],
    'good': [[text for text, _, _ in matched]],
    'bad_pos': [bad_tags],
    'good_pos': [[tags for _, tags, _ in matched]],
    'bad_pos_map': [bad_tag_map],
    'good_pos_map': [[tag_map for _, _, tag_map in matched]],
  }

In [None]:
def make_assignments(list_1,list_2):
  """Enumerate every way of pairing items of ``list_1`` with items of ``list_2``.

  The shorter collection is used in full; ordered selections of the same size
  are drawn from the longer one. Each result is a list of
  ``(item_from_list_1, item_from_list_2)`` pairs.

  Returns:
      List of pairings (each a list of 2-tuples).
  """
  if len(list_1) < len(list_2):
    return [
      list(zip(list_1, chosen))
      for chosen in itertools.permutations(list_2, len(list_1))
    ]
  return [
    list(zip(chosen, list_2))
    for chosen in itertools.permutations(list_1, len(list_2))
  ]


def generate(xi, pi, pi_dash, bad_words, pi_map, pi_dash_map):
  """Build candidate sentences by placing words of one sentence into another's tag template.

  Starts from a template of ``len(pi_dash)`` "<mask>" slots. For every tag
  that occurs in both ``pi`` and ``pi_dash`` (except 'BW'), the distinct words
  listed under that tag in ``pi_map`` are assigned to the template positions
  carrying the tag, in every pairing produced by ``make_assignments``.

  Args:
      xi: unused; kept for interface compatibility.
      pi: tag sequence of the source sentence.
      pi_dash: tag sequence of the template sentence.
      bad_words: unused; kept for interface compatibility.
      pi_map: dict mapping a tag to the source words carrying it.
      pi_dash_map: unused; kept for interface compatibility.

  Returns:
      List of candidates, each a list of ``len(pi_dash)`` strings (words or
      "<mask>").
  """
  source_tags = set(pi)
  candidates = [["<mask>"] * len(pi_dash)]

  # dict.fromkeys de-duplicates while keeping first-seen order, so the result
  # order is reproducible (iterating a set of strings is not).
  for tag in dict.fromkeys(pi_dash):
    if tag == 'BW' or tag not in source_tags:
      continue
    words = list(dict.fromkeys(pi_map[tag]))
    slots = [position for position, slot_tag in enumerate(pi_dash) if slot_tag == tag]
    assignments = make_assignments(words, slots)
    if len(assignments) == 0:
      continue

    expanded = []
    for assignment in assignments:
      for prev_sent in candidates:
        # Copy once per candidate and apply the WHOLE assignment to it.
        # Copying inside the inner loop discarded every pair but the last.
        filled = list(prev_sent)
        for word, indx in assignment:
          filled[indx] = word
        expanded.append(filled)
    candidates = expanded

  return candidates

In [None]:
def fill_mask_word_roberta(input_string, classifier_roberta, puncs):
    """Fill each "<mask>" in ``input_string`` and return the text after "[SEP]".

    For every mask the candidates returned by ``classifier_roberta`` are tried
    in order; the first one that is neither in the global ``bad_words`` list
    (case-insensitively) nor in ``puncs`` replaces the mask. If no candidate
    qualifies, the mask is removed.

    Args:
        input_string: text of the form "<context> [SEP] <sentence with masks>".
        classifier_roberta: callable returning, for one mask, a list of dicts
            with a 'token_str' key, and for several masks a list of such lists.
        puncs: collection of strings that must not be used as fillers.

    Returns:
        The part after the first "[SEP]" with masks resolved, stripped. If the
        text has no "[SEP]", the whole text is returned (stripped) instead of
        raising IndexError.
    """
    def _after_sep(text):
        # [-1] falls back to the full text when "[SEP]" is absent.
        return text.split("[SEP]", 1)[-1].strip()

    # Count occurrences of the mask itself, so that a mask followed directly
    # by punctuation is counted too (a space-split count would miss it).
    mask_count = input_string.count('<mask>')
    if mask_count == 0:
        return _after_sep(input_string)

    result = classifier_roberta(input_string)
    if mask_count == 1:
        # Single-mask output is a flat list; wrap it to share the loop below.
        result = [result]

    ans = input_string
    for each_mask in result:
        replacement = ''  # used when every candidate is rejected
        for candidate in each_mask:
            filler_word = candidate['token_str'].strip()
            if filler_word.lower() not in bad_words and filler_word not in puncs:
                replacement = filler_word
                break
        # str.replace inserts the filler literally; re.sub would interpret
        # backslashes / group references inside the filler word.
        ans = ans.replace('<mask>', replacement, 1)

    return _after_sep(ans)


In [None]:
def remove_adjacent(seq): # works on any sequence, not just on numbers
  """Collapse runs of equal neighbouring items in ``seq``, in place.

  The first item of every run is kept. Nothing is returned; ``seq`` itself is
  modified and must support ``del seq[i]``.
  """
  cursor = 1
  remaining = len(seq)  # tracked by hand so len() is not re-evaluated each pass
  while cursor < remaining:
    if not (seq[cursor] == seq[cursor - 1]):
      # Different from its left neighbour: keep it and move on.
      cursor += 1
      continue
    # Same as its left neighbour: drop it and re-check the same position.
    del seq[cursor]
    remaining -= 1

In [None]:
def embeddings_cosine_toxicity_perplexity(toxic_model, embedding_model, perplex_model, original, created_list):
  """Score every generated sentence against the original one.

  For each entry of ``created_list`` three values are collected:
  ``1 - toxic_model.predict(...)['toxicity']``, the result of
  ``perplex_model.calculate_perplexity(...)`` multiplied by -1, and the cosine
  similarity between its embedding and the embedding of ``original``
  (stored as a string).

  Returns:
      Dict with keys "original_sentence", "generated_sentences",
      "cosine_score", "non-toxicity_score" and "perplexity_score"; the three
      score entries are lists aligned with ``created_list``.
  """
  # Embed the original sentence and all generated candidates.
  original_sent_embeddings = embedding_model.encode(original)
  created_sent_embeddings = embedding_model.encode(created_list)

  non_toxicity = []
  negated_perplexity = []
  cosine_scores = []

  for position, candidate_embedding in enumerate(created_sent_embeddings):
    candidate = created_list[position]
    non_toxicity.append(1-(toxic_model.predict(candidate)['toxicity']))
    negated_perplexity.append(perplex_model.calculate_perplexity(candidate)*-1)
    similarity = cosine_similarity(
      original_sent_embeddings.reshape(1,-1),
      candidate_embedding.reshape(1,-1),
    )
    # The similarity matrix is 1x1; keep its single value, as a string.
    cosine_scores.append(similarity.astype(str)[0][0])

  return {
    "original_sentence": original,
    "generated_sentences": created_list,
    "cosine_score": cosine_scores,
    "non-toxicity_score": non_toxicity,
    "perplexity_score": negated_perplexity,
  }