<a href="https://colab.research.google.com/github/HaraldsU/VTP_LPD/blob/main/Val_teh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text masking

## Imports

In [1]:
!pip install stanza
!pip install -U sentence-transformers



In [2]:
!pip install line_profiler



In [3]:
%load_ext line_profiler

In [4]:
!wget "https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/stopwords.txt" -O 'stopwords.txt'

--2024-06-18 23:18:00--  https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681 [text/plain]
Saving to: ‘stopwords.txt’


2024-06-18 23:18:01 (44.5 MB/s) - ‘stopwords.txt’ saved [681/681]



In [5]:
import stanza
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
# The same directory is reused below as CORENLP_HOME.
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)
# NOTE: imported here, after the install call, rather than with the imports at the top of the cell.
from stanza.server import CoreNLPClient

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir



## Functions

In [6]:
def format_text(output):
  """Join a list of tokens back into a readable string.

  Rules applied while walking the tokens:
    * the first token and every token that follows '.', '!' or '?' is
      capitalized, except mask placeholders such as '[MSK1]';
    * punctuation ('.', ',', '!', '?', ':', ';') and clitics matching
      "'<letter>" (e.g. "'s") are glued to the previous word;
    * a '-' glues itself and the following token to the previous word, and
      that following token is consumed (not emitted a second time);
    * if the text ends with a word rather than with '.', '?', '!' or "'",
      a final '.' is appended to that word.

  Args:
    output: sequence of string tokens (words, punctuation, mask tokens).

  Returns:
    The tokens joined with single spaces after the rules above. An empty
    input yields an empty string.
  """
  # Match the literal placeholder, e.g. "[MSK12]". The previous pattern was a
  # character class that matched any token containing M, S, K, a digit or '*'.
  mask_token = re.compile(r'\[MSK\d*\]')
  clitic = re.compile(r"'[A-Za-z]")
  sentence_enders = ('.', '!', '?')
  attachable = ('.', ',', '!', '?', ':', ';')
  no_final_period = ('.', '?', '!', '\'')

  final_output = []
  needs_period = False  # True while the last consumed token is a plain word
  idx = 0
  while idx < len(output):
    cur = output[idx]

    if not mask_token.search(cur):
      if idx == 0 or output[idx - 1] in sentence_enders:
        cur = cur.capitalize()

    if final_output and (cur in attachable or clitic.search(cur)):
      final_output[-1] += cur
      needs_period = False
    elif cur == '-' and final_output:
      final_output[-1] += cur
      needs_period = False
      if idx + 1 < len(output):
        # Consume the token after the hyphen so it is not emitted twice.
        final_output[-1] += output[idx + 1]
        needs_period = output[idx + 1] not in no_final_period
        idx += 1
    else:
      # Also reached when there is no previous word to glue to.
      final_output.append(cur)
      needs_period = cur not in no_final_period
    idx += 1

  if needs_period:
    final_output[-1] += '.'

  return ' '.join(final_output)

In [7]:
def spans_to_ranges(spans):
  """Group similar span pairs into sets of word ranges that share one mask.

  Args:
    spans: iterable of dicts with keys 'span1', 'span2' (each a dict with
      'start' and 'end' word indices, inclusive) and 'score'. Pairs are
      processed in the given order.

  Returns:
    A list of groups; every group is a list of [start, end, score] ranges.

  Phase 1 assigns each pair to the first group it touches: if only one of the
  two spans overlaps the group, the other span is added to it; if both
  overlap, nothing is added; if no group is touched, the pair starts a new
  group. Phase 2 repeatedly merges any two groups that contain overlapping
  ranges. While merging, a range that overlaps one already present in the
  target group is dropped and the range already in the group is kept.
  """
  def overlaps(start, end, rng):
    # Inclusive interval intersection test against rng = [start, end, score].
    return start <= rng[1] and end >= rng[0]

  msk_gr = []
  for span in spans:
    span1_start = span['span1']['start']
    span1_end = span['span1']['end']
    span2_start = span['span2']['start']
    span2_end = span['span2']['end']
    score = span['score']

    # Bit flags: 1 = span1 overlaps the group, 2 = span2 overlaps the group.
    # OR-ing (instead of adding) keeps the value in {0, 1, 2, 3} even when a
    # span overlaps several ranges of the same group.
    hit = 0
    for gr in msk_gr:
      for rng in gr:
        if overlaps(span1_start, span1_end, rng):
          hit |= 1
        if overlaps(span2_start, span2_end, rng):
          hit |= 2
        if hit == 3:
          break
      if hit == 0:
        continue
      if hit == 1:
        gr.append([span2_start, span2_end, score])
      elif hit == 2:
        gr.append([span1_start, span1_end, score])
      # hit == 3: both spans are already covered by this group.
      break

    if hit == 0:
      msk_gr.append([[span1_start, span1_end, score], [span2_start, span2_end, score]])

  def first_overlapping_groups():
    # Scan order (group i, range of i, group j, range of j) decides which
    # pair of groups is merged first.
    for i in range(len(msk_gr)):
      for r_i in msk_gr[i]:
        for j in range(i + 1, len(msk_gr)):
          for r_j in msk_gr[j]:
            if overlaps(r_i[0], r_i[1], r_j):
              return i, j
    return None

  # Merge groups until no two groups share an overlapping range. Every merge
  # removes one group, so the loop terminates.
  while True:
    pair = first_overlapping_groups()
    if pair is None:
      break
    i, j = pair
    for r_jj in msk_gr[j]:
      # NOTE(review): the previous code contained a no-op expression here that
      # looked like an attempt to keep the higher-scored of two overlapping
      # ranges. The effective behaviour (keep the range already in group i)
      # is preserved.
      if not any(overlaps(r_ii[0], r_ii[1], r_jj) for r_ii in msk_gr[i]):
        msk_gr[i].append(r_jj)
    msk_gr.pop(j)

  return msk_gr

In [8]:
def get_similar_spans(spans, cos_threshold, model):
  """Find pairs of non-overlapping spans whose embeddings are similar.

  Args:
    spans: list of dicts with keys 'text', 'start' and 'end' (inclusive word
      indices).
    cos_threshold: minimum cosine similarity for a pair to be reported.
    model: object whose `encode(list_of_texts)` returns one embedding per
      text, in order.

  Returns:
    A list of dicts with keys 'score', 'cos:', 'span1' and 'span2', sorted by
    'score' in descending order. 'score' is the cosine similarity plus a
    bonus of up to 0.15 that grows with the combined length of both spans.
    The cosine value keeps the (1, 1) array shape that `cosine_similarity`
    returns for a single pair.
  """
  embeddings = model.encode([s["text"] for s in spans])
  similar_spans = []
  for i in range(len(spans) - 1):
    first = spans[i]
    # One batched call per row instead of one call per pair; results may
    # differ from the per-pair call only by floating-point rounding.
    row = cosine_similarity([embeddings[i]], embeddings[i + 1:])
    for j in range(i + 1, len(spans)):
      second = spans[j]
      # Only pair spans where the second starts after the first ends.
      if second['start'] <= first['end']:
        continue
      col = j - i - 1
      cos_score = row[:, col:col + 1]  # (1, 1) slice, same shape as before
      if cos_score >= cos_threshold:
        combined_len = first['end'] + 1 - first['start'] + second['end'] + 1 - second['start']
        score = 1 * cos_score + 0.15 * (1 - 1 / combined_len)  # cos_score + length of text

        similar_spans.append({
            'score': score, 'cos:': cos_score
            , 'span1': first
            , 'span2': second})
  similar_spans = sorted(similar_spans, key=lambda d: d['score'], reverse=True)
  return similar_spans

In [9]:
def get_spans(words, stopwords):
  """Enumerate every contiguous word span that is eligible for masking.

  A span is skipped when it contains a stopword, when it contains a mask
  placeholder such as '[MSK1]', or when it is a lone ','.

  Args:
    words: list of string tokens.
    stopwords: collection of tokens that must not appear inside a span.

  Returns:
    A list of dicts {'text', 'start', 'end'} where 'text' is the span joined
    with single spaces and 'start'/'end' are inclusive word indices. Spans
    are ordered by start index, then by length.
  """
  # Match the literal placeholder. The previous pattern was a character class
  # that rejected any span containing M, S, K, a digit or '*'.
  mask_token = re.compile(r'\[MSK\d*\]')
  blocked = set(stopwords)

  spans = []
  for i in range(len(words)):
    for j in range(i + 1, len(words) + 1):
      newest = words[j - 1]
      # Every longer span starting at i would also contain this token, so the
      # rest of the row can be skipped.
      if newest in blocked or mask_token.search(newest):
        break

      s = ' '.join(words[i:j])
      if s == ',':
        continue
      spans.append({"text": s, "start": i, "end": j - 1})
  return spans

In [10]:
def coref_mask_and_tokenize(text, core_nlp_client):
  """Tokenize `text` and replace coreferent mentions with mask placeholders.

  Args:
    text: raw input string passed to `core_nlp_client.annotate`.
    core_nlp_client: object whose `annotate(text)` returns a document with
      `corefChain` (chains exposing `chainID` and `mention`) and `sentence`
      (each exposing `token`, with `word`, `lemma` and `corefClusterID`).

  Returns:
    A tuple (words, words_lemmatized, msk_id). Tokens that belong to a chain
    with a mention count other than one become '[MSK<n>]' in both lists;
    other tokens appear lower-cased in `words` and as their lemma in
    `words_lemmatized`. `msk_id` is the number of placeholders handed out.
  """
  document = core_nlp_client.annotate(text)
  corefs = document.corefChain

  # Number the chains that have more than a single mention, and remember the
  # first chain seen for every id so tokens can be resolved by lookup.
  chain_mask = {}
  chain_by_id = {}
  msk_id = 0
  for chain in corefs:
    chain_by_id.setdefault(chain.chainID, chain)
    if len(chain.mention) == 1:
      continue
    msk_id += 1
    chain_mask[chain.chainID] = '[MSK' + str(msk_id) + ']'

  words, words_lemmatized = [], []
  for sentence in document.sentence:
    for token in sentence.token:
      chain = chain_by_id.get(token.corefClusterID)
      if chain is not None and len(chain.mention) != 1:
        placeholder = chain_mask.get(chain.chainID)
        words.append(placeholder)
        words_lemmatized.append(placeholder)
      else:
        words.append(token.word.lower())
        words_lemmatized.append(token.lemma)

  return words, words_lemmatized, msk_id


In [11]:
def get_stopwords():
    """Return the lines of 'stopwords.txt' (in the working directory) as a list."""
    with open("stopwords.txt", "r") as handle:
        contents = handle.read()
    # One stopword per line; line terminators are not kept.
    return contents.splitlines()

In [12]:
def remove_quotes(text):
    """Return a list with straight and curly double quotes removed from each item.

    `text` is iterated item by item; every item must provide `str.replace`.
    """
    def strip_marks(word):
        for mark in ('"', '”', '“'):
            word = word.replace(mark, '')
        return word

    return [strip_marks(word) for word in text]

In [13]:
def print_help(obj, name='Obj'):
  """Debug helper: print '<name> = ' followed by every item of `obj` on one line.

  Each item is followed by a single space and the line ends with a newline.
  """
  rendered = ''.join(str(item) + ' ' for item in obj)
  print(str(name) + ' = ' + rendered)

In [14]:
def mask_word_ranges(words, mask_ranges, msk_index):
  """Replace word ranges with mask placeholders.

  Args:
    words: sequence of tokens; each one is converted with `str`.
    mask_ranges: list of groups, each a list of ranges whose first two items
      are the inclusive start and end word index. All ranges of one group
      receive the same placeholder.
    msk_index: number of placeholders already in use; the first group gets
      '[MSK<msk_index + 1>]'.

  Returns:
    The token list in original word order, with every range collapsed into a
    single placeholder token.
  """
  d = dict(enumerate(map(str, words)))
  for msk in mask_ranges:
    msk_index += 1
    msk_txt = '[MSK' + str(msk_index) + ']'
    for r in msk:
      d[r[0]] = msk_txt
      for i in range(r[0] + 1, r[1] + 1):
        d.pop(i, None)
  # Emit by word index, not by dict insertion order: a range that starts on an
  # index removed by an earlier range re-inserts that key at the end of the
  # dict, which used to move the placeholder behind later words.
  return [d[k] for k in sorted(d)]

In [15]:
def mask_text_file(text_arr, core_nlp_client, out_file_name, write_mode, model):
  """Mask every text in `text_arr` and write the results to a CSV file.

  Args:
    text_arr: iterable of input strings.
    core_nlp_client: client passed on to `coref_mask_and_tokenize`.
    out_file_name: path of the CSV file to write.
    write_mode: accepted but not used by this function.
    model: sentence-embedding model passed on to `get_similar_spans`.

  Each output row holds one masked text followed by a newline character.
  Texts that yield no candidate spans are written without masking (quotes
  removed). Progress is printed while processing.
  """
  stopwords = get_stopwords()
  cleaned_texts = remove_quotes(text_arr)
  total = len(cleaned_texts)

  output_rows = []
  for position, text in enumerate(cleaned_texts, start=1):
    words, words_lemmatized, mask_count = coref_mask_and_tokenize(text, core_nlp_client)
    spans = get_spans(words_lemmatized, stopwords)
    print(f'\r{round((position / total) * 100, 2)}% / 100% (', position , ')', end='', flush=True)

    if spans:
      # Similar span pairs -> groups of ranges -> placeholders -> sentence.
      similar_spans = get_similar_spans(spans, .91, model)
      mask_ranges = spans_to_ranges(similar_spans)
      masked_words = mask_word_ranges(words, mask_ranges, mask_count)
      output_rows.append(format_text(masked_words) + '\n')
    else:
      output_rows.append(text + '\n')
      print('\nNo valid spans')

  pd.DataFrame(output_rows).to_csv(out_file_name, index=False, quoting=1)
  print('\nFinished!')


In [16]:
def mask_text(text_arr, core_nlp_client, model):
  """Mask every text in `text_arr` and return the masked strings as a list.

  Args:
    text_arr: iterable of input strings.
    core_nlp_client: client passed on to `coref_mask_and_tokenize`.
    model: sentence-embedding model passed on to `get_similar_spans`.

  Texts that yield no candidate spans are returned without masking (quotes
  removed).
  """
  stopwords = get_stopwords()
  masked_texts = []

  for text in remove_quotes(text_arr):
    words, words_lemmatized, mask_count = coref_mask_and_tokenize(text, core_nlp_client)
    spans = get_spans(words_lemmatized, stopwords)

    if spans:
      # Similar span pairs -> groups of ranges -> placeholders -> sentence.
      similar_spans = get_similar_spans(spans, .90, model)
      mask_ranges = spans_to_ranges(similar_spans)
      masked_words = mask_word_ranges(words, mask_ranges, mask_count)
      masked_texts.append(format_text(masked_words))
    else:
      masked_texts.append(text)

  return masked_texts

# NLI

In [17]:
!wget https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/mapping.json -O 'mapping.json'

--2024-06-18 23:18:08--  https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/mapping.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1521 (1.5K) [text/plain]
Saving to: ‘mapping.json’


2024-06-18 23:18:08 (27.6 MB/s) - ‘mapping.json’ saved [1521/1521]



In [18]:
!pip install transformers pandas scikit-learn



In [19]:
import pandas as pd
import json
import re
from transformers import pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sb
import matplotlib.pyplot as plt
import os

In [20]:
def get_labels(file_name):
  """Load a JSON object from `file_name` and return its values as a list.

  The values keep the order in which the keys appear in the file.
  """
  with open(file_name, 'r') as file:
      mapping = json.load(file)

  return [value for _key, value in mapping.items()]

In [21]:
def map_mask_to_understandable(arr):
  """Translate values found in 'mapping.json' back to their keys.

  Every item of `arr` that equals a value in the mapping is replaced by the
  key of the first such value; all other items are returned unchanged.
  """
  with open('mapping.json', 'r') as file:
      labels = json.load(file)

  names = list(labels.keys())
  templates = list(labels.values())

  return [names[templates.index(item)] if item in templates else item for item in arr]

In [22]:
def classify_sequence(args,clasifier):
    """Classify one text and return the first label of the classifier result.

    Args:
      args: a pair `(line, candidate_labels)`.
      clasifier: callable invoked as `clasifier(line, candidate_labels)`; its
        result must provide a 'labels' sequence.

    Returns:
      `result['labels'][0]`.
    """
    line, candidate_labels = args
    result = clasifier(line, candidate_labels)
    # The previous version also searched result['scores'] for the maximum but
    # never used it. NOTE(review): this relies on the classifier returning
    # labels best-first, as the zero-shot pipeline documents; confirm for
    # other classifiers.
    return result['labels'][0]

In [23]:
def do_classification(input_text, mapping, total, model, save, isFile):
  """Run zero-shot classification on a CSV file or on a single text.

  Args:
    input_text: path of a CSV file when `isFile == 'y'`, otherwise the text
      to classify.
    mapping: path of the JSON file whose values are used as candidate labels.
    total: used only as the denominator of the printed progress percentage.
    model: model passed to the "zero-shot-classification" pipeline.
    save: 'y' to write the heatmap, predictions and report under 'results/'
      and zip that directory.
    isFile: 'y' selects file mode; any other value classifies `input_text`
      directly and prints the predicted label.

  In file mode the CSV must contain the columns 'updated_label' (true labels)
  and 'source_article' (texts). A classification report is printed and a
  confusion-matrix heatmap is shown.
  """
  # NOTE: the pipeline is built on every call.
  classifier = pipeline("zero-shot-classification", model)
  candidate_labels = get_labels(mapping)

  if isFile == 'y':
    df = pd.read_csv(input_text)
    # df = df.iloc[0:total] # first total lines
    # df = df.sample(n=total, random_state=1) # random total lines
    true_labels = df['updated_label'].tolist()
    true_labels = map_mask_to_understandable(true_labels)
    # From here on `df` is the Series of texts, no longer the full frame.
    df = df['source_article']
    # print('T = ', '\n', true_labels)
    # print('Tlen = ', len(true_labels))
    # print('C = ', '\n', candidate_labels)

    cnt = 1
    predictions = []

    for line in df:
      prediction = classify_sequence([line, candidate_labels],classifier)
      predictions.append(prediction)
      # print(cnt, '/', total)
      # NOTE(review): `total` comes from the caller and is not checked against
      # the number of rows, so the percentage may not end at 100.
      print(f'\r{round((cnt / total) * 100, 2)}% / 100% (', cnt , ')', end='', flush=True)
      cnt += 1

    # Map predicted and candidate labels through mapping.json so they can be
    # compared with the mapped true labels.
    predictions = map_mask_to_understandable(predictions)
    # print('P = ', '\n', predictions)
    # print('Plen = ', len(predictions))
    all_labels = map_mask_to_understandable(candidate_labels)

    # unique_labels = list(set(true_labels + predictions))
    report = classification_report(true_labels, predictions, labels = all_labels)
    print('\n', report)

    matrix = confusion_matrix(true_labels, predictions, labels=all_labels)
    sb.heatmap(matrix, xticklabels=all_labels, yticklabels=all_labels, annot=True, fmt="d")
    plt.xticks(rotation=90)
    # The results directory is created even when `save` is not 'y'.
    os.makedirs('results', exist_ok=True)
    if save == 'y':
      plt.savefig('results/heatmap.png', bbox_inches='tight')
    plt.show()

    result_df = pd.DataFrame({
        'actual': true_labels,
        'prediction': predictions
    })
    if save == 'y':
      result_df.to_csv('results/act_pred.csv')
      # The report is a single string, stored as a one-cell CSV.
      report_df = pd.DataFrame([report])
      report_df.to_csv('results/report.csv', index=False)
      # IPython shell escape: only valid when run inside a notebook.
      !zip -r results.zip 'results'
  else:
      prediction = classify_sequence([input_text, candidate_labels],classifier)
      print(prediction)

# DEMO

In [None]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
# The server started here is stopped by the `client.stop()` cell after the demo loop.
client = CoreNLPClient(
    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'coref'],
    memory='4G',
    endpoint='http://localhost:9001',
    be_quiet=True)
print(client)
client.start()

INFO:stanza:Writing properties to tmp file: corenlp_server-10f3169ebde040ad.props
INFO:stanza:Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-10f3169ebde040ad.props -annotators tokenize,ssplit,pos,lemma,ner,coref -preload -outputFormat serialized


<stanza.server.client.CoreNLPClient object at 0x7fd3b431b280>


In [None]:
# # Load the pre-trained model
# with open("nb_classifier.pickle", "rb") as dmp:
#     nb = pickle.load(dmp)
#     print("[I] NB classifier loaded from a file")


# Sentence-embedding model used by mask_text to find similar spans.
cos_sim_model = SentenceTransformer('bert-base-nli-mean-tokens')
mapping = 'mapping.json'
# Name of the NLI model handed to do_classification.
model = 'cross-encoder/nli-distilroberta-base'
# model = 'facebook/bart-large-mnli'
# model = "google/electra-large-discriminator"
save = 'y'
# 'n': classify the entered text directly instead of reading a CSV file.
isFile = 'n'
total = 1
# Example inputs:
# If Joe eats greasy food, he will feel sick. Given now that Joe feels sick, therefore, Joe must have had greasy food
# Jack is a good athlete. Jack comes from Canada. Therefore, all Canadians are good athletes
# Interactive loop: an empty input ends it.
while True:
    text = input("\nEnter a text to classify: ")
    if len(text) == 0: break

    # Extract text features for classification
    masked_array =  mask_text([text], client, cos_sim_model)
    print("\nMasked text:", masked_array[0], "\n")
    input_text = masked_array[0]


    do_classification(input_text, mapping, total, model, save, isFile)



Enter a text to classify: Annie must like Starbucks because all white girls like Starbucks.

Masked text: Annie must [MSK2] [MSK1] because all white girls [MSK2] [MSK1]. 

[MSK1] is true because of [MSK2]. [MSK2] is true because of [MSK1].


KeyboardInterrupt: Interrupted by user

In [None]:
# Shut down the CoreNLP server started for the demo.
client.stop()

# Testing

- Pieliku klāt profileri (visu laiku patērē coref_mask_and_tokenize() un get_similar_spans());
- Uzliku skaistāku progresa printu;
- Pieliku get_stopwords klāt pāris stopwordus;
- Ieliku savu format_text, jo bija problēmas ar lielajiem burtiem, punktiem, pēdiņām utt.;
- Pieliku metodi remove_quotes;
- Pamainīju mask_text faila izveidi;
- output.csv fails ar 1108 rindiņām:
https://github.com/HaraldsU/VTP_LPD/blob/main/output.csv


In [24]:
!wget "https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/input_full.csv" -O 'input_full.csv'

--2024-06-18 23:18:16--  https://raw.githubusercontent.com/HaraldsU/VTP_LPD/main/Data/input_full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 865823 (846K) [text/plain]
Saving to: ‘input_full.csv’


2024-06-18 23:18:16 (4.25 MB/s) - ‘input_full.csv’ saved [865823/865823]



In [25]:
def remove_excessive_lines(df, max_words=100):
    """Return the rows of `df` whose 'source_article' has at most `max_words` words.

    Words are counted by splitting the text on whitespace.
    """
    def is_short_enough(article):
        return len(article.split()) <= max_words

    keep = df['source_article'].apply(is_short_enough)
    return df[keep]

In [27]:
# text = ["Jack is a good athlete. Jack comes from Canada. Therefore, all Canadians are good athletes. Good athletes always win."] # [MSK1] is a [MSK2]. [MSK1] comes from [MSK3]. Therefore, all [MSK3] are [MSK2].
# text = ['If Joe eats greasy food, he will feel sick. Given now that Joe feels sick, therefore, Joe must have had greasy food.']
# text = ["McDonald's Hamburgers: over 99 billion served."]
# text = ['men don’t cry']
# text = ['David is so wrong about Luna\'s work ethic. David is just an egotistical jerk with a God complex, what does he know?']
# text  = ['Mob of people: Lower taxes! Lower taxes! Politician: People, your taxes are high because of illegal immigrants. That\'s right—illegal immigrants. We need to get rid of them. Mob of people: (murmuring amongst themselves) Hmmm... immigrants. Let\'s get rid of them!']
# text  = ['Trump cites (biased) poll results showing that people think he\'s a strong leader to prove a point that he is a strong leader. ABC News/Washington Post Poll (wrong big on election) said almost all stand by their vote on me & 53% said strong leader.']
# Sentence-embedding model used to find similar spans.
model = SentenceTransformer('bert-base-nli-mean-tokens')
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'coref'],
    memory='4G',
    endpoint='http://localhost:9001',
    be_quiet=True)
print(client)
client.start()

# The server is stopped in `finally`, so a failure while reading or masking
# does not leave it running on the port.
try:
    # df = pd.read_csv('input.csv')
    # df = df[0:100]
    # selected_rows = df.iloc[0:99] # First 99 rows

    df = pd.read_csv('input_full.csv')
    # Keep only articles of at most 100 words and store them next to the output.
    df_reduced = remove_excessive_lines(df,100)
    df_reduced.to_csv('input_reduced.csv', index=False)

    rows = df_reduced['source_article']

    # rows = text
    print('Row count = ', len(rows))

    # %lprun -f mask_text_file mask_text_file(rows, client, "output_full.csv", 'w', model)
    mask_text_file(rows, client, "output_reduced.csv", 'w', model)
finally:
    client.stop()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
INFO:stanza:Writing properties to tmp file: corenlp_server-a63622d9182b48c3.props
INFO:stanza:Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-a63622d9182b48c3.props -annotators tokenize,ssplit,pos,lemma,ner,coref -preload -outputFormat serialized


<stanza.server.client.CoreNLPClient object at 0x7f9663c472e0>
Row count =  2210
15.57% / 100% ( 344 )
No valid spans
34.43% / 100% ( 761 )
No valid spans
39.46% / 100% ( 872 )
No valid spans
74.62% / 100% ( 1649 )
No valid spans
75.16% / 100% ( 1661 )
No valid spans
76.33% / 100% ( 1687 )
No valid spans
100.0% / 100% ( 2210 )
Finished!


In [28]:
# Replace the original articles with their masked versions while keeping the
# other columns of the reduced input.
output = pd.read_csv('output_reduced.csv')
# Named `input_df` so the builtin `input()` used by the demo cell is not shadowed.
input_df = pd.read_csv('input_reduced.csv')

# The masked file holds a single column; take it as a Series.
input_df['source_article'] = output.iloc[:, 0]
input_df.to_csv('output_reduced.csv', index=False)


In [29]:
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModelForCausalLM, AutoModelForSequenceClassification

In [None]:
  # Settings for classifying a whole CSV file with do_classification.
  mapping = 'mapping.json'
  # NOTE(review): no cell shown in this notebook writes 'output_full.csv'; the
  # masking cell writes 'output_reduced.csv'. Confirm the intended input.
  input_text = 'output_full.csv'
  # input_text = 'Hah aha I am you are.'
  # model = 'cross-encoder/nli-distilroberta-base'
  # model = AutoModelForPreTraining.from_pretrained("google/electra-small-discriminator")
  # model = AutoModelForPreTraining.from_pretrained("google/electra-large-discriminator")
  # model = AutoModelForSequenceClassification.from_pretrained("sileod/deberta-v3-small-tasksource-nli")
  model = "cross-encoder/nli-distilroberta-base" # Even faster
  save = 'y'
  isFile = 'y'
  # NOTE(review): used only for the progress percentage; the masking run above
  # reported 2210 rows. Confirm this matches the file being classified.
  total = 2222

  do_classification(input_text, mapping, total, model, save, isFile)