In [None]:
# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
# !apt-get update -qq 2>&1 > /dev/null
# !apt-get -y install -qq google-drive-ocamlfuse fuse
# from google.colab import auth
# auth.authenticate_user()
# from oauth2client.client import GoogleCredentials
# creds = GoogleCredentials.get_application_default()
# import getpass
# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# vcode = getpass.getpass()
# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# %cd /content
# !mkdir drive
# %cd drive
# !mkdir MyDrive
# %cd ..
# %cd ..
# !google-drive-ocamlfuse /content/drive/MyDrive

------
### Library setup and mounting Google Drive


In [None]:
# Mount the user's Google Drive at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install --quiet transformers
!pip install --quiet fasttext
# !pip install --quiet tensorflow==1.15.0
# restart after running the above line

In [None]:
import numpy as np
import pandas as pd
import re

from tqdm.notebook import tqdm
import torch
import fasttext
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import nltk
import json
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn import preprocessing
import transformers

# Fetch the NLTK data packages used by sent_tokenize ("punkt") and
# stopwords.words ("stopwords").
# NOTE(review): recent NLTK releases may additionally require "punkt_tab" — confirm against the installed version.
nltk.download("punkt") 
nltk.download('stopwords')
# Register tqdm's pandas integration so DataFrame.progress_apply is available.
tqdm.pandas()

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# NOTE(review): RANDOM_SEED is defined here but no visible cell passes it to a random number generator.
RANDOM_SEED = 42

# Global plotting defaults: seaborn theme, a custom colour palette and the default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

In [None]:
# # download pretrained ELMo embeddings
# %mkdir /content/module
# # Download the module, and uncompress it to the destination folder. 
# !curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC /content/module/
# # instantiate pretrained ELMO model
# pretrained_elmo = hub.Module("/content/module/", trainable=False)

In [None]:
# initialize fasttext model
# The loaded model is bound to the module-level name `model`, which
# get_sentence_vectors reads when it is called with fasttext=True.
# NOTE(review): the path is relative to the current working directory — confirm it resolves in the runtime being used.
path_to_embeddings = "./embeddings/fasttext-embeddings/cc.en.300.bin"
model = fasttext.load_model(path_to_embeddings)

In [None]:
# PRE processing utility methods
# remove \n\n
def truncate_newline(text):
  """Return `text` with every double newline ("\\n\\n") replaced by one space."""
  return " ".join(text.split("\n\n"))

# download JSON based file in dictionary format
def get_json(path_to_file):
  """Open `path_to_file` for reading and return its parsed JSON content."""
  with open(path_to_file, 'r') as source:
    return json.load(source)

# save dict -> json
def to_json(save_location, file_name, dict_: dict):
  """Write `dict_` as JSON to the file `{save_location}/{file_name}`.

  Returns:
    True when the file was written. If anything raises while opening or
    writing the file, the exception object is returned (not raised), so
    callers that need to know about failures must inspect the result.
  """
  try:
    target = f'{save_location}/{file_name}'
    with open(target, 'w') as sink:
      json.dump(dict_, sink)
  except Exception as err:
    return err
  else:
    return True

# split text articles into sentences and append to a main dictionary
def generate_sentences(data: pd.DataFrame, key='id', value='text'):
  """Tokenise the text of every row into sentences, keyed by the row identifier.

  Args:
    data: frame that is iterated row by row.
    key: column whose value becomes the dictionary key.
    value: column whose value is passed to `sent_tokenize`.

  Returns:
    dict mapping each `key` value to the list of sentences of that row,
    with sentences that are exactly '.' removed.
  """
  article_sentences = {}
  rows = tqdm(data.iterrows(), total=data.shape[0])
  for _, record in rows:
    doc_id, body = record[key], record[value]
    kept = []
    for sentence in sent_tokenize(body):
      # a lone '.' carries no content
      if sentence != '.':
        kept.append(sentence)
    article_sentences[doc_id] = kept
  return article_sentences

def modifiy_gen_sentences(dict_: dict):
  """Wrap every value of `dict_` in a one-entry dict under the key 'text'."""
  return {key: {'text': value} for key, value in dict_.items()}

# define stopwords
# English stop-word list from NLTK; read as a module-level global by remove_stopwords.
stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(sen):
  """Join the items of `sen` that are not in the global `stop_words` with single spaces."""
  kept = []
  for token in sen:
    if token in stop_words:
      continue
    kept.append(token)
  return " ".join(kept)

# preprocessing
def preprocess(sentences: list):
  """Normalise raw sentences before they are turned into vectors.

  Each sentence has every character outside a-z / A-Z replaced by a space,
  is lower-cased, and has the words found in the global stop-word list
  removed via `remove_stopwords`.

  Args:
    sentences: list of sentence strings.

  Returns:
    list of cleaned sentence strings, in the same order as `sentences`.
  """
  # Nothing to clean; also avoids relying on the `.str` accessor of an empty Series.
  if len(sentences) == 0:
    return []
  # remove punctuations, numbers and special characters.
  # regex=True is required: without it pandas >= 2.0 treats the pattern as a
  # literal string and nothing is replaced.
  clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
  # make alphabets lowercase
  clean_sentences = [s.lower() for s in clean_sentences]
  # remove stopwords from the sentences
  clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
  return clean_sentences

# generate sentence vectors
def get_sentence_vectors(sentences: list, word_embeddings: dict, dim: int=100, fasttext=False, elmo=False):
  """Build one vector per sentence by averaging the vectors of its words.

  Args:
    sentences: list of strings; each is split on whitespace into words.
    word_embeddings: mapping word -> vector, used only when `fasttext` is
      falsy. Words missing from the mapping contribute a zero vector of
      length `dim`.
    dim: length of the zero vector used for unknown words and for sentences
      that contain no words.
    fasttext: when truthy, word vectors come from the module-level `model`
      (via `model.get_word_vector`) instead of `word_embeddings`. Note that
      inside this function the name refers to this flag, not the module.
    elmo: accepted for signature compatibility; not used.

  Returns:
    list with one vector per input sentence, in input order.
  """
  sentence_vectors = []
  for sentence in sentences:
    words = sentence.split()
    if not words:
      # Empty or whitespace-only sentence: emit a proper zero vector so every
      # entry has the same shape (summing an empty list would give scalar 0).
      sentence_vectors.append(np.zeros((dim,)))
      continue
    if fasttext:
      word_vectors = [model.get_word_vector(w) for w in words]
    else:
      word_vectors = [word_embeddings.get(w, np.zeros((dim,))) for w in words]
    # The +0.001 in the denominator is kept as-is so existing results do not change.
    sentence_vectors.append(sum(word_vectors) / (len(words) + 0.001))
  return sentence_vectors

# compute the similarity scores between sentences
def calc_pairwise_similarity(sentences: list, sentence_vectors: list, dim: int=100):
  """Compute the cosine-similarity matrix between sentence vectors.

  Args:
    sentences: the sentences; only their count is used.
    sentence_vectors: one vector of length `dim` per sentence.
    dim: expected length of every sentence vector.

  Returns:
    float64 array of shape (len(sentences), len(sentences)) whose [i][j]
    entry is the cosine similarity of vectors i and j, with the diagonal
    left at 0.
  """
  n = len(sentences)
  # similarity matrix
  sim_mat = np.zeros([n, n])
  if n < 2:
    # With fewer than two sentences there are no off-diagonal entries to fill.
    return sim_mat
  # Stack the vectors once and compute all pairs in a single call instead of
  # one cosine_similarity call per (i, j) pair. The reshape fails if the
  # vectors do not hold exactly n * dim values.
  stacked = np.asarray(sentence_vectors[:n]).reshape(n, dim)
  sim_mat[:, :] = cosine_similarity(stacked)
  # a sentence is not compared with itself
  np.fill_diagonal(sim_mat, 0.0)
  return sim_mat

# split scores and ranked texts
def process_ranked_text(ranked_sentences: list):
  """Split a list of (score, text) pairs into a score list and a text list.

  Raises:
    ValueError: when the pairs cannot be unpacked into exactly two
      columns, which includes an empty `ranked_sentences`.
  """
  score_column, text_column = zip(*ranked_sentences)
  return list(score_column), list(text_column)

# processing method for process_batch
def process_instance(sentences: list, dim: int=100, fasttext=False, elmo=False):
  """Score sentences with PageRank over their similarity graph, best first.

  The sentences are cleaned with `preprocess`, vectorised with
  `get_sentence_vectors` (using the module-level `word_embeddings`), and
  compared pairwise; PageRank is then run on the resulting graph.

  Returns:
    list of (score, sentence) tuples sorted in descending order.
  """
  cleaned = preprocess(sentences)
  vectors = get_sentence_vectors(cleaned, word_embeddings, dim, fasttext, elmo)
  similarity = calc_pairwise_similarity(sentences, vectors, dim)
  graph = nx.from_numpy_array(similarity)
  scores = nx.pagerank(graph, max_iter=600)
  ranked_sentences = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
  ranked_sentences.sort(reverse=True)
  return ranked_sentences

# processing method for process_batch 'semeval'
def process_instance_semeval(sentences: list, dim: int=100, fasttext=False, elmo=False):
  """Score sentences with PageRank over their similarity graph, keeping input order.

  Same pipeline as `process_instance` (clean, vectorise with the
  module-level `word_embeddings`, pairwise similarity, PageRank), but the
  result is NOT sorted.

  Returns:
    list of (score, sentence) tuples in the order of `sentences`.
  """
  cleaned = preprocess(sentences)
  vectors = get_sentence_vectors(cleaned, word_embeddings, dim, fasttext, elmo)
  similarity = calc_pairwise_similarity(sentences, vectors, dim)
  graph = nx.from_numpy_array(similarity)
  scores = nx.pagerank(graph, max_iter=600)
  ranked_sentences = []
  for position, sentence in enumerate(sentences):
    ranked_sentences.append((scores[position], sentence))
  return ranked_sentences

In [None]:
# POST processing utility methods
# split sentences using the partition key
def split_content(List: list, key: int):
  """Cut `List` at index `key` and return the pair (items before, items from)."""
  head = List[:key]
  tail = List[key:]
  return head, tail

# sort sentences w.r.t to scores
def sort_sentences(sentences_: list, scores_: list):
  """Pair each sentence with the score at the same position, sorted descending.

  Returns:
    list of (score, sentence) tuples, highest first.
  """
  paired = [(scores_[position], sentence) for position, sentence in enumerate(sentences_)]
  paired.sort(reverse=True)
  return paired

# post batch processing
def post_batch_process(ranked_sentence_dict: dict, merge_dict: dict):
  """Split every merged pair back into its two articles and rank each one.

  Args:
    ranked_sentence_dict: maps a pair key of the form "<idA>_<idB>" to a dict
      with parallel 'score' and 'text' lists.
    merge_dict: maps the same pair key to a dict whose 'partition_no' is the
      index at which the lists are cut into the A part and the B part.

  Returns:
    (final_partitioned_dict, faulty_indices): the first maps each article
    id to {'score': [...], 'text': [...]} sorted by descending score; the
    second lists the pair keys for which any step raised.
  """
  final_partitioned_dict = {}
  # --DEBUG-- keys that could not be processed
  faulty_indices = []
  for pair_key, ranked in tqdm(ranked_sentence_dict.items(), total=len(ranked_sentence_dict)):
    try:
      cut = merge_dict[pair_key]['partition_no']
      # split scores, sentences & index
      scores_A, scores_B = split_content(ranked['score'], cut)
      text_A, text_B = split_content(ranked['text'], cut)
      index_A, index_B = pair_key.split('_')
      # sort each half independently
      scores_a, text_a = process_ranked_text(sort_sentences(text_A, scores_A))
      scores_b, text_b = process_ranked_text(sort_sentences(text_B, scores_B))
    except Exception:
      faulty_indices.append(pair_key)
      continue
    # attach both articles only once the whole pair was processed
    final_partitioned_dict[index_A] = {'score': scores_a, 'text': text_a}
    final_partitioned_dict[index_B] = {'score': scores_b, 'text': text_b}
  return final_partitioned_dict, faulty_indices

# merge 2 dictionaries of varied length
def merge_dictionaries(a, b):
  """Return a new dict holding the entries of `a` overlaid with those of `b`.

  On duplicate keys the value from `b` wins; neither input is modified.
  """
  merged = {**a}
  for key in b.keys():
    merged[key] = b[key]
  return merged

# get pairs
def get_pair_indices(data: pd.DataFrame, sep='_'):
  """Split the 'pair_id' of every row on `sep` into its two ids.

  Uses `progress_apply`, so `tqdm.pandas()` must have been called first.

  Returns:
    (id_1, id_2): two tuples holding the first and second id of every row.
  """
  def split_pair(row):
    return row['pair_id'].split(sep)

  index_pair = data.progress_apply(split_pair, axis=1)
  id_1, id_2 = zip(*index_pair.to_list())
  return id_1, id_2

# generate pair dict
def generate_pair_dict(target_dict: dict, index_a: list, index_b: list):
  """Collect the ranked texts of every (index_a[i], index_b[i]) pair.

  Args:
    target_dict: maps an article id to a dict that has a 'text' entry.
    index_a: first article id of every pair.
    index_b: second article id of every pair.

  Returns:
    (proposed_dict, faulty_pairs): the first maps 'sr_<position>' to a dict
    with index_a, index_b, text_a and text_b (scores are not attached);
    the second lists "<id_1>_<id_2>" for every pair whose lookup raised.
  """
  proposed_dict = {}
  # debug: pairs that could not be assembled
  faulty_pairs = []
  numbered_pairs = enumerate(zip(index_a, index_b))
  for serial_nos, (id_1, id_2) in tqdm(numbered_pairs, total=len(index_a)):
    try:
      entry = {
          'index_a': id_1,
          'index_b': id_2,
          'text_a': target_dict[id_1]['text'],
          'text_b': target_dict[id_2]['text'],
      }
      proposed_dict[f'sr_{serial_nos}'] = entry
    except Exception:
      faulty_pairs.append(f"{id_1}_{id_2}")
  return proposed_dict, faulty_pairs

# apply sorting to ranked data to get top-k most preferable sentences
def reorder_ranked_data(ranked_dict: dict):
  """Re-sort the 'score' / 'text' lists of every entry by descending score.

  Args:
    ranked_dict: maps a key to a dict with parallel 'score' and 'text' lists.

  Returns:
    dict with the same keys, each mapping to {'score': [...], 'text': [...]}
    ordered from highest to lowest score.
  """
  reordered_dict = {}
  for key, value in tqdm(ranked_dict.items(), total=len(ranked_dict)):
    scores, sentences = value['score'], value['text']
    paired = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
    paired.sort(reverse=True)
    sorted_scores, sorted_ranked_texts = process_ranked_text(paired)
    reordered_dict[key] = {'score': sorted_scores, 'text': sorted_ranked_texts}
  return reordered_dict

# extract top_k pairs
def extract_top(pair: dict, tokenizer: transformers.AutoTokenizer, k=12, MAX_TOKENS=512):
  """Pick up to `k` sentences per document under a shared token budget.

  Sentences are taken alternately from `pair['text_a']` and
  `pair['text_b']` (position 0 of A, position 0 of B, position 1 of A, ...).
  Each sentence is encoded with `tokenizer.encode` (truncated to 512
  tokens); the number of encoded tokens is charged against `MAX_TOKENS`,
  which is shared by both documents. A sentence that only partly fits is cut
  to the remaining budget, and selection stops as soon as a sentence would
  contribute zero tokens.

  Returns:
    dict mapping pair['index_a'] and pair['index_b'] to the list of decoded
    sentences kept for that document.
  """
  result = {
      pair["index_a"]: [],
      pair["index_b"]: []
  }
  # (id field, sentence-list field) for document A, then document B
  sides = (("index_a", "text_a"), ("index_b", "text_b"))
  token_count = 0
  budget_exhausted = False
  for rank in range(0, k):
    for index_field, text_field in sides:
      ranked_sentences = pair[text_field]
      if rank >= len(ranked_sentences):
        # this document has no sentence at this rank
        continue
      tokens = tokenizer.encode(
          ranked_sentences[rank],
          max_length=512,
          truncation=True,
      )
      # tokens still available for approved(text_a) + approved(text_b)
      remaining = max(0, MAX_TOKENS - token_count)
      take = min(len(tokens), remaining)
      if take == 0:
        budget_exhausted = True
        break
      result[pair[index_field]].append(tokenizer.decode(tokens[0:take], skip_special_tokens=True))
      token_count += take
    if budget_exhausted:
      break
  return result

# --------------------------------------------------------------------------------
# Extract top k sentence from ranked hyperpartisan dataset
def extract_top_hyp(text: list, tokenizer: transformers.AutoTokenizer, k=12, MAX_TOKENS=512):
  """Pick up to `k` leading sentences of `text` under a token budget.

  Each sentence is encoded with `tokenizer.encode` (truncated to 512
  tokens) and its token count is charged against `MAX_TOKENS`. A sentence
  that only partly fits is cut to the remaining budget; selection stops
  once a sentence would contribute zero tokens.

  Returns:
    list of the decoded sentences that were kept, in input order.
  """
  result = []
  token_count = 0
  for rank in range(0, k):
    if rank >= len(text):
      # no more sentences available
      break
    tokens = tokenizer.encode(
        text[rank],
        max_length=512,
        truncation=True,
    )
    remaining = max(0, MAX_TOKENS - token_count)
    take = min(len(tokens), remaining)
    if take == 0:
      break
    result.append(tokenizer.decode(tokens[0:take], skip_special_tokens=True))
    token_count += take
  return result

In [None]:
# glove methods
# apply textrank on single instance
def process_single(sentences: list, word_embeddings: dict, dim: int=100, fasttext=False, elmo=False):
  """Rank the sentences of one document with PageRank, best first.

  The sentences are cleaned with `preprocess`, vectorised with
  `get_sentence_vectors` using the `word_embeddings` passed in, compared
  pairwise, and scored by running PageRank (default settings) on the
  similarity graph.

  Returns:
    list of (score, sentence) tuples sorted in descending order.
  """
  cleaned = preprocess(sentences)
  vectors = get_sentence_vectors(cleaned, word_embeddings, dim, fasttext, elmo)
  similarity = calc_pairwise_similarity(sentences, vectors, dim)
  graph = nx.from_numpy_array(similarity)
  scores = nx.pagerank(graph)
  ranked_sentences = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
  ranked_sentences.sort(reverse=True)
  return ranked_sentences

# process entire data
def process_batch(article_sentences: dict, word_embeddings, dim=100, file_name='preprocess.json', flag=False, fasttext=False, elmo=False, save_dir=None, save_every=500):
  """Rank the sentences of every article and checkpoint the results to JSON.

  Args:
    article_sentences: maps an article key to a dict whose 'text' entry is
      the list of sentences to rank.
    word_embeddings: kept for signature compatibility. It is not forwarded:
      `process_instance` / `process_instance_semeval` read the module-level
      `word_embeddings` themselves.
    dim: embedding dimension forwarded to the per-article ranking function.
    file_name: name of the checkpoint file.
    flag: when truthy use `process_instance` (sorted output), otherwise
      `process_instance_semeval` (input order).
    fasttext: forwarded to the per-article ranking function.
    elmo: forwarded to the per-article ranking function.
    save_dir: directory for the checkpoint file. When None, the module-level
      `path_to_dir` is used, which must then be defined by the time the
      first checkpoint is written.
    save_every: write a checkpoint each time this many articles have been
      processed successfully; a falsy value disables checkpointing.

  Returns:
    (overall_dict, faulty_ids): the first maps each successfully processed
    key to {'score': [...], 'text': [...]}; the second lists the keys for
    which processing raised (the exception is printed).
  """
  count = 0
  overall_dict = dict()
  faulty_ids = []
  for key, value in tqdm(article_sentences.items(), total=len(article_sentences.keys())):
    try:
      if flag:
        sorted_sentences = process_instance(value['text'], dim, fasttext, elmo)
      else:
        sorted_sentences = process_instance_semeval(value['text'], dim, fasttext, elmo)
      scores, ranked_texts = process_ranked_text(sorted_sentences)
      count += 1
    except Exception as e:
      print(e)
      faulty_ids.append(key)
      continue
    overall_dict[key] = dict(
        score=scores,
        text=ranked_texts
    )
    # save a checkpoint after every `save_every` successfully processed articles
    if save_every and count % save_every == 0:
      # Resolved lazily so runs that never reach a checkpoint do not need a save location.
      checkpoint_dir = path_to_dir if save_dir is None else save_dir
      saved = to_json(checkpoint_dir, file_name, overall_dict)
      # to_json returns the exception instead of raising; surface it rather than losing it.
      if saved is not True:
        print(f"checkpoint save failed after {count} articles: {saved!r}")

  return overall_dict, faulty_ids

# USE BERT IF RESULTS ARE NOT BETTER.

In [None]:
# download and unzip glove embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip

# Extract word vectors
# Each non-empty line of the GloVe text file is "<word> <v1> <v2> ...";
# the result maps every word to its float32 vector.
word_embeddings = {}
path_to_glove_embeddings = "./embeddings/glove.6B.100d.txt"
# `with` guarantees the file is closed even if a line fails to parse.
with open(path_to_glove_embeddings, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
len(word_embeddings)

In [None]:
# initialize huggingface tokenizer
# Module-level tokenizer for the 'xlm-roberta-base' checkpoint; it is the
# `tokenizer` argument passed to extract_top / extract_top_hyp in the
# (currently commented-out) processing cells.
tokenizer = transformers.AutoTokenizer.from_pretrained('xlm-roberta-base')

-----
#### PROCESSING HYPERPARTISAN DATA

In [None]:
# path_to_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/part_0.csv'
# data_df = pd.read_csv(path_to_file)

In [None]:
# # # retrieve sentences w.r.t ids
# article_sentences = generate_sentences(data_df, key='idx', value='text')
# article_sentences = modifiy_gen_sentences(article_sentences)

In [None]:
# # the dataset was split into 3 equal parts and processed in parallel notebooks for faster execution.
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data"
# processed_dict, faulty_unique_ids = process_batch(article_sentences, word_embeddings, dim=100, file_name='preprocessed_data.json')
# # convert keys to str
# processed_dict = {str(key): value for key, value in processed_dict.items()}
# # save dictionary to json 
# to_json(path_to_dir, 'preproc_part2.json', processed_dict)

-----

In [None]:
# # join and concat ranked data using a topk extraction
# ranked_data = [{'idx': key, 'text': " ".join(extract_top_hyp(value['text'], tokenizer))} for key, value in tqdm(processed_dict.items(), total=len(processed_dict.keys()))]
# processed_df = pd.DataFrame(ranked_data, columns=['idx', 'text'])       # processed & ranked data
# target_df = data_df[['idx', 'title', 'bias', 'hyperpartisan']]          # target dataframe
# # change dtype
# target_df['idx'] = target_df['idx'].astype('str')     
# processed_df = pd.merge(processed_df, target_df, on='idx')              # merged dataframe
# # save processed dataframe
# # processed_data__.to_csv(f'{path_to_dir}/final_data.csv', index=False) # 💀

`Data files are merged and saved as file_data.csv`

---



------
#### PROCESS TEST DATA

In [None]:
# # main semeval data
# path_to_semeval_data = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/test_v1.csv"
# main_df = pd.read_csv(path_to_semeval_data)
# # combination of lang1 and lang2 pairs required
# combination_df = pd.DataFrame({
#     'url1_lang': main_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})['url1_lang'].tolist(),
#     'url2_lang': main_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})['url2_lang'].tolist()
# })
# # resultant dataframe
# target_df = pd.merge(main_df, combination_df)
# # check for the retrieved data
# target_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})

In [None]:
# # non-english data
# title_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/title/merged_test_title.csv')
# desc_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/extra_text/merged_test_extra_text.csv')
# text_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/merged_test_text_v1.csv')
# # english data
# title_en_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/english_test/title_en.csv')
# desc_en_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/english_test/extra_text_en.csv')
# text_en_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/english_test/doc_en.csv')
# # non-english data
# print(title_df.shape, desc_df.shape, text_df.shape)
# # english data
# print(title_en_df.shape, desc_en_df.shape, text_en_df.shape)
# # rename data
# title_df.rename(columns={'text': "title"}, inplace=True, errors="raise")
# desc_df.rename(columns={'text': 'extra_text'}, inplace=True, errors="raise")
# # # concat files
# main_title_df = pd.concat([title_df, title_en_df])
# main_desc_df = pd.concat([desc_df, desc_en_df])
# main_text_df = pd.concat([text_df, text_en_df])
# print(main_title_df.shape, main_desc_df.shape, main_text_df.shape)
# # merge DataFrames
# main_df = pd.merge(main_title_df, main_desc_df, on='id', how='outer')
# main_df = pd.merge(main_df, main_text_df, on='id', how='outer')   
# main_df.fillna('', inplace=True)
# # save file
# main_df.to_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/translated_test_v1.csv', index=False)
# main_df.shape

In [None]:
# # get index pairs
# index_a, index_b = get_pair_indices(target_df)

# # path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data/merged_translation_data.csv'
# path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/translated_test_v1.csv'
# data_df = pd.read_csv(path_to_merged_file)
# data_df['id'] = data_df['id'].astype(str)

# data_df.fillna(' ', inplace=True)

# # merge title, desc and text
# data_df['new_text'] = data_df.progress_apply(
#     lambda row: row['title'] + '.' + row['extra_text'] + '.' + row['text'],
#     axis = 1
# )

In [None]:
# # get index pairs
# index_a, index_b = get_pair_indices(target_df)

# # path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data/merged_translation_data.csv'
# path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data_test/translated_test_v1.csv'
# data_df = pd.read_csv(path_to_merged_file)
# data_df['id'] = data_df['id'].astype(str)

# data_df.fillna(' ', inplace=True)

# # merge title, desc and text
# data_df['new_text'] = data_df.progress_apply(
#     lambda row: row['title'] + '.' + row['extra_text'] + '.' + row['text'],
#     axis = 1
# )

# data_df.drop(['title', 'extra_text', 'text'], axis=1, inplace=True)
# data_df.rename(columns={
#     'new_text': 'text'
# }, inplace=True, errors="raise")

# # retrieve sentences w.r.t ids
# article_sentences = generate_sentences(data_df)

# # # get final dict for textrank processing
# Final_dict = dict()
# faulty_ids = []
# for serial_nos, (id_1, id_2) in tqdm(enumerate(zip(index_a, index_b)), total=len(index_a)):
#   try:
#     # retrieve text respective to the ids 
#     text_a = article_sentences[id_1]
#     text_b = article_sentences[id_2]
#     # assign the unique id
#     unique_id = f'{id_1}_{id_2}'
#     # extend the list of sentence from text_a and text_b
#     text_a.extend(text_b)
#     # attach to the Final_dict
#     Final_dict[unique_id] = dict(
#         text = text_a,
#         partition_no = len(text_a)-len(text_b)
#     )
#   except Exception as e:
#     faulty_ids.append((id_1, id_2))

In [None]:
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv"
# processed_dict, faulty_unique_ids = process_batch(Final_dict, word_embeddings, dim=300, fasttext=True, file_name="preprocess_test_v2.json")
# # Save file as JSON.
# to_json(save_location=path_to_dir, file_name="ranked_merged_test_dict_v2_elmo.json", dict_=processed_dict) # processed dict
# to_json(save_location=path_to_dir, file_name="merged_&_pos_test_data_v2_elmo.json", dict_=Final_dict) # Final dict

In [None]:
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv"
# # load final and processed dict
# ranked_dict = get_json(f"{path_to_dir}/ranked_merged_test_dict_v1.json")
# final_dict = get_json(f"{path_to_dir}/merged_&_pos_test_data_v1.json")

# # apply post ranking processes
# ranked_data, faulty_indices = post_batch_process(ranked_dict, final_dict)
# # apply sorting to each individual articles 
# ranked_data = reorder_ranked_data(ranked_data)

# # save ranked_data
# to_json(path_to_dir, 'ranked_semeval_test_data_v2.json', ranked_data)

In [None]:
# # ranked data
# path_to_ranked_data = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv/ranked_semeval_test_data_v2.json'
# ranked_dict = get_json(path_to_ranked_data)

# # main semeval data
# path_to_semeval_data = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/test_v1.csv"
# main_df = pd.read_csv(path_to_semeval_data)
# # combination of lang1 and lang2 pairs required
# combination_df = pd.DataFrame({
#     'url1_lang': main_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})['url1_lang'].tolist(),
#     'url2_lang': main_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})['url2_lang'].tolist()
# })
# # resultant dataframe
# target_df = pd.merge(main_df, combination_df)
# # check for the retrieved data
# target_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})

In [None]:
# # generate paired data
# index_a, index_b = get_pair_indices(target_df)
# final_dict, faulty_pairs = generate_pair_dict(ranked_dict, index_a, index_b)

# # apply top k extraction method to entire data.
# stratified_final_dict = defaultdict(list)
# for serial_nos in tqdm(final_dict.keys(), total=len(final_dict.keys())):
#   # keys = range(sr_0, sr_1, ...., sr_N)
#   data = final_dict[serial_nos]
#   # extracting top_k sentences (sorted)
#   output = extract_top(data, tokenizer, k=12, MAX_TOKENS=512)

#   stratified_final_dict['index_a'].append(data['index_a'])
#   stratified_final_dict['index_b'].append(data['index_b'])
#   stratified_final_dict['text_a'].append(" ".join(output[data['index_a']]))
#   stratified_final_dict['text_b'].append(" ".join(output[data['index_b']])) 
  
# # create final dataframe
# finalised_df = pd.DataFrame(stratified_final_dict)
# finalised_df['pair_id'] = finalised_df.progress_apply(
#     lambda row: f"{row['index_a']}_{row['index_b']}",
#     axis=1
# )
# finalised_df.drop(['index_a', 'index_b'], axis=1, inplace=True)
# finalised_df = finalised_df[['pair_id', 'text_a', 'text_b']]
# print(f"finalised_df: {finalised_df.shape}")

# # merge target and finalised dataframes based on "pair_id".
# main_df = pd.merge(target_df, finalised_df, on='pair_id', how='outer') 
# main_df.dropna(inplace=True)
# print(f"main_df: {main_df.shape}")

In [None]:
# # remove duplicates
# main_df = main_df[~main_df.duplicated()]
# save_loc = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv/final_test_v5.csv"
# main_df.to_csv(save_loc, index=False)

------
#### INITIAL PROCESSING
* Merge all the sentences in both text articles ✔️
* Also try implementing the similarity score used over [here](https://assistant.raxter.io/projects/semeval-2022-multilingual-document-similarity1640024433930/literatures/2101064231640675351177) ❌

In [None]:
# # main semeval data
# path_to_semeval_data = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/combined_train_v1.csv"
# main_df = pd.read_csv(path_to_semeval_data)
# # combination of lang1 and lang2 pairs required
# combination_df = pd.DataFrame({
#     'url1_lang': ['de', 'de', 'en', 'es', 'tr', 'pl', 'fr', 'ar'],
#     'url2_lang': ['de', 'en', 'en', 'es', 'tr', 'pl', 'fr', 'ar']
# })
# # resultant dataframe
# target_df = pd.merge(main_df, combination_df)
# # check for the retrieved data
# target_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})

In [None]:
# # get index pairs
# index_a, index_b = get_pair_indices(target_df)

# # path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data/merged_translation_data.csv'
# path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translated_train_processed_v2.csv'
# data_df = pd.read_csv(path_to_merged_file)
# data_df['id'] = data_df['id'].astype(str)

# # retrieve sentences w.r.t ids
# article_sentences = generate_sentences(data_df)

# # # get final dict for textrank processing
# Final_dict = dict()
# faulty_ids = []
# for serial_nos, (id_1, id_2) in tqdm(enumerate(zip(index_a, index_b)), total=len(index_a)):
#   try:
#     # retrieve text respective to the ids 
#     text_a = article_sentences[id_1]
#     text_b = article_sentences[id_2]
#     # assign the unique id
#     unique_id = f'{id_1}_{id_2}'
#     # extend the list of sentence from text_a and text_b
#     text_a.extend(text_b)
#     # attach to the Final_dict
#     Final_dict[unique_id] = dict(
#         text = text_a,
#         partition_no = len(text_a)-len(text_b)
#     )
#   except Exception as e:
#     faulty_ids.append((id_1, id_2))

In [None]:
# # do not run this 💀❌ 
# data_df = pd.read_csv("/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translated_train_v2.csv")
# # fill nan with ''
# data_df.fillna('', inplace=True)
# # append '.' with title and description.
# data_df['title'] = data_df.progress_apply(
#     lambda row: row['title'] if row['title'].endswith('.') or row['title'].endswith('?') else row['title']+".",
#     axis=1
# )
# data_df['desc'] = data_df.progress_apply(
#     lambda row: row['desc'] if row['desc'].endswith('.') or row['desc'].endswith('?') else row['desc']+".",
#     axis=1
# )
# # merge title + desc + text
# data_df['merged_text'] = data_df.progress_apply(
#     lambda row: row['title'] +" "+ row['desc'] +" "+ row['text'],
#     axis=1
# )
# data_df['merged_text'] = data_df.progress_apply(
#     lambda row: row['merged_text'].strip(),
#     axis=1
# )
# # drop unnecessary columns and rename merged_text
# data_df.drop(['title', 'desc', 'text'], axis=1, inplace=True)
# data_df.rename(columns={
#     'merged_text': 'text'
# }, inplace=True, errors='raise')
# # calculate word count
# data_df['word_count'] = data_df.progress_apply(
#     lambda row: len(row['text'].split(' ')),
#     axis=1
# ) 
# ####################### EXTRA PROCESSING BASED ON CONTEXT ######################
# # selecting rows with word_count > 30 (might contain some anomalies)
# factored_data = data_df[data_df['word_count'] > 30]
# # drop the analysed index 
# # rem_index = [3968, 4036, 4075, 4106, 4342, 4503, 7494, 7590, 4579,]
# rem_index = [1483881256, 1484028040, 1548289807, 1555147462, 1548275840, 1484144810, 1484084344, 1510785328, 1483931224, 1484312115, 1484026894, 1584880806, 1484084350, 1484285197, 1484084356, 1484137472, 1484285188, 1484137480, 1484109196, 1484299489, 1484339239]
# final_df = factored_data.drop(factored_data[factored_data['id'].isin(rem_index)].index.tolist())
# final_df.drop(['word_count'], axis=1, inplace=True)
# final_df.reset_index(drop=True, inplace=True)
# # save file
# # data_df.to_csv("/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translated_train_processed.csv", index=False)
# final_df.to_csv("/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translated_train_processed_v2.csv", index=False)

In [None]:
# # get index pairs
# index_a, index_b = get_pair_indices(target_df)

# # path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translation_data/merged_translation_data.csv'
# path_to_merged_file = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/translated_train_processed_v2.csv'
# data_df = pd.read_csv(path_to_merged_file)
# data_df['id'] = data_df['id'].astype(str)

# # retrieve sentences w.r.t ids
# article_sentences = generate_sentences(data_df)

# # # get final dict for textrank processing
# Final_dict = dict()
# faulty_ids = []
# for serial_nos, (id_1, id_2) in tqdm(enumerate(zip(index_a, index_b)), total=len(index_a)):
#   try:
#     # retrieve text respective to the ids 
#     text_a = article_sentences[id_1]
#     text_b = article_sentences[id_2]
#     # assign the unique id
#     unique_id = f'{id_1}_{id_2}'
#     # extend the list of sentence from text_a and text_b
#     text_a.extend(text_b)
#     # attach to the Final_dict
#     Final_dict[unique_id] = dict(
#         text = text_a,
#         partition_no = len(text_a)-len(text_b)
#     )
#   except Exception as e:
#     faulty_ids.append((id_1, id_2))

In [None]:
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv"
# processed_dict, faulty_unique_ids = process_batch(Final_dict, word_embeddings, dim=300, fasttext=True, file_name="preprocess_v2.json")
# # Save file as JSON.
# to_json(save_location=path_to_dir, file_name="ranked_merged_dict_v2.json", dict_=processed_dict) # processed dict
# to_json(save_location=path_to_dir, file_name="merged_&_pos_data_v2.json", dict_=Final_dict) # Final dict

---
#### POST PROCESSING
* split the sentence according to partition index ✔️
* sort the ranked sentences according to derived scores ✔️
* sort and rank individual lists of sentences and scores ✔️
* apply top-K extraction to pairs of index ✔️

In [None]:
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv"
# # load final and processed dict
# ranked_dict = get_json(f"{path_to_dir}/ranked_merged_dict_v2.json")
# final_dict = get_json(f"{path_to_dir}/merged_&_pos_data_v2.json")

# # apply post ranking processes
# ranked_data, faulty_indices = post_batch_process(ranked_dict, final_dict)
# # apply sorting to each individual articles 
# ranked_data = reorder_ranked_data(ranked_data)

# # save ranked_data
# to_json(path_to_dir, 'ranked_semeval_data_v2.json', ranked_data)

-----

In [None]:
# # ranked data
# path_to_ranked_data = '/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv/ranked_semeval_data_v2.json'
# ranked_dict = get_json(path_to_ranked_data)

# # main semeval data
# path_to_semeval_data = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/combined_train_v1.csv"
# main_df = pd.read_csv(path_to_semeval_data)
# # combination of lang1 and lang2 pairs required
# combination_df = pd.DataFrame({
#     'url1_lang': ['de', 'de', 'en', 'es', 'tr', 'pl', 'ar', 'fr'],
#     'url2_lang': ['de', 'en', 'en', 'es', 'tr', 'pl', 'ar', 'fr']
# })
# # resultant dataframe
# target_df = pd.merge(main_df, combination_df)
# # check for the retrieved data
# target_df.groupby(['url1_lang','url2_lang']).size().reset_index().rename(columns={0:'count'})

In [None]:
# # generate paired data
# index_a, index_b = get_pair_indices(target_df)
# final_dict, faulty_pairs = generate_pair_dict(ranked_dict, index_a, index_b)

# # apply top k extraction method to entire data.
# stratified_final_dict = defaultdict(list)
# for serial_nos in tqdm(final_dict.keys(), total=len(final_dict.keys())):
#   # keys = range(sr_0, sr_1, ...., sr_N)
#   data = final_dict[serial_nos]
#   # extracting top_k sentences (sorted)
#   output = extract_top(data, tokenizer, k=12, MAX_TOKENS=512)

#   stratified_final_dict['index_a'].append(data['index_a'])
#   stratified_final_dict['index_b'].append(data['index_b'])
#   stratified_final_dict['text_a'].append(" ".join(output[data['index_a']]))
#   stratified_final_dict['text_b'].append(" ".join(output[data['index_b']])) 
  
# # create final dataframe
# finalised_df = pd.DataFrame(stratified_final_dict)
# finalised_df['pair_id'] = finalised_df.progress_apply(
#     lambda row: f"{row['index_a']}_{row['index_b']}",
#     axis=1
# )
# finalised_df.drop(['index_a', 'index_b'], axis=1, inplace=True)
# finalised_df = finalised_df[['pair_id', 'text_a', 'text_b']]
# print(f"finalised_df: {finalised_df.shape}")

# # merge target and finalised dataframes based on "pair_id".
# main_df = pd.merge(target_df, finalised_df, on='pair_id', how='outer') 
# main_df.dropna(inplace=True)
# print(f"main_df: {main_df.shape}")

In [None]:
# # remove duplicates
# main_df = main_df[~main_df.duplicated()]
# save_loc = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv/final_data_v2.csv"
# main_df.to_csv(save_loc, index=False)

-----
#### PREPROCESS SEMEVAL DATASET

In [None]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

# spaCy English pipeline, used by lemmatize_text.
# NOTE(review): the parse/tag/entity keyword overrides look like a spaCy v2-era idiom —
# confirm the installed spaCy version accepts them.
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
# word tokenizer used by remove_stopwords
# NOTE(review): `tokenizer` is also the name handed to extract_top / extract_top_hyp in other
# cells — confirm this rebinding does not clash with them when cells are run in order.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# take the negations 'no' and 'not' out of the stopword list so remove_stopwords keeps them in the text
stopword_list.remove('no')
stopword_list.remove('not')

In [None]:
# load the article-pair data that the cells below clean (they read the 'text_a' / 'text_b' columns)
main_df = pd.read_csv('../dataset/final_data.csv')
# discard every row that has a missing value in any column (mutates main_df in place)
main_df.dropna(inplace=True)

In [None]:
# preprocessing methods
# replace non definite punctuations
def replace_content(text):
  """Return `text` with curly single/double quotation marks turned into a plain apostrophe."""
  # every typographic quote variant maps onto the same ASCII apostrophe
  curly_quotes = ('“', '”', '‘', '’')
  quote_table = str.maketrans({mark: '\'' for mark in curly_quotes})
  return text.translate(quote_table)

# download JSON based file in dictionary format
def get_json(path_to_file):
  """Read a JSON file from disk and return its parsed content.

  Args:
    path_to_file: path of the JSON file to read.

  Returns:
    The object produced by `json.load` for that file.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the file does not contain valid JSON.
  """
  # pin UTF-8 so non-ASCII content decodes the same way regardless of the platform default encoding
  with open(path_to_file, 'r', encoding='utf-8') as openfile:
    return json.load(openfile)

# Load the contraction lookup from JSON when this cell runs; it is bound as the
# default `contraction_mapping` argument of expand_contractions.
CONTRACTION_MAP = get_json('../dataset/contractions.json')

# replace accented characters with their plain ASCII equivalents
def remove_accented_chars(text):
  """Strip diacritics: decompose to NFKD, then drop every code point that has no ASCII form."""
  decomposed = unicodedata.normalize('NFKD', text)
  # 'ignore' silently discards the combining marks and any other non-ASCII character
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

# remove content within brackets
def remove_text_within_brackets(text):
  """Delete every `[...]` and `(...)` span (brackets included) from `text`.

  Matching is non-greedy, so each opening bracket pairs with the nearest
  closing one; an opening bracket with no closing partner is left untouched.
  Square brackets are handled first, then parentheses.
  """
  # raw strings: '\[' and '\(' are invalid escape sequences in a normal string literal
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'\(.*?\)', '', text)
  return text

# remove URLs
def remove_hyperlinks(text):
  """Delete URLs from `text`.

  A URL is anything starting with `http://`, `https://` or `www.` and running
  up to the next whitespace character. The surrounding whitespace is kept.
  """
  # raw string: '\S' and '\.' are invalid escape sequences in a normal string literal
  return re.sub(r'https?://\S+|www\.\S+', '', text)

# expand contraction
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
  """Replace every contraction found in `text` with its expansion.

  Args:
    text: the string to process.
    contraction_mapping: dict of contraction -> expansion. Keys are treated as
      literal text and matched case-insensitively.

  Returns:
    `text` with each matched contraction expanded. The first character of the
    matched text is kept as-is, so the capitalisation of the original word survives.
  """
  # an empty mapping would compile to a pattern that matches the empty string everywhere
  if not contraction_mapping:
    return text

  # case-insensitive fallback lookup, so a match like "i'm" still resolves a key spelled "I'm"
  lowered_mapping = {key.lower(): value for key, value in contraction_mapping.items()}
  # longest keys first: regex alternation takes the first alternative that matches,
  # so a short key must not shadow a longer key that starts with it
  ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
  # keys are literals, not regex fragments, hence re.escape
  contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                    flags=re.IGNORECASE|re.DOTALL)

  def expand_match(contraction):
    match = contraction.group(0)
    first_char = match[0]
    expanded_contraction = contraction_mapping.get(match)
    if not expanded_contraction:
      expanded_contraction = lowered_mapping.get(match.lower())
    if expanded_contraction is None:
      # no usable expansion (e.g. exotic case folding): leave the text unchanged
      return match
    return first_char + expanded_contraction[1:]

  return contractions_pattern.sub(expand_match, text)

# remove every character except letters, whitespace and (unless remove_digits is set) digits
def remove_special_characters(text, remove_digits=False):
  """Delete every character that is not an ASCII letter, whitespace or a digit.

  Args:
    text: the string to clean.
    remove_digits: when True, digits are deleted as well.

  Returns:
    The cleaned string; whitespace is preserved as-is.
  """
  # 'A-Z', not 'A-z': the latter range also spans the punctuation [ \ ] ^ _ `
  # that sits between 'Z' and 'a' in ASCII, which would then survive the cleanup
  pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

# lemmatization
def lemmatize_text(text):
  """Return `text` with every token swapped for its lemma, tokens joined by single spaces.

  A token whose lemma is the '-PRON-' placeholder keeps its surface form instead.
  """
  lemmas = []
  for token in nlp(text):
    lemma = token.lemma_
    lemmas.append(token.text if lemma == '-PRON-' else lemma)
  return ' '.join(lemmas)

# remove stopwords 
def remove_stopwords(text, is_lower_case=False):
  """Drop stopwords from `text`.

  Args:
    text: the string to filter; it is split with the module-level `tokenizer`.
    is_lower_case: set to True when `text` is already lower-cased, so tokens are
      compared as-is; otherwise each token is lower-cased for the comparison only.

  Returns:
    The remaining tokens (original casing, stripped) joined by single spaces.
  """
  # set membership is O(1); testing against the module-level list would rescan it for every token
  stopword_lookup = set(stopword_list)
  tokens = [token.strip() for token in tokenizer.tokenize(text)]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in stopword_lookup]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_lookup]
  return ' '.join(filtered_tokens)

def normalize_corpus(corpus, contraction_expansion=False,
                     accented_char_removal=False, text_lower_case=True,
                     text_lemmatization=False, special_char_removal=True,
                     stopword_removal=True, remove_digits=True,
                     remove_brac_content=True, remove_urls=True):
  """Clean every document in `corpus` and return the cleaned documents as a list.

  Steps run in this fixed order; each flag switches one step on or off:
    1. accented_char_removal  - strip diacritics
    2. contraction_expansion  - expand contractions
    3. text_lower_case        - lower-case the text
    4. (always)               - turn line breaks into spaces
    5. text_lemmatization     - lemmatize
    6. remove_urls            - delete hyperlinks
    7. remove_brac_content    - delete [...] and (...) spans
    8. special_char_removal   - pad selected punctuation with spaces, then delete
                                special characters (and digits when remove_digits)
    9. (always)               - collapse runs of spaces
   10. stopword_removal       - drop stopwords

  Args:
    corpus: sized iterable of strings (its `len` feeds the progress bar).
    remove_digits: only consulted when special_char_removal is True.

  Returns:
    A list with one normalized string per input document, in input order.
  """
  # Compiled once for the whole corpus. '-' has to be escaped inside the class:
  # an unescaped '(-)' is read as the range '(' .. ')', which leaves hyphens
  # un-isolated so hyphenated words get fused when the hyphen is deleted below.
  special_char_pattern = re.compile(r'([{.()\-!}])')

  normalized_corpus = []
  for doc in tqdm(corpus, total=len(corpus)):
    # remove accented characters
    if accented_char_removal:
      doc = remove_accented_chars(doc)
    # expand contractions
    if contraction_expansion:
      doc = expand_contractions(doc)
    # lowercase the text
    if text_lower_case:
      doc = doc.lower()
    # turn line breaks into a space
    # (the character class also contains a literal '|', so pipes become spaces too)
    doc = re.sub(r'[\r|\n|\r\n]+', ' ', doc)
    # lemmatize text
    if text_lemmatization:
      doc = lemmatize_text(doc)
    # remove URLs
    if remove_urls:
      doc = remove_hyperlinks(doc)
    # remove bracketed / parenthesised content
    if remove_brac_content:
      doc = remove_text_within_brackets(doc)
    # remove special characters and/or digits
    if special_char_removal:
      # pad the listed punctuation with spaces first so the words around it stay separated
      doc = special_char_pattern.sub(" \\1 ", doc)
      doc = remove_special_characters(doc, remove_digits=remove_digits)
    # remove extra whitespace
    doc = re.sub(' +', ' ', doc)
    # remove stopwords
    if stopword_removal:
      doc = remove_stopwords(doc, is_lower_case=text_lower_case)

    normalized_corpus.append(doc)

  return normalized_corpus

In [None]:
# replace curly quotation marks with plain apostrophes in both article columns;
# applied column-wise, so no per-row Series is built just to read a single field
for text_column in ('text_a', 'text_b'):
  main_df[text_column] = main_df[text_column].progress_apply(replace_content)

  0%|          | 0/4060 [00:00<?, ?it/s]

  0%|          | 0/4060 [00:00<?, ?it/s]

In [None]:
# find this in the corpus
# For 18 years, Merkel had led the party, opening it up to the left and causing it to take on, in an adapted form, many ideas previously considered the realm of the Social Democrats (SPD). Read more: AfD: What you need to know about Germany\'s far-right party \'AKK\'s achievement\' In the beginning, AKK tried to harness the conservative wing — without success. AKK, as she is also known, said she would also be stepping down as party leader — for, as she also announced publicly a couple of hours later, "in my opinion, the party leader and its candidate for chancellor should be one and the same person." On the contrary, it became increasingly clear under AKK that the CDU was deeply divided: One wing sought a return to quintessentially conservative politics, and one, like Angela Merkel, preferred to focus on the socio-liberal center ground. AKK has been criticized from within her own party for showing a lack of leadership It was already clear from the internal party debate that the CDU was at a crossroads. For one,

In [None]:
# apply normalization to the corpus
# Both calls rely on the normalize_corpus defaults: lower-casing, URL / bracketed-text /
# special-character / digit removal and stopword removal are on; contraction expansion,
# accent removal and lemmatization are off.
# The columns are overwritten, so re-running this cell normalizes already-normalized text.
main_df['text_a'] = normalize_corpus(main_df['text_a'].to_list())
main_df['text_b'] = normalize_corpus(main_df['text_b'].to_list())

  0%|          | 0/4060 [00:00<?, ?it/s]

  0%|          | 0/4060 [00:00<?, ?it/s]

In [None]:
# persist the normalized pairs (row index not written)
# NOTE(review): the input was read from the relative path '../dataset/final_data.csv' while this
# writes to an absolute Colab Drive path — confirm both resolve in the environment this runs in.
main_df.to_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/text_rank_data/Longform_textrank_adv/final_data_preproc.csv', index=False)

-----
#### <u>FNC-1 Dataset</u>
To load the FNC-1 dataset, use the copy uploaded to huggingface.co🤗 ([link](https://huggingface.co/datasets/nid989/FNC-1)).
* dataset name: nid989/FNC-1

In [None]:
# APPLY TEXTRANK TO FNC_1 DATA

In [None]:
# data_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/FNC_1/train_bodies.csv')
# # rename columns according to the method
# data_df.rename(columns={
#     "Body ID": "id",
#     "articleBody": "text",
# }, inplace=True, errors="raise")
# print(f"shape: {data_df.shape}")
# # change type to string.
# data_df['id'] = data_df['id'].astype('str')
# # retrieve sentences
# article_sentences = generate_sentences(data_df)
# article_sentences = modifiy_gen_sentences(article_sentences)

# # apply textrank to article sentences
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/FNC_1"
# processed_dict, faulty_unique_ids = process_batch(article_sentences, word_embeddings, dim=100, file_name="fnc_ranked_sentences_data.json")

# # save data
# to_json(save_location=path_to_dir, file_name="fnc_ranked_sentences_data.json", dict_=processed_dict) # processed dictionary
# to_json(save_location=path_to_dir, file_name="fnc_article_sentences_data.json", dict_=article_sentences) # article sentences

# # join and concat ranked data using a topk extraction
# ranked_data = [{'idx': key, 'text': " ".join(extract_top_hyp(value['text'], tokenizer))} for key, value in tqdm(processed_dict.items(), total=len(processed_dict.keys()))]
# processed_df = pd.DataFrame(ranked_data, columns=['idx', 'text'])       # processed & ranked data
# # rename processed data
# processed_df.rename(columns={
#     'idx': 'Body ID',
#     'text': 'articleBody'
# }, inplace=True, errors="raise")
# # change dtype of `Body ID` to int64
# processed_df['Body ID'] = processed_df['Body ID'].astype(int)
# # load train_stances data
# stance_df = pd.read_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/FNC_1/train_stances.csv')
# # merge both train_stances and processed train_bodies data
# fnc_data = pd.merge(stance_df, processed_df, on='Body ID', how='inner')
# # apply label encoding (integer class ids, not one-hot) to Stance classes
# le = preprocessing.LabelEncoder()
# le.fit(fnc_data['Stance'])
# fnc_data['Stance'] = le.transform(fnc_data['Stance'])
# # reorder and shuffle data
# fnc_data = fnc_data[['Headline', 'articleBody', 'Stance', 'Body ID']]
# fnc_data = fnc_data.sample(frac=1).reset_index(drop=True)

# # save to dir.
# fnc_data.to_csv(f'{path_to_dir}/fnc_1.csv', index=False)

------
#### <u> HYPERPARTISAN DATASET </u>

In [None]:
# path_to_file = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/sample.csv"
# data_df = pd.read_csv(path_to_file)
# # concat preprocessed title and text
# data_df['main_text'] = data_df.progress_apply(
#     lambda row: row['preproc_title'] + "." + row['preproc_text'],
#     axis=1
# )
# # select 15000 samples
# data_df = data_df.sample(15000)
# # drop unnecessary columns
# data_df.drop(['title', 'text', 'preproc_title', 'preproc_text', 'idx'], axis=1, inplace=True)
# data_df['id'] = np.arange(data_df.shape[0])
# data_df = data_df[['id', 'main_text', 'hyperpartisan', 'bias']]
# # rename columns according to the method
# data_df.rename(columns={
#     "main_text": "text",
# }, inplace=True, errors="raise")

# print(f"shape: {data_df.shape}")
# # change type to string.
# data_df['id'] = data_df['id'].astype('str')
# # retrieve sentences
# article_sentences = generate_sentences(data_df)
# article_sentences = modifiy_gen_sentences(article_sentences)

# # apply textrank to article sentences
# path_to_dir = "/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/"
# processed_dict, faulty_unique_ids = process_batch(article_sentences, word_embeddings, dim=300, fasttext=True, file_name="pan_ranked_sentences_data.json")

# # save data
# data_df.to_csv(f"{path_to_dir}sampled_data.csv", index=False)
# to_json(save_location=path_to_dir, file_name="pan_ranked_sentences_data.json", dict_=processed_dict) # processed dictionary
# to_json(save_location=path_to_dir, file_name="pan_article_sentences_data.json", dict_=article_sentences) # article sentences

# # get saved processed dict
# processed_dict = get_json(f"{path_to_dir}pan_ranked_sentences_data.json")

# sorted_processed_dict = dict()

# for key in tqdm(processed_dict.keys(), total=len(processed_dict.keys())):
#   text = processed_dict[key]['text']
#   scores = processed_dict[key]['score']
#   sorted_text_list = sorted(((scores[i],s) for i,s in enumerate(text)), reverse=True)
#   ranked_scores = [score for score, _ in sorted_text_list]
#   ranked_texts = [text for _, text in sorted_text_list]
#   sorted_processed_dict[key] = {
#       'score': ranked_scores,
#       'text': ranked_texts
#   }

# # join and concat ranked data using a topk extraction
# ranked_data = [{'idx': key, 'text': " ".join(extract_top_hyp(value['text'], tokenizer))} for key, value in tqdm(sorted_processed_dict.items(), total=len(sorted_processed_dict.keys()))]
# processed_df = pd.DataFrame(ranked_data, columns=['idx', 'text'])       # processed & ranked data
# # change dtype of `idx` to int64
# processed_df['idx'] = processed_df['idx'].astype(int)
# # save processed & concatenated data
# processed_df.to_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/final_pan_data.csv', index=False)

# # extract processed data
# pan_data = pd.read_csv("/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/final_pan_data.csv")

# data_df_copy = data_df.copy(deep = True)
# data_df_copy.drop(['text'], axis=1, inplace=True)
# data_df_copy.rename(columns={'id': 'idx'}, inplace=True, errors="raise")
# data_df_copy['idx'] = data_df_copy['idx'].astype(int)

# merged_pan_data = pd.merge(pan_data, data_df_copy, on='idx', how='inner')
# merged_pan_data.to_csv('/content/drive/MyDrive/SemEval-Akash_Nidhir_Rishikesh/SemEval 2022 - Multilingual Document Similarity/Semeval-Task-8/dataset/external_data/hyperpartisan_dataset/Data/final_pan_data.csv', index=False)