- Alon Moses 308177815
- Guy Attia 305743437

# Import

In [None]:
import zipfile
from os import mkdir, path
from shutil import copyfile
import json as js
import lzma
import scipy as scp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

In [None]:
# Root data directory; the paper folders ('20k_papers', '10_journals') are resolved relative to it.
input_path = path.join('..', 'data')

# Preprocessing
These preprocessing steps were executed offline, because the data files are too large to be loaded onto the Google Colab server.

## Metadata File
- Unzip the zip file
- Load the original metadata csv file
- Extract the IDs of the 20K most recent papers from the metadata
- Save the metadata of the 20K papers in a new csv file

In [None]:
# Unpack the metadata archive into the current working directory.
with zipfile.ZipFile('metadata.csv.zip', mode='r') as metadata_archive:
    metadata_archive.extractall(path='.')

# Load the full, unfiltered metadata table.
df_metadata = pd.read_csv(filepath_or_buffer='metadata.csv')

In [None]:
# Parse the publication dates so the papers can be ordered chronologically.
df_metadata['publish_time'] = pd.to_datetime(df_metadata['publish_time'], format='%Y-%m-%d')
# Papers without a parsed PDF JSON file cannot be analysed, so drop them.
df_metadata.dropna(subset=['pdf_json_files'], inplace=True)
df_metadata.sort_values(by='publish_time', ascending=False, inplace=True)

# Keep the 20K most recent papers.
# 'journal' is retained because the journal-similarity analysis further down
# reads df_metadata_20k['journal'] (assumes metadata.csv provides that column).
# .copy() detaches the subset so the column assignments below operate on an
# independent frame rather than on a slice of df_metadata.
df_metadata_20k = df_metadata.iloc[:20000].loc[
    :, ['sha', 'pdf_json_files', 'publish_time', 'journal']
].copy()

# Both fields may hold several ';'-separated values; use the first of each.
df_metadata_20k['file_path'] = df_metadata_20k['pdf_json_files'].str.split(';').str[0]
df_metadata_20k['first_sha'] = df_metadata_20k['sha'].str.split(';').str[0]
df_metadata_20k.to_csv('metadata_20k.csv', index=False)

## Text Files
- Unzip the zip file containing all the papers documents
- Create new clean folder to hold the relevant 20K papers
- Iterate over the 20K metadata file and copy the relevant files to the new folder

In [None]:
# Unpack all parsed paper documents into the current working directory.
with zipfile.ZipFile('document_parses.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Create the target folder only if it is missing, so re-running the cell
# does not fail with FileExistsError.
if not path.isdir('20k_papers'):
    mkdir('20k_papers')

# Copy each selected paper's JSON file, naming the copy after its first sha.
for src_file, first_sha in zip(df_metadata_20k['file_path'], df_metadata_20k['first_sha']):
    copyfile(src=src_file, dst=path.join('20k_papers', f'{first_sha}.json'))

# Compression Function

In [None]:
def compress_str(text):
    """Encode `text` as UTF-8 and return its LZMA-compressed bytes."""
    encoded = bytes(text, encoding='utf-8')
    compressed = lzma.compress(encoded)
    return compressed

## Papers representation
Represent each paper as an object holding its ID, Abstract text, and Body text.

The class also provides a method that compresses the paper's text.

In [None]:
class Paper:
    """A single paper loaded from its parsed JSON file.

    Attributes set by the constructor:
      paper_id          -- id used to locate '<paper_id>.json'
      bib_entries       -- the 'bib_entries' object of the JSON file, as-is
      abstract          -- abstract paragraphs joined with newlines
      body_text         -- body paragraphs joined with newlines
      text_to_compress  -- abstract + newline + body text
      publish_time, journal -- only when `paper_metadata` is given
    `compress_text()` additionally sets `compressed_text`.
    """

    def __init__(self, paper_id=None, paper_metadata=None, file_path='20k_papers'):
        """Load the paper's JSON file from `input_path/file_path`.

        Args:
            paper_id: id of the paper; takes precedence over the metadata id.
            paper_metadata: optional mapping (e.g. dict or pandas Series) with
                'publish_time' and 'journal', and 'paper_id' when `paper_id`
                is not passed.
            file_path: folder (relative to `input_path`) holding the JSON files.

        Raises:
            ValueError: if no paper id can be determined.
        """
        # A pandas Series has no unambiguous truth value, so test for the
        # presence of metadata explicitly instead of relying on truthiness.
        has_metadata = paper_metadata is not None and len(paper_metadata) > 0

        # Get paper-id
        if paper_id:
            self.paper_id = paper_id
        elif has_metadata:
            self.paper_id = paper_metadata['paper_id']
        else:
            raise ValueError('No paper id')

        # Extract info from json file.
        # Use the resolved id: the `paper_id` argument is None when the id was
        # taken from the metadata.
        json_path = path.join(input_path, file_path, f'{self.paper_id}.json')
        with open(json_path, encoding='utf-8') as file:
            content = js.load(file)

        self.bib_entries = content['bib_entries']
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])
        self.text_to_compress = f'{self.abstract}\n{self.body_text}'

        # Extract info from metadata file
        if has_metadata:
            self.publish_time = paper_metadata['publish_time']
            self.journal = paper_metadata['journal']

    def compress_text(self):
        """Compress `text_to_compress`, store it on the instance and return it."""
        self.compressed_text = compress_str(self.text_to_compress)
        return self.compressed_text

    def __repr__(self):
        """Short summary: id plus the first 200 chars of abstract and body."""
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

In [None]:
# For example:
# Load one paper by its id from the default papers folder.
example_paper = Paper(paper_id='0a0befc62d8c3da285acc99c45f614dbdccaad10')
# Compress its text; the result is also stored on example_paper.compressed_text.
example_paper.compress_text()
# Printing uses Paper.__repr__: the id plus the start of the abstract and body.
print(example_paper)

## Distance function
Function that calculates Normalized Compression Distance (NCD)

In [None]:
def ncd(paper1_obj, paper2_obj):
    """Return the Normalized Compression Distance between two papers.

    NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), where C(.) is the
    length in bytes of the compressed text and xy is paper 1's text followed
    directly by paper 2's text.
    """
    # Text of both papers, paper 1 first.
    joint_text = paper1_obj.text_to_compress + paper2_obj.text_to_compress

    # Compressed size of each paper on its own ...
    size_1 = len(paper1_obj.compress_text())
    size_2 = len(paper2_obj.compress_text())
    # ... and of the two texts together.
    size_joint = len(compress_str(joint_text))

    smaller, larger = sorted((size_1, size_2))
    return (size_joint - smaller) / larger

# For example: compute the NCD between two papers loaded from the default papers folder.
example_paper1 = Paper(paper_id='00a1a252ed0905f37c5c4e0526a3ea448daf5cbf')
example_paper2 = Paper(paper_id='00a1e7d4d8608d301d802f413ccb3804edcb60ed')
ncd(example_paper1, example_paper2)

# ML - Solution

## Similarity by Journal
Our hypothesis is that compressed papers from the same journal will be closer to each other than compressed papers from different journals.
To assess our hypothesis we will do the following:
- Filter papers from specific journals
- Calculate the distances of the papers within the same journals (for each one)
- Calculate the distances of the papers from different journals (for each combination)
- Compare and analyze the distances results

### Filter Papers
Because of the long running times, and because comparing every possible combination of papers is unnecessary, we filtered the dataset down to papers from 10 specific journals containing around 27 papers each.

In [None]:
# Extract the 10 relevant journal names
# Paper counts of the journals ranked 91st-100th by number of papers;
# these are the 10 journals the analysis is run on.
journal_counts = df_metadata_20k['journal'].value_counts().head(100).tail(10)
rel_10_journals = journal_counts.index.values
print('Journals we are going to test:')
display(journal_counts)

# Restrict the metadata to papers from those journals and expose the original
# row label of each paper as the 'paper_index' column.
rel_cols = ['sha', 'journal', 'publish_time', 'first_sha']
mask = df_metadata_20k['journal'].isin(rel_10_journals)
df_metadata_10_journals = (
    df_metadata_20k.loc[mask, rel_cols]
    .reset_index()
    .rename(columns={'index': 'paper_index'})
)
display(df_metadata_10_journals.head(2))

In [None]:
# Copy only the papers from the 10 relevant journals for easier upload to colab
# ** Should be used offline only

def copy_10_journals_files():
    """Copy the JSON files of the selected journals' papers into '10_journals'.

    The folder is created under `input_path` when missing, and files that were
    already copied are left untouched.
    """
    source_dir = path.join(input_path, '20k_papers')
    target_dir = path.join(input_path, '10_journals')

    if not path.isdir(target_dir):
        mkdir(target_dir)

    for sha in df_metadata_10_journals['first_sha']:
        file_name = f'{sha}.json'
        target_file = path.join(target_dir, file_name)
        if path.isfile(target_file):
            continue  # copied on a previous run
        copyfile(src=path.join(source_dir, file_name), dst=target_file)


copy_10_journals_files()

### Distance Between 2 Papers

In [None]:
# Map each paper_index to the first sha of that paper (the JSON file name stem).
sha_lookup_table = dict(zip(df_metadata_10_journals['paper_index'],
                            df_metadata_10_journals['first_sha']))

In [None]:
def calc_2_papers_dist(paper_index_1, paper_index_2):
    """Metric callback for `pairwise_distances`: NCD between two papers.

    Args:
        paper_index_1, paper_index_2: rows whose first element is a paper
            index, i.e. a key of `sha_lookup_table`.

    Returns:
        0 when both rows refer to the same paper, NaN when either paper file
        cannot be loaded, otherwise the NCD of the two papers.
    """
    index_1, index_2 = paper_index_1[0], paper_index_2[0]
    if index_1 == index_2:
        return 0

    paper_id_1 = sha_lookup_table[index_1]
    paper_id_2 = sha_lookup_table[index_2]
    try:
        paper_obj_1 = Paper(paper_id=paper_id_1, file_path='10_journals')
        paper_obj_2 = Paper(paper_id=paper_id_2, file_path='10_journals')
    except (OSError, ValueError, KeyError):
        # Missing, unreadable or malformed paper file. Report the distance as
        # unknown (NaN) rather than 0: a 0 would count the pair as identical
        # papers and drag the distance statistics down.
        return float('nan')
    return ncd(paper_obj_1, paper_obj_2)

### Similarity in the same journal
Find the distance between the papers of the same journal for each one of the relevant journals.

In [None]:
def calc_distances_in_journal(df_jour):
    """
    Build the long-form table of distances between the papers of one journal.

    `df_jour` must have a 'paper_index' column. Every unordered pair of
    distinct papers appears once, as a row with the columns
    'paper1_index', 'paper2_index' and 'distance'.
    """
    paper_indexes = df_jour['paper_index'].values

    # Square matrix of distances between every two papers of the journal.
    dist_matrix = pairwise_distances(paper_indexes.reshape(-1, 1), metric=calc_2_papers_dist)
    square = pd.DataFrame(dist_matrix, index=paper_indexes, columns=paper_indexes)

    # Keep only the cells strictly below the diagonal: the diagonal and the
    # upper triangle repeat information already present there.
    upper_with_diagonal = np.triu(np.ones(square.shape)).astype(bool)
    below_diagonal = ~upper_with_diagonal

    long_form = square.where(below_diagonal).stack().reset_index()
    long_form.columns = ['paper1_index', 'paper2_index', 'distance']
    return long_form

In [None]:
# Iterate over the 10 journals and find their internal distances
# For each of the 10 journals, the distances between its own papers.
journals_dist_dict = {
    jour: calc_distances_in_journal(
        df_metadata_10_journals.loc[df_metadata_10_journals['journal'] == jour]
    )
    for jour in rel_10_journals
}

### Similarity between different journals
Find the distance between the papers of different journals for each combination of the relevant journals.

In [None]:
def calc_distances_between_journal(df_jour1, df_jour2):
    """
    Build the long-form table of distances between the papers of two journals.

    Both frames must have a 'paper_index' column. Each paper of `df_jour1` is
    paired with each paper of `df_jour2`; the result has the columns
    'paper1_index', 'paper2_index' and 'distance'.
    """
    row_indexes = df_jour1['paper_index'].values
    col_indexes = df_jour2['paper_index'].values

    dist_matrix = pairwise_distances(X=row_indexes.reshape(-1, 1),
                                     Y=col_indexes.reshape(-1, 1),
                                     metric=calc_2_papers_dist)

    long_form = (
        pd.DataFrame(dist_matrix, index=row_indexes, columns=col_indexes)
        .stack()
        .reset_index()
    )
    long_form.columns = ['paper1_index', 'paper2_index', 'distance']
    return long_form

In [None]:
# Split the metadata once per journal instead of re-filtering inside the
# double loop below.
papers_by_journal = {
    jour: df_metadata_10_journals.loc[df_metadata_10_journals['journal'] == jour]
    for jour in rel_10_journals
}

# Distances for every ordered pair of different journals, keyed by
# (journal 1, journal 2). The journals are the ones selected in
# `rel_10_journals`.
between_journals_dist_dict = {}
for jour1 in rel_10_journals:
    for jour2 in rel_10_journals:
        if jour1 == jour2:
            continue
        between_journals_dist_dict[(jour1, jour2)] = calc_distances_between_journal(
            papers_by_journal[jour1], papers_by_journal[jour2]
        )

### Results Analysis
Analyze the distance results by plotting the distance distributions and the differences between in-journal and between-journal distances.

In [None]:
# One figure per journal: histogram (left) and KDE (right) of its in-journal distances.
for jour_name, jour_dist_df in journals_dist_dict.items():
    distances = jour_dist_df['distance']
    fig, (hist_ax, kde_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))

    distances.plot.hist(ax=hist_ax)
    hist_ax.set_title(f'Distance histogram for journal: {jour_name}')

    distances.plot.kde(ax=kde_ax)
    kde_ax.set_title(f'Distance KDE for journal: {jour_name}')

    plt.show()

In [None]:
# One figure per journal pair: histogram (left) and KDE (right) of the distances between them.
for (first_jour, second_jour), jour_dist_df in between_journals_dist_dict.items():
    distances = jour_dist_df['distance']
    fig, (hist_ax, kde_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))

    distances.plot.hist(ax=hist_ax)
    hist_ax.set_title(f'Distance histogram between journals: {first_jour} and {second_jour}')

    distances.plot.kde(ax=kde_ax)
    kde_ax.set_title(f'Distance KDE between journals: {first_jour} and {second_jour}')

    plt.show()

In [None]:
# Mean distance of every journal pair and of every single journal.
between_journal_means = [
    pair_df['distance'].mean() for pair_df in between_journals_dist_dict.values()
]
in_journal_means = [
    single_df['distance'].mean() for single_df in journals_dist_dict.values()
]

my_dict = {'in_journal_means': in_journal_means,
           'between_journal_means': between_journal_means}

# Independent two-sample t-test between the two groups of means.
p_value = scp.stats.ttest_ind(a=in_journal_means, b=between_journal_means).pvalue

# Side-by-side box plots of the two groups, with the p-value in the title.
fig, ax = plt.subplots(figsize=(14, 7))
ax.boxplot(my_dict.values())
ax.set_xticklabels(my_dict.keys())
plt.title(f'Distance means for in & between journal papers (T-test p-value = {p_value:.5f})')
plt.show()

<b>As we assumed, the average distance between papers from the same journal is significantly (p_value < 0.05) lower than the average distance between papers from different journals.</b>

### Similarity by joint bib references
Find the distance between papers that reference the same bibliography.

- Create a new bib_entries dictionary that maps each file to a list of related files in the dataset. The files in this list satisfy the following conditions:
  - The publish times of the two files are at most 100 days apart.
  - The two files have at least one bibliography entry in common (a reference to another paper, story, etc.)

** We limit the number of files in each file's similarity list to 1000 due to runtime requirements.

In [None]:
# compressed_dict: paper id -> {'publish_time': ..., 'bib_entries': [titles]}
# caclculate_distance_dict: paper id -> ids of previously seen papers that were
#   published within 100 days of it and share at least one bib title with it.
compressed_dict = {}
caclculate_distance_dict = {}

for _, row in df_metadata_20k.iterrows():
    paper_id = row['first_sha']
    paper = Paper(paper_id=paper_id)
    publish_time = row['publish_time']

    bib_list = [bib_entry['title'] for bib_entry in paper.bib_entries.values()]
    compressed_dict[paper_id] = {'publish_time': publish_time, 'bib_entries': bib_list}
    caclculate_distance_dict[paper_id] = []

    # Set of this paper's titles, for constant-time membership tests.
    bib_titles = set(bib_list)

    examined = 0
    for compare_paper_id, compare_info in compressed_dict.items():
        # Compare full dates: subtracting the timestamps gives the real gap in
        # days, whereas `.day` is only the day of the month (1-31) and could
        # never exceed 100.
        days_apart = abs((publish_time - compare_info['publish_time']).days)
        if days_apart > 100:
            continue
        examined += 1
        if compare_paper_id != paper_id and any(
            title in bib_titles for title in compare_info['bib_entries']
        ):
            caclculate_distance_dict[paper_id].append(compare_paper_id)
        # Runtime cap: look at no more than 1000 candidates per paper.
        if examined >= 1000:
            break

In [None]:
# Persist the related-papers mapping. The json module is imported under the
# alias `js` in this notebook, so the bare name `json` is not defined.
with open('bib_relations.json', 'w') as f:
    f.write(js.dumps(caclculate_distance_dict))

## Read bib_relations.json

In [None]:
# Load the related-papers mapping written by the previous step.
with open('bib_relations.json') as relations_file:
    related_files_dict = js.load(relations_file)

##### Create a dictionary keyed by file sha, where each value is the list of distances between the key's paper and the papers listed for it in bib_relations.json.

In [None]:
def create_distance_dict(files_dict, low_bound_of_joint, high_bound_of_joint):
    """Compute NCD distances between papers and their related papers.

    Args:
        files_dict: maps a paper sha to the list of shas related to it.
        low_bound_of_joint, high_bound_of_joint: inclusive bounds on the
            length of a paper's related list; papers outside them are skipped.

    Returns:
        dict mapping a paper sha to the list of its distances to its related
        papers. Collection stops once a list holds more than 50 distances and
        once the dict holds more than 10 papers.
    """
    distance_dict = {}
    for file_1_sha, related_shas in files_dict.items():
        if not low_bound_of_joint <= len(related_shas) <= high_bound_of_joint:
            continue

        # Load the paper only after it passed the filter, so skipped papers
        # cost no file read.
        paper_1_obj = Paper(paper_id=file_1_sha)
        file_1_distances = []
        for file_2_sha in related_shas:
            paper_2_obj = Paper(paper_id=file_2_sha)
            file_1_distances.append(ncd(paper_1_obj, paper_2_obj))
            if len(file_1_distances) > 50:
                break
        distance_dict[file_1_sha] = file_1_distances
        if len(distance_dict) > 10:
            break
    return distance_dict

In [None]:
# Consecutive values form [low, high] ranges on the number of related papers
# (papers sharing at least one bib reference) a paper must have to be used.
number_of_joint_bib_refs = [20, 30]
for low_bound, high_bound in zip(number_of_joint_bib_refs, number_of_joint_bib_refs[1:]):
    joined_bib_refs_distance_dict = create_distance_dict(related_files_dict,
                                                         low_bound_of_joint=low_bound,
                                                         high_bound_of_joint=high_bound)
    # Save the distances of this range to its own file.
    with open(f'joint_bib_distances_{low_bound}_{high_bound}.json', 'w') as out_file:
        js.dump(joined_bib_refs_distance_dict, out_file)

##### For the same files used for calculating distance above, calculate their distance with files which have no joint bib reference.

In [None]:
def create_disjoint_distance_dict(files_dict, joined_bib_refs_distance_dict):
    """Compute NCD distances between papers and papers unrelated to them.

    Args:
        files_dict: maps a paper sha to the list of shas related to it.
        joined_bib_refs_distance_dict: its keys are the papers to process.

    Returns:
        dict mapping each processed paper sha to the list of its distances to
        papers that are not in its related list. Collection stops once a list
        holds more than 50 distances.
    """
    distance_dict = {}
    for file_1_sha in joined_bib_refs_distance_dict:
        # Set for constant-time membership tests inside the inner loop.
        related_shas = set(files_dict[file_1_sha])
        paper_1_obj = Paper(paper_id=file_1_sha)
        file_1_distances = []
        for file_2_sha in files_dict:
            # A paper is never listed as related to itself, so it has to be
            # excluded explicitly; its distance to itself is not a distance to
            # an unrelated paper.
            if file_2_sha == file_1_sha or file_2_sha in related_shas:
                continue
            paper_2_obj = Paper(paper_id=file_2_sha)
            file_1_distances.append(ncd(paper_1_obj, paper_2_obj))
            if len(file_1_distances) > 50:
                break
        distance_dict[file_1_sha] = file_1_distances
    return distance_dict

In [None]:
# Distances to unrelated papers, for the same papers used in the joint-reference step.
disjoint_bib_refs_distance_dict = create_disjoint_distance_dict(
    related_files_dict,
    joined_bib_refs_distance_dict,
)
with open('disjoint_bib_distance.json', 'w') as out_file:
    js.dump(disjoint_bib_refs_distance_dict, out_file)

# Plot results

In [None]:
# Per-paper mean distance to related (joint) and unrelated (disjoint) papers.
joint_bib_reference_means = []
disjoint_bib_reference_means = []

for i, (file_sha, distances) in enumerate(joined_bib_refs_distance_dict.items()):
    disjoint_distances = disjoint_bib_refs_distance_dict[file_sha]

    mean_of_joint_bib_distances = sum(distances)/len(distances)
    joint_bib_reference_means.append(mean_of_joint_bib_distances)

    mean_of_disjoint_bib_distances = sum(disjoint_distances)/len(disjoint_distances)
    disjoint_bib_reference_means.append(mean_of_disjoint_bib_distances)

    # Overlay both distance distributions of this paper in one figure.
    plt.figure(figsize=(16, 4))
    plt.hist(distances, density=True, bins=30, label = f'Mean of distances of joint bib references = {mean_of_joint_bib_distances}')
    plt.hist(disjoint_distances, density=True, bins=30,label = f'Mean of distances of dis-joint bib references = {mean_of_disjoint_bib_distances}')
    plt.legend()
    plt.show()

    # Stop after the first 10 papers. The check comes after legend()/show() so
    # the 10th figure is finalised like the others.
    if i == 9:
        break

In [None]:
my_dict = {'joint_bib_papers': joint_bib_reference_means,
           'disjoint_bib_papers': disjoint_bib_reference_means}

# Test the two groups that are actually plotted below: the joint and the
# disjoint bib-reference means.
_, p_value = scp.stats.ttest_ind(a=joint_bib_reference_means, b=disjoint_bib_reference_means)

# Side-by-side box plots of the two groups, with the p-value in the title.
fig, ax = plt.subplots(figsize=(14, 7))
ax.boxplot(my_dict.values())
ax.set_xticklabels(my_dict.keys())
plt.title(f'Distance means between papers with joint bib reference & disjoint bib reference (T-test p-value = {p_value:.5f})')
plt.show()

<b>In this case, the difference in average distance between papers that reference similar bibliography and papers that do not is not significant (p_value >> 0.05).
Still, we can observe a somewhat lower distance between papers that reference similar bibliography than between papers that have no joint bib references.</b>