# Setting Up

In [None]:
# update your credentials if needed
# NOTE(review): --global rewrites the identity for every repository of the
# current user, and the values below are hardcoded — replace them with your
# own name/e-mail before running this cell.
!git config --global user.email "abduallahw10@gmail.com"
!git config --global user.name "Abdullah Al-Hayali"

In [None]:
# Check that we're in the right repo and branch, and that we are caught up
# before touching any files.
!git status

# Data Loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt

In [None]:
# Loading metadata

# Raw string literal: the Windows path contains the backslash pairs \M, \C
# and \D, which are invalid escape sequences in a normal string and raise a
# DeprecationWarning/SyntaxWarning on current Python versions (and are slated
# to become a SyntaxError). The resulting value is unchanged.
root_path = r"C:\My files\Courses\CIS6050\Data"
metadata_path = f'{root_path}/metadata.csv'

# Force the identifier columns to str so pandas keeps them as text instead of
# inferring a numeric dtype.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})
meta_df.head()

In [None]:
# Summarise meta_df: per-column non-null counts and dtypes

meta_df.info()

## Fetch JSON Files

In [None]:
# Recursively collect every *.json path found anywhere under root_path
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)  # number of JSON files found

In [None]:
# Peek at the first five paths to sanity-check the glob pattern
all_json[:5]

In [None]:
# First matched path
all_json[0]

In [None]:
# File reader class

class FileReader:
    """Parsed view of a single paper stored as a JSON file.

    Attributes populated by the constructor:
        paper_id:    value of the top-level 'paper_id' key.
        author_info: flat list holding each author's 'first' value followed
                     by their 'last' value, in file order.
        abstract:    'text' of every entry under 'abstract', newline-joined.
        body_text:   'text' of every entry under 'body_text', newline-joined.
    """

    def __init__(self, file_path):
        """Load ``file_path`` (UTF-8 JSON) and extract the fields listed above.

        Any key missing from the document surfaces as ``KeyError``; errors
        from opening or decoding the file propagate unchanged.
        """
        with open(file_path, 'r', encoding='utf-8') as handle:
            paper = json.load(handle)

        self.paper_id = paper['paper_id']

        # Authors: keep first/last as separate consecutive list items.
        self.author_info = [
            name
            for author in paper['metadata']['authors']
            for name in (author['first'], author['last'])
        ]

        # Collect both text sections before joining either of them.
        abstract_parts = [section['text'] for section in paper['abstract']]
        body_parts = [section['text'] for section in paper['body_text']]

        self.abstract = '\n'.join(abstract_parts)
        self.body_text = '\n'.join(body_parts)

    def __repr__(self):
        """Short summary; abstract and body are cut to their first 50 characters."""
        return f'AUTHOR: {self.author_info}; PAPER ID: {self.paper_id}; ABSTRACT: {self.abstract[:50]}; BODY TEXT: {self.body_text[:50]}'

# Smoke-test the reader on one file and print its summary.
# NOTE(review): index 2 is the third path in all_json, not the first,
# despite the variable name.
first_row = FileReader(all_json[2])
print(first_row)

In [None]:
# Filter non-complying JSONs: keep only the paths that FileReader can parse
# and whose abstract is non-empty.

from tqdm import tqdm

all_json_clean = list()

# Wrap the list itself rather than enumerate(...): tqdm can then read the
# list's length and show a total / ETA instead of a bare iteration counter.
for path in tqdm(all_json, desc='Filtering JSON files'):
    try:
        paper = FileReader(path)
    except Exception:
        continue  # invalid paper format, skip

    if len(paper.abstract) == 0:
        continue  # parsed fine but has no abstract text, skip

    all_json_clean.append(path)

# Downstream cells read `all_json`, so rebind it to the filtered list.
all_json = all_json_clean
len(all_json)

In [None]:
# Independent shallow copy of the filtered path list
new_jj = list(all_json)
len(new_jj)

In [None]:
import os

In [None]:
# Return the longest prefix of all list elements.
def commonprefix(m):
    """Return the longest prefix shared by every item of ``m``.

    The comparison is character-by-character, so the result may end in the
    middle of a path component. An empty ``m`` yields ``''``.
    """
    if not m:
        return ''
    # Only the lexicographically smallest and largest items need comparing:
    # every other item sorts between them and so shares their common prefix.
    lowest, highest = min(m), max(m)
    for pos, (left, right) in enumerate(zip(lowest, highest)):
        if left != right:
            return lowest[:pos]
    return lowest

In [None]:
# Leading string shared by all retained paths (compared character-wise)
commonprefix(new_jj)

In [None]:
# Number of retained paths about to be written out
len(new_jj)

In [None]:
# Persist the retained paths as a one-column CSV in the current working
# directory. index=False omits the row index; the column keeps pandas'
# default label (0), which is written as the header row.
df_test22 = pd.DataFrame(new_jj)
df_test22.to_csv('Paths_JSON_clean.csv', index=False)

# Pushing The Code

In [None]:
# Review what changed before staging anything
!git status

In [None]:
# Stage every change under the current directory.
# NOTE(review): this may also pick up generated files such as
# Paths_JSON_clean.csv if the notebook runs inside the repo — confirm that is intended.
!git add .

In [None]:
# Record the staged changes as a single commit
!git commit -m "Uploading data loaders"

In [None]:
# Confirm the state of the working tree after committing
!git status

In [None]:
# Publish the commit to the remote configured for the current branch
!git push