In [None]:
#Importing Required Libraries
import math
import re

import pandas as pd
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from langdetect import DetectorFactory, LangDetectException, detect

In [None]:
#Importing Springer Data
#Both Springer CSV files (metadata first, full text second) are read in one pass
springer_meta, springer_text = (
    pd.read_csv(csv_path)
    for csv_path in ("data/springer_data_meta.csv", "data/springer_data_text.csv")
)

In [None]:
#Inspecting the Imported Data Frames
#A cell only displays the value of its last expression, so both row counts are
#returned together as (metadata rows, text rows) instead of discarding the first one
len(springer_meta), len(springer_text)

In [None]:
#Inspecting the First Rows of the Springer Metadata
springer_meta.head()

In [None]:
#Inspecting the First Rows of the Springer Text Data
springer_text.head()

In [None]:
#Renaming Columns for Springer Text Data Frame
#The columns labelled '0' and '1' become 'doi' and 'text'
springer_text = springer_text.rename(
    columns={
        '0': 'doi',
        '1': 'text',
    }
)

In [None]:
#Inspecting the Springer Text Data After Renaming the Columns
springer_text.head()

In [None]:
#Joining Springer Text and Meta Data
#Rows are matched on the shared 'doi' column
springer_df = springer_meta.merge(springer_text, on='doi')

In [None]:
#Inspecting the First Rows of the Joined Springer Data Frame
springer_df.head()

In [None]:
#Listing the Columns of the Joined Springer Data Frame
springer_df.columns

In [None]:
#Importing Elsevier Data as Data Frames
#Both Elsevier CSV files (metadata first, full text second) are read in one pass
elsevier_meta, elsevier_text = (
    pd.read_csv(csv_path)
    for csv_path in ("data/elsevier_data_meta.csv", "data/elsevier_data_text.csv")
)

In [None]:
#Inspecting the First Rows of the Elsevier Metadata
elsevier_meta.head()

In [None]:
#Listing the Columns of the Elsevier Metadata
elsevier_meta.columns

In [None]:
#Inspecting the First Rows of the Elsevier Text Data
elsevier_text.head()

In [None]:
#Joining Elsevier Text and Metadata
#Left join: every metadata row is kept; 'prism:doi' (metadata) is matched to 'doi' (text)
elsevier_df = elsevier_meta.merge(
    elsevier_text,
    how='left',
    left_on='prism:doi',
    right_on='doi',
)

In [None]:
#Inspecting the First Rows of the Merged Elsevier Data Frame
elsevier_df.head()

In [None]:
#Listing the Columns of the Merged Elsevier Data Frame
elsevier_df.columns

In [None]:
#Dropping the Repeated DOI Column
#Only the 'doi' column that came from the text table is kept as the identifier
elsevier_df = elsevier_df.drop(columns='prism:doi')

In [None]:
#Inspecting the Elsevier Data Frame After Dropping 'prism:doi'
elsevier_df.head()

In [None]:
#Listing the Remaining Columns of the Elsevier Data Frame
elsevier_df.columns

In [None]:
#Uploading Jstor Text Data
#The .jsonl file is parsed line by line (lines=True)
jstor_text = pd.read_json(
    path_or_buf='data/jstor_data_text.jsonl',
    lines=True,
)

In [None]:
#Inspecting the First Rows of the Jstor Text Data Frame
jstor_text.head()

In [None]:
#Listing the Columns of the Jstor Text Data Frame
jstor_text.columns

In [None]:
#Uploading Jstor Metadata
#The metadata is read from its CSV export
jstor_meta = pd.read_csv(filepath_or_buffer="data/jstor_metadata.csv")

In [None]:
#Inspecting the First Rows of the Jstor Metadata
jstor_meta.head()

In [None]:
#Listing the Columns of the Jstor Metadata
jstor_meta.columns

In [None]:
#Joining Jstor Text and Meta Data
#Left join: every metadata row is kept; 'DOI' (metadata) is matched to 'doi' (text)
jstor_df = jstor_meta.merge(
    jstor_text,
    how='left',
    left_on='DOI',
    right_on='doi',
)

In [None]:
#Inspecting the First Rows of the Merged Jstor Data Frame
jstor_df.head()

In [None]:
#Listing the Columns of the Merged Jstor Data Frame
jstor_df.columns

In [None]:
#Dropping Duplicate DOI Column
#Only the lowercase 'doi' column that came from the text table is kept
jstor_df = jstor_df.drop(columns='DOI')

In [None]:
#Listing the Remaining Columns of the Jstor Data Frame After Dropping 'DOI'
jstor_df.columns

In [None]:
#Filtering Springer Data Frame for English Language
#Only rows whose 'language' value is exactly 'en' are kept
springer_df = springer_df.loc[springer_df['language'].eq('en')]

In [None]:
#Assigning a Language to Each Observation Through Language Detection on Titles
#langdetect is randomised internally; fixing the seed makes the detected
#language reproducible from one run of the notebook to the next
DetectorFactory.seed = 0


def detect_title_language(title):
    """Return the language code langdetect assigns to `title`, or None.

    None is returned when the title is missing (not a string), blank, or has
    no features langdetect can work with (langdetect raises
    LangDetectException in that case), so a single bad title does not abort
    the pass over the whole column.
    """
    if not isinstance(title, str) or not title.strip():
        return None
    try:
        return detect(title)
    except LangDetectException:
        return None


#One detected language (or None) per row, in row order
language = [detect_title_language(title) for title in elsevier_df['dc:title']]

elsevier_df['language'] = language

In [None]:
#Filtering Elsevier Data Frame for English Language
#Only rows whose 'language' value is exactly 'en' are kept
elsevier_df = elsevier_df.loc[elsevier_df['language'].eq('en')]

In [None]:
#Filtering Jstor Data Frame for English Language
#Only rows whose 'language_x' value is exactly 'en' are kept
jstor_df = jstor_df.loc[jstor_df['language_x'].eq('en')]

In [None]:
#Extracting Doi, Author, Title, Text and Date from Springer Data Frame
springer_columns = ['doi', 'creators', 'title', 'text', 'publicationDate']
springer_to_merge = springer_df.loc[:, springer_columns]

In [None]:
#Renaming the Column Names for Consistency
#Target schema shared by all three sources: doi, authors, title, text, date
springer_column_names = {
    'doi': 'doi',
    'creators': 'authors',
    'title': 'title',
    'text': 'text',
    'publicationDate': 'date',
}
springer_to_merge = springer_to_merge.rename(columns=springer_column_names)

In [None]:
#Extracting Required Columns from Elsevier Data Frame
elsevier_columns = ['doi', 'dc:creator', 'dc:title', 'full-text', 'prism:coverDate']
elsevier_to_merge = elsevier_df.loc[:, elsevier_columns]

In [None]:
#Renaming Columns for Consistency
#Target schema shared by all three sources: doi, authors, title, text, date
elsevier_column_names = {
    'doi': 'doi',
    'dc:creator': 'authors',
    'dc:title': 'title',
    'full-text': 'text',
    'prism:coverDate': 'date',
}
elsevier_to_merge = elsevier_to_merge.rename(columns=elsevier_column_names)

In [None]:
#Extracting Relevant Columns from Jstor Data Frame
jstor_columns = ['doi', 'author', 'title_x', 'fullText', 'datePublished']
jstor_to_merge = jstor_df.loc[:, jstor_columns]

In [None]:
#Renaming Columns for Consistency
#Target schema shared by all three sources: doi, authors, title, text, date
jstor_column_names = {
    'doi': 'doi',
    'author': 'authors',
    'title_x': 'title',
    'fullText': 'text',
    'datePublished': 'date',
}
jstor_to_merge = jstor_to_merge.rename(columns=jstor_column_names)

In [None]:
#Merging All the Data Frames
#The three frames are stacked row-wise. ignore_index=True gives the result a
#fresh 0..n-1 index; without it each source keeps its own row labels, so the
#same label would point at several unrelated rows in the combined frame.
merged_df = pd.concat(
    [springer_to_merge, elsevier_to_merge, jstor_to_merge],
    axis=0,
    ignore_index=True,
)

In [None]:
#Inspecting Merged Data Frame: Number of Rows After Stacking the Three Sources
len(merged_df)

In [None]:
#Checking for Duplicates
#A row is flagged True when its DOI already appeared in an earlier row
duplicates = merged_df.duplicated(subset='doi', keep='first')
#Counting first occurrences (False) against repeated DOIs (True)
duplicates.value_counts()

In [None]:
#Dropping Duplicates
#Rows with missing values are removed before de-duplicating. drop_duplicates
#keeps the first row for each DOI, so if an incomplete row came first it would
#displace a complete record with the same DOI, and the record would be lost
#altogether once incomplete rows are dropped.
merged_df = merged_df.dropna().drop_duplicates(subset=['doi'])

In [None]:
#Dropping Any Missing Values
#A row is removed as soon as any one of its columns is missing
merged_df = merged_df.dropna(axis=0, how='any')

In [None]:
#Saving the Merged Data Frame to a CSV File
#index=False keeps the row index out of the file
merged_df.to_csv(
    path_or_buf="data/merged_df.csv",
    index=False,
)

In [None]:
######################################
#Making a Word Document for the Data##
######################################

#Characters that cannot be stored in an XML document (control characters other
#than tab/newline/carriage return, lone surrogates, U+FFFE/U+FFFF). python-docx
#writes text through lxml, which raises ValueError when a string contains one,
#so they are stripped before the text is added.
_XML_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]')


def _docx_text(value):
    """Return `value` as a string with XML-incompatible characters removed."""
    return _XML_ILLEGAL_CHARS.sub('', str(value))


#Create a new Word document
document = Document()

#Set the font size of the 'Normal' style (no alignment is changed here)
style = document.styles['Normal']
font = style.font
font.size = Pt(12)
#Opening paragraph that holds a single line break
document.add_paragraph().add_run().add_break()

#Iterate over each row in the DataFrame and add the values to the document
for _, row in merged_df.iterrows():
    #Each record starts on a new page
    document.add_page_break()
    #One paragraph per field, in this fixed order
    for column in ('date', 'doi', 'authors', 'title', 'text'):
        document.add_paragraph(_docx_text(row[column]))
    document.add_paragraph('\n\n')

#Save the document
document.save('data/posthuman_data.docx')