Load the JSON file for each writeup, split each writeup into sentence-level fragments, and combine all fragments into one large document list while keeping track of which writeup each fragment came from.

In [25]:
def split_and_update_indices(test_list, test_index, split_list):
    """
    Splits elements in a list based on a list of splitters and updates the index list accordingly.
    Useful below for keeping track of who said what from topic modelling whilst splitting longer responses.

    The splitters are applied one after another, in the order given: every
    fragment produced by one splitter is split again by the next. Fragments are
    not stripped of surrounding whitespace; only empty fragments are discarded.

    Parameters
    test_list : list
        List of strings to split.
    test_index : list
        List of indices corresponding to the strings in test_list.
        Must have the same length as test_list.
    split_list : iterable
        Strings to split on. Empty strings are ignored, because an empty
        separator has nothing to split on (and makes str.split raise).

    Returns
    new_list : list
        List of strings split on the splitters.
    new_index : list
        List of indices corresponding to the strings in new_list; each fragment
        carries the index of the string it was cut from.

    Raises
    AssertionError
        If test_list and test_index differ in length.
    """

    assert len(test_list) == len(test_index), "Length of test_list and test_index must be equal."

    # Materialise the splitters once: drops empty separators (str.split("")
    # raises ValueError) and lets split_list be a one-shot iterable.
    splitters = [splitter for splitter in split_list if splitter]

    new_list = []
    new_index = []

    for element, index in zip(test_list, test_index):
        fragments = [element]
        for splitter in splitters:
            fragments = [
                piece
                for fragment in fragments
                for piece in fragment.split(splitter)
            ]

        # Adjacent or leading/trailing splitters leave empty strings behind.
        fragments = [fragment for fragment in fragments if fragment]

        new_list.extend(fragments)
        new_index.extend([index] * len(fragments))

    return new_list, new_index

# Worked example: three strings tagged 0..2, cut on "." first and then on "and".
# Every fragment in New_list keeps the tag of the string it came from.
Test_list, Test_index = (
    ["Sentence. New sentence and a bit more", "Something else", "More stuff and even more"],
    [0, 1, 2],
)
SplitList = [".", "and"]
New_list, New_index = split_and_update_indices(Test_list, Test_index, SplitList)
print(New_list, New_index, sep="\n")

['Sentence', ' New sentence ', ' a bit more', 'Something else', 'More stuff ', ' even more']
[0, 0, 0, 1, 2, 2]


In [26]:
import json
import os
import pandas as pd
# Load the writeups and the dates they belong to.
# - Every file in "writeups" is parsed as JSON and its 'content' value is kept
#   (with empty entries dropped) as one list per writeup in doc_list.
# - The dates are read from the "DateTime" column of documents/Writeups.csv.
# Splitting on the splitters and aggregating happens in a later cell, which
# pairs doc_list[i] with Dates[i] purely by position.

doc_list = []

# os.listdir() returns names in arbitrary order, so iterate in sorted order to
# make the document order deterministic across machines and runs.
# NOTE(review): this assumes the rows of Writeups.csv are in sorted-filename
# order - confirm against the actual file names.
# Sub-directories (e.g. notebook checkpoint folders) are skipped: open() would fail on them.
writeup_files = sorted(
    name
    for name in os.listdir("writeups")
    if os.path.isfile(os.path.join("writeups", name))
)

for index, file in enumerate(writeup_files, start=1):
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(os.path.join("writeups", file), encoding="utf-8") as f:
        print("Writeup:", index)
        content = json.load(f)["content"]
    # remove empty strings
    content = [elem for elem in content if elem]
    doc_list.append(content)

MetaDF = pd.read_csv("documents/Writeups.csv")
Dates = MetaDF["DateTime"].tolist()

print("Documents Length: ", len(doc_list))
print("Index Length: ", len(Dates))

# zip() in the next cell would silently drop the surplus on a mismatch, so fail loudly here.
assert len(doc_list) == len(Dates), "Number of writeups and number of dates must be equal."

Writeup: 1
Writeup: 2
Writeup: 3
Writeup: 4
Writeup: 5
Writeup: 6
Writeup: 7
Writeup: 8
Writeup: 9
Writeup: 10
Writeup: 11
Writeup: 12
Writeup: 13
Writeup: 14
Writeup: 15
Writeup: 16
Writeup: 17
Writeup: 18
Writeup: 19
Writeup: 20
Writeup: 21
Writeup: 22
Writeup: 23
Writeup: 24
Writeup: 25
Writeup: 26
Writeup: 27
Writeup: 28
Writeup: 29
Writeup: 30
Writeup: 31
Writeup: 32
Writeup: 33
Writeup: 34
Writeup: 35
Writeup: 36
Writeup: 37
Writeup: 38
Writeup: 39
Writeup: 40
Writeup: 41
Writeup: 42
Writeup: 43
Writeup: 44
Writeup: 45
Writeup: 46
Writeup: 47
Writeup: 48
Writeup: 49
Writeup: 50
Writeup: 51
Writeup: 52
Writeup: 53
Writeup: 54
Writeup: 55
Writeup: 56
Writeup: 57
Writeup: 58
Writeup: 59
Writeup: 60
Writeup: 61
Documents Length:  61
Index Length:  61


In [27]:
# Cut every writeup into fragments and collect them in two parallel lists:
# Doc_master[i] is a fragment, Date_master[i] is the date of its writeup.
# NOTE(review): "and " and "or " are plain substrings, so they also match the
# tail of words such as "understand " or "for " - confirm this is intended.
SplitList = [". ", ", ", "and ", "or ", ": ", "; ", " - "]
Doc_master, Date_master = [], []

for doc, date in zip(doc_list, Dates):
    n_before = len(doc)
    print("Writeup for: ", date)
    print(n_before)
    # Tag every entry of this writeup with the same date before splitting.
    New_docs, New_dates = split_and_update_indices(doc, [date] * n_before, SplitList)
    print("Difference in length after splitting: ", len(New_docs) - n_before)
    Doc_master += New_docs
    Date_master += New_dates


print("Documents Length: ", len(Doc_master))
print("Index Length: ", len(Date_master))

Writeup for:  2021-03-17
10
Difference in length after splitting:  86
Writeup for:  2021-03-31
32
Difference in length after splitting:  137
Writeup for:  2021-04-14
24
Difference in length after splitting:  143
Writeup for:  2021-04-28
7
Difference in length after splitting:  42
Writeup for:  2021-05-12
13
Difference in length after splitting:  60
Writeup for:  2021-05-26
40
Difference in length after splitting:  218
Writeup for:  2021-07-14
25
Difference in length after splitting:  166
Writeup for:  2021-07-24
32
Difference in length after splitting:  127
Writeup for:  2021-08-11
35
Difference in length after splitting:  153
Writeup for:  2021-08-25
34
Difference in length after splitting:  118
Writeup for:  2021-09-08
21
Difference in length after splitting:  124
Writeup for:  2021-10-06
1
Difference in length after splitting:  0
Writeup for:  2021-10-20
34
Difference in length after splitting:  189
Writeup for:  2021-11-03
43
Difference in length after splitting:  144
Writeup for: 

In [28]:
import os

# Persist the fragments and their dates as two line-aligned text files:
# line i of docs.txt belongs to line i of dates.txt.

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs("documents", exist_ok=True)

# A fragment containing a line break would occupy several lines in docs.txt and
# shift every later fragment away from its date, so flatten line breaks to spaces.
docs_one_per_line = [
    doc.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
    for doc in Doc_master
]

# Save docs and indices
# Explicit UTF-8: the platform default encoding may be unable to encode the text.
with open("documents/docs.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(docs_one_per_line))
with open("documents/dates.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(map(str, Date_master)))