## Imports

In [62]:
import pandas as pd
import os
import glob

## Configuration
*input_dir:* The path to the directory that contains your text files. Please make sure it ends with a '/' (slash). For example: `path/to/texts/`.

*dataframe_filename:* The filename for the resulting pandas DataFrame. You may use the **.p** extension indicating a pickled file, but you are free to use whatever you like. Just make sure this is consistent in the subsequent sentiment analysis step.

In [63]:
# Directory that holds the input .txt files, and the file name under which
# the resulting pickled DataFrame is stored.
input_dir, dataframe_filename = "texts/mimotext/", "texts_mimotext.p"

## Directory Setup (Optional)
Creates directories according to the configuration if not already created manually.

In [64]:
# Create the input directory (including parents) if it is missing.
# exist_ok=True replaces the separate exists() check, which was racy, and
# raises an error instead of silently continuing when the path exists but is
# a regular file rather than a directory.
os.makedirs(input_dir, exist_ok=True)

## Data Preparation

### Load texts

In [65]:
# Collect every .txt file directly inside input_dir.
# - os.path.join works whether or not input_dir ends with a separator.
# - glob.escape keeps characters such as "[" in the directory name literal.
# - sorted() makes the load order (and therefore the row order) reproducible.
text_file_names = sorted(glob.glob(os.path.join(glob.escape(input_dir), "*.txt")))
print("found {} texts".format(len(text_file_names)))

# Each entry of `texts` is [bare file name, full file content].
texts = []
for text_file_name in text_file_names:
    # os.path.basename always yields a value, so the name can never be
    # unbound or carried over from the previous loop iteration.
    corrected_filename = os.path.basename(text_file_name)
    # "utf-8-sig" transparently drops a leading byte-order mark; with plain
    # "utf-8" the BOM ends up glued to the first attribute name of the file.
    with open(text_file_name, "r", encoding="utf-8-sig") as input_file:
        texts.append([corrected_filename, input_file.read()])
print("loaded {} texts".format(len(texts)))

found 3 texts
loaded 3 texts


### Create DataFrame

In [66]:
# Turn every loaded file into one record: the leading "key=value" lines become
# attributes, and everything from the "text=" line onward becomes the text.
print("searching files for attributes and text")
prepared_texts = []
num_attributes = 0
for filename, text in texts:
    lines = text.split("\n")
    prepared_text = {"filename": filename}
    text_body = ""
    found_text = False
    attribute_count = 0
    for line_index, line in enumerate(lines):
        # partition() splits on the first "=" only, so attribute values that
        # contain "=" themselves are kept intact.
        line_type, separator, line_content = line.partition("=")
        if not separator:
            # Blank or malformed header line: skip it instead of failing
            # while unpacking the split result.
            continue
        if line_type == "text":
            # The remainder of this line plus all following lines form the
            # text; line breaks are replaced by single spaces.
            text_body = " ".join([line_content] + lines[line_index + 1:])
            found_text = True
            break
        # Numeric attribute values are stored as floats, all others as strings.
        try:
            line_content = float(line_content)
        except ValueError:
            pass
        prepared_text[line_type] = line_content
        attribute_count += 1
    if not found_text:
        print("warning: no 'text=' line found in {}".format(filename))
    num_attributes = max(num_attributes, attribute_count)
    prepared_text["text"] = text_body
    prepared_texts.append(prepared_text)

print("found {} additional attributes in .txt files".format(num_attributes))

if prepared_texts:
    # Build the frame and set the index in one expression so that re-running
    # the cell always starts from a fresh frame.
    texts_df = pd.DataFrame(prepared_texts).set_index("filename")
else:
    # No input files: still produce an empty frame with the expected layout
    # rather than failing on the missing "filename" column.
    texts_df = pd.DataFrame(columns=["text"], index=pd.Index([], name="filename"))

searching files for attributes and text
found 2 additional attributes in .txt files


In [67]:
# Preview only the first rows; the full corpus can be large.
texts_df.head()

Unnamed: 0_level_0,﻿year,title,text
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abbes_Voyage.txt,1758.0,Voyage dans les espaces,VOYAGE DANS LES ESPACES. CHAPITRE PREMIER. LE ...
Anonym_Suzon.txt,1778.0,"Mémoires de Suzon, soeur de Dom Bougre",PRÉFACE.Ces Mémoires n'auraient jamais vu le j...
Voltaire_Candide.txt,1759.0,"Candide, ou l'Optimisme",Comment Candide fut élevé dans un beau château...


### Save DataFrame

In [69]:
# Persist the prepared DataFrame as a pickle under the configured file name.
texts_df.to_pickle(path=dataframe_filename)

# Reference

Koncar, P., Druml, L., Ertler, K.-D., Fuchs, A., Geiger, B. C., Glatz, C., Hobisch, E., Mayer, P., Saric, S., Scholger, M. & Voelkl, Y. (2021) A Sentiment Tool Chain for Languages of the 18th Century. https://github.com/philkon/sentiment-tool-chain
