# Read raw text corpora

In [1]:
import os
import chardet
import magic 
import pandas as pd
from typing import Tuple, List, Iterable

def read_raw_corpus(files: Iterable[str], corpus: str) -> Tuple[pd.DataFrame, List[Tuple[str, str]]]:
    """
    Read raw corpus files into a dataframe indexed by file name.

    One row is created per successfully read file, with the columns:
        * text: raw text, decoded with the encoding guessed by chardet
        * title
        * genre label
        * period label
        * region label
        * year of publication
        * corpus label (taken from the ``corpus`` parameter)

    genre, period, region, year and title are the first five '_'-separated
    parts of the file's base name, in that order.

    NOTE: The corpus label is expected to be either 'innsbruck' or 'manchester'.
    For 'manchester' (case-insensitive) the 'RAW.txt' / '.txt' ending is
    removed from the title.

    Files that cannot be processed (file name without five parts, path that
    cannot be re-encoded, unreadable file, undecodable content) are skipped
    and reported instead of aborting the whole run.

    :param files: iterable of file paths
    :param corpus: corpus label string
    :return: the dataframe (index: 'filename') and a list of
        (file path, error message) tuples for the skipped files
    """
    # Fixed column layout so that an empty result still has a 'filename' column
    # to index on (an empty frame without columns makes set_index fail).
    columns = ['filename', 'text', 'title', 'genre', 'period', 'region', 'year', 'corpus']
    is_manchester = corpus.lower() == 'manchester'
    encoding_detector = None  # created on first use, then shared by all files

    data = []
    errors = []
    for file in files:
        if encoding_detector is None:
            encoding_detector = magic.Magic(mime_encoding=True)

        try:
            # Re-encode the path string with the encoding libmagic reports for
            # it and read the resulting bytes back as UTF-8.
            file_encoding = encoding_detector.from_buffer(file)
            file = file.encode(file_encoding).decode('UTF-8')
            filename = os.path.basename(file)
            genre, period, region, year, title = filename.split('_')[:5]
        except (LookupError, ValueError) as e:
            # LookupError: unknown codec name; ValueError covers both Unicode
            # errors and file names with fewer than five '_'-separated parts.
            errors.append((file, str(e)))
            continue

        if is_manchester:
            if title.endswith('RAW.txt'):
                title = title.replace('RAW.txt', '')
            else:
                title = title.replace('.txt', '')

        try:
            with open(file, 'rb') as f:
                text_bytes = f.read()
            encoding_info = chardet.detect(text_bytes)
            text = text_bytes.decode(encoding_info['encoding'])
        except Exception as e:
            # Deliberately broad: any failure for one file (missing file, no
            # detected encoding, decode error) must not stop the others.
            errors.append((file, str(e)))
            continue

        data.append({
            'filename': filename,
            'text': text,
            'title': title,
            'genre': genre,
            'period': period,
            'region': region,
            'year': year,
            'corpus': corpus
        })

    df = pd.DataFrame(data, columns=columns)
    df = df.set_index('filename')
    return df, errors



ImportError: failed to find libmagic.  Check your installation

# Clean Dataset

# Read raw Innsbruck corpus files into a dataframe

In [None]:
import glob

# Collect the paths of the raw Innsbruck corpus text files below GermInnC.
# NOTE(review): glob.glob is called without recursive=True, so '**' behaves
# like '*' (exactly one directory level) — confirm deeper nesting is not needed.
innsbruck_raw_files = glob.glob('GermInnC/**/*.txt')
# Show how many files were found.
print(len(innsbruck_raw_files))

In [None]:
# Read all Innsbruck files; `errors` holds the files that could not be processed.
innsbruck_raw_df, errors = read_raw_corpus(innsbruck_raw_files, 'innsbruck')
print(errors)
print(innsbruck_raw_df.shape)
# Last expression of the cell: shows the first rows as the cell output.
innsbruck_raw_df.head()

In [None]:
# Persist the raw Innsbruck dataframe (including its index) as a UTF-8 CSV file.
innsbruck_raw_df.to_csv('innsbruck_dataset.csv', encoding='UTF-8')

# Read raw Manchester corpus files into a dataframe

In [None]:
# Collect the paths of the raw Manchester corpus text files (single directory).
manchester_raw_files = glob.glob('Manchester/RAW/*.txt')

In [None]:
# Read all Manchester files; `errors` holds the files that could not be processed.
manchester_raw_df, errors = read_raw_corpus(manchester_raw_files, 'manchester')
print(errors)
print(manchester_raw_df.shape)
# Last expression of the cell: shows the first rows as the cell output.
manchester_raw_df.head()

In [None]:
# Persist the raw Manchester dataframe (including its index) as a UTF-8 CSV file.
manchester_raw_df.to_csv('manchester_dataset.csv', encoding='UTF-8')

# Concat raw dataframes

In [None]:
# Stack both raw corpora into one dataframe (Manchester rows first).
full_raw_df = pd.concat((manchester_raw_df, innsbruck_raw_df))
print(full_raw_df.shape)
# Preview the combined frame as the cell output; this used to reference `df`,
# which is not the frame built in this cell.
full_raw_df.head()

In [None]:
#full_raw_df.to_csv('full_dataset.csv', encoding='UTF-8')

# Read tagged corpora

In [None]:
# Collect the paths of the tagged Innsbruck corpus files.
# NOTE(review): without recursive=True, '**' behaves like '*' (exactly one
# directory level) — confirm deeper nesting is not needed.
tagged_files = glob.glob('GermInnC Tagged/**/*.txt')

In [None]:
import os
from typing import Iterable

# Build one record per tagged Innsbruck file: the space-joined tokens, POS tags
# and lemmas plus the metadata encoded in the file name.
tagged_data = []
for path in tagged_files:
    # Metadata = the first five '_'-separated parts of the base name.
    genre, period, region, year, title = os.path.basename(path).split('_')[:5]

    with open(path, 'r', encoding='UTF-8') as handle:
        split_lines = (raw_line.rstrip().split('\t') for raw_line in handle)
        # Only lines with exactly three tab-separated fields
        # (token, POS tag, lemma) are kept; everything else is skipped.
        triples = [fields for fields in split_lines if len(fields) == 3]

    record = {
        'tokens': " ".join(fields[0] for fields in triples),
        'pos_tags': " ".join(fields[1] for fields in triples),
        'lemmas': " ".join(fields[2] for fields in triples),
        'title': title,
        'genre': genre,
        'period': period,
        'region': region,
        'year': year,
        'corpus': 'innsbruck'
    }
    tagged_data.append(record)

In [None]:
# Show the first tagged record as the cell output (sanity check).
tagged_data[0]

In [None]:
import pandas as pd

# One row per tagged Innsbruck record; written out (with its index) as UTF-8 CSV.
df = pd.DataFrame(tagged_data)
df.to_csv(path_or_buf='tagged_dataset.csv', encoding='UTF-8')

# Tagged Manchester

In [None]:
# Collect the paths of the tagged Manchester corpus files (single directory).
tagged_files_2 = glob.glob('Manchester/LING-COL/*.txt')

In [None]:
import os
from typing import Iterable

# Build one record per tagged Manchester file: the space-joined tokens, POS
# tags and lemmas plus the metadata encoded in the file name.
tagged_data_2 = []
for file in tagged_files_2:
    # Metadata = the first five '_'-separated parts of the base name.
    genre, period, region, year, title = os.path.basename(file).split('_')[:5]
    tokens = []
    pos_tags = []
    lemmas = []
    with open(file, 'r', encoding='UTF-8') as f:
        for line in f:
            # Columns: pID, token, norm, pos_tag, lemma, moph, p, rest.
            # NOTE(review): rstrip() also removes trailing tabs, so a line whose
            # last columns are empty has fewer than 8 fields and is skipped —
            # confirm that is intended.
            fields = line.rstrip().split('\t')
            if len(fields) != 8:
                continue

            # Only token, POS tag and lemma are exported; the normalised form
            # (third column) is currently not part of the output.
            _, token, _, pos_tag, lemma = fields[:5]
            tokens.append(token)
            pos_tags.append(pos_tag)
            lemmas.append(lemma)

    tagged_data_2.append({
        'tokens': " ".join(tokens),
        'pos_tags': " ".join(pos_tags),
        'lemmas': " ".join(lemmas),
        'title': title,
        'genre': genre,
        'period': period,
        'region': region,
        'year': year,
        'corpus': 'manchester'
    })

In [None]:
# Show the first tagged Manchester record as the cell output (sanity check).
tagged_data_2[0]

# Create the Manchester Tagged Dataframe and .csv

In [None]:
import pandas as pd

# One row per tagged Manchester record.
tagged_man_df = pd.DataFrame(tagged_data_2)
# Last expression of the cell: (rows, columns) shown as the cell output.
tagged_man_df.shape

In [None]:
# Persist the tagged Manchester dataframe (including its index) as UTF-8 CSV.
# NOTE(review): the file name spells 'Mancheser' (missing 't'); left unchanged
# here because other code may read this exact path — confirm before renaming.
tagged_man_df.to_csv('tagged_Mancheser_dataset.csv', encoding='UTF-8')

# Concat the tagged Datasets

In [None]:
# Stack both tagged corpora (Manchester rows first, then the Innsbruck frame `df`).
# NOTE(review): ignore_index is not set, so the row labels of both inputs are
# kept as-is and may repeat — confirm that is acceptable.
full_tagged_df = pd.concat((tagged_man_df, df))
print(full_tagged_df.shape)
# Last expression of the cell: displays the combined frame.
full_tagged_df

In [None]:
# Persist the combined tagged dataframe (including its index) as UTF-8 CSV.
full_tagged_df.to_csv('full_taggeddataset.csv', encoding='UTF-8')

# Clean Datasets

In [None]:
import pandas as pd
import numpy as np
#from sklearn.utils import shuffle

# Reload the combined tagged and raw datasets from disk for cleaning.
# NOTE(review): no index_col is given, so the index column written by to_csv
# comes back as an ordinary data column — confirm that is intended.
tagged_df = pd.read_csv('full_taggeddataset.csv')
# NOTE(review): the only visible to_csv call that creates 'full_dataset.csv'
# before this point is commented out — confirm the file exists.
full_raw_df = pd.read_csv('full_dataset.csv')

In [None]:
# Values in the 'year' column that mark rows to be dropped.
_BAD_YEAR_VALUES = ['GesetzsammlungThÅringen', 'GesetzsammlungThüringen', '1851-54']


def _clean_corpus_frame(frame, text_column):
    """
    Return a cleaned copy of a corpus dataframe.

    * genre 'NEWS-P4' is renamed to 'NEWS' (only the genre cell is changed,
      the rest of the row is kept)
    * region labels are upper-cased
    * rows whose year is one of ``_BAD_YEAR_VALUES`` are dropped
    * in ``text_column``: 'seyn' -> 'sein', '+' -> ' ', the stand-alone token
      'd' -> 'die', and the literal string 'er|es|sie' -> 'sich'

    :param frame: dataframe with 'genre', 'region', 'year' and ``text_column``
    :param text_column: name of the string column to normalise
    :return: the cleaned dataframe (the input is not modified)
    """
    frame = frame.copy()
    # Restrict the assignment to the genre column; a plain boolean-mask
    # assignment would overwrite every column of the matching rows.
    frame.loc[frame['genre'] == "NEWS-P4", 'genre'] = "NEWS"
    frame['region'] = frame['region'].str.upper()
    # .copy() so the column assignment below works on an independent frame.
    frame = frame[~frame['year'].isin(_BAD_YEAR_VALUES)].copy()

    # regex is passed explicitly so the result does not depend on the pandas
    # version's default: '+' and '|' must be taken literally.
    cleaned = frame[text_column].str.replace('seyn', 'sein', regex=False)
    cleaned = cleaned.str.replace('+', ' ', regex=False)
    # Replace 'd' only when it is a whole whitespace-delimited token, keeping
    # the surrounding spaces, instead of touching every letter 'd' or gluing
    # the neighbouring tokens together.
    cleaned = cleaned.str.replace(r'(?<!\S)d(?!\S)', 'die', regex=True)
    # NOTE(review): treated as the literal string 'er|es|sie', not as an
    # alternation of three pronouns — confirm against the tagger output.
    cleaned = cleaned.str.replace('er|es|sie', 'sich', regex=False)
    frame[text_column] = cleaned
    return frame


# Clean Dataset RAW
full_raw_df = _clean_corpus_frame(full_raw_df, 'text')
# Clean Dataset Tagged
tagged_df = _clean_corpus_frame(tagged_df, 'lemmas')



In [None]:
# Write the cleaned frames back to disk as UTF-8 CSV.
# NOTE(review): these are the same files the cleaning step reads, so the
# inputs are overwritten and re-running the cleaning cells applies the
# replacements again — confirm this is intended.
# NOTE(review): the index is written too; because the frames were loaded
# without index_col, each round trip adds another index column to the file.
full_raw_df.to_csv('full_dataset.csv', encoding='UTF-8')
tagged_df.to_csv('full_taggeddataset.csv', encoding='UTF-8')