In [1]:
import pandas as pd

# Maximum number of token ids per chunk: split_data slices every tokenized
# text into consecutive pieces of this length.
CHUNK_SIZE = 512

In [2]:
from transformers import AutoTokenizer
# Pretrained "roberta-large" tokenizer; split_data uses it both to encode the
# raw text into token ids and to decode the chunks back into strings.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

In [3]:
from multiprocessing import set_start_method, Pool
from tqdm.notebook import tqdm
import multiprocessing as mp
# Show how many CPUs are available (useful when choosing the `cores` argument
# of build_chunk_dataframe).
print(mp.cpu_count())
# Disabled: switching multiprocessing to the 'forkserver' start method.
#set_start_method('forkserver')

def split_data(row):
    """Tokenize one document and cut it into fixed-size chunks.

    Args:
        row: an ``(id, values)`` pair as yielded by ``DataFrame.iterrows()``;
            ``values.text`` holds the raw text to tokenize.

    Returns:
        A DataFrame with one row per chunk and the columns ``id`` (the
        document id, repeated), ``pretokenized_text`` (the token ids of the
        chunk) and ``decoded_text`` (the chunk decoded back to a string).
    """
    doc_id, record = row
    token_ids = tokenizer(record.text).input_ids

    # Consecutive, non-overlapping windows of at most CHUNK_SIZE token ids;
    # only the last window can be shorter.
    windows = []
    for start in range(0, len(token_ids), CHUNK_SIZE):
        windows.append(token_ids[start:start + CHUNK_SIZE])

    decoded_windows = tokenizer.batch_decode(windows)
    columns = {
        'id': [doc_id for _ in windows],
        'pretokenized_text': windows,
        'decoded_text': decoded_windows,
    }
    return pd.DataFrame(columns)
                         
def build_chunk_dataframe(text_data, metadata=None, cores=10):
    """Chunk every document of ``text_data`` in parallel and stack the results.

    Args:
        text_data: DataFrame whose rows are handed to ``split_data`` through
            ``iterrows()``; the index therefore supplies the document id and
            each row must expose a ``text`` field.
        metadata: optional DataFrame merged onto the chunks on ``id``.
        cores: number of worker processes in the pool.

    Returns:
        One DataFrame with the chunks of all documents (in completion order,
        because ``imap_unordered`` is used), merged with ``metadata`` when it
        is given. An empty ``text_data`` yields an empty frame with the chunk
        columns instead of raising.
    """
    if len(text_data) == 0:
        # pd.concat raises ValueError on an empty list, so short-circuit
        # before any worker process is started.
        all_chunks = pd.DataFrame(columns=['id', 'pretokenized_text', 'decoded_text'])
    else:
        with Pool(cores) as pool:
            # list(...) drains the iterator while the pool is still alive.
            per_document = list(tqdm(pool.imap_unordered(split_data, text_data.iterrows()),
                                     total=len(text_data)))
        all_chunks = pd.concat(per_document)

    if metadata is not None:
        return all_chunks.merge(metadata, on='id')
    return all_chunks

def clean_non_unique(data):
    """Keep only the rows whose ``id`` occurs more than once in ``data``.

    Args:
        data: DataFrame with an ``id`` column.

    Returns:
        The rows of ``data`` (original order and index) whose ``id`` value
        appears in at least two rows; ids seen exactly once are dropped.
    """
    id_counts = data.id.value_counts()
    repeated_ids = id_counts[id_counts > 1].index
    keep = data.id.isin(repeated_ids)
    return data[keep]

8


# Blog data processing

In [None]:
# Read the raw blog corpus from its CSV export into `blog_corpus`.
print('Load data blog_as_csv.csv')
blog_corpus = pd.read_csv("data/nlp/blog_corpus/blog_as_csv.csv")

In [None]:
# Trim surrounding whitespace from every post.
blog_corpus.text = blog_corpus.text.apply(lambda x: x.strip())
# Concatenate all posts that share an id into one document, separated by
# "</s>" (RoBERTa's separator / end-of-sequence token). The former separator
# '<\s>' was a literal backslash-s (an invalid escape sequence) and is not one
# of the tokenizer's special tokens.
clean_blog_corpus = blog_corpus[['id', 'text']].groupby("id").agg(lambda x: '</s>'.join(x))
# One metadata row per id: the first age/topic/gender value seen for that id.
meta_blog_corpus = blog_corpus[['id', 'age', 'topic', 'gender']].groupby("id").agg(lambda x: list(x)[0])
full_blog_corpus = meta_blog_corpus.merge(clean_blog_corpus, on='id')
full_blog_corpus

In [None]:
# Chunk every blog document in parallel and attach the per-id metadata, then
# keep only the ids that occur in more than one chunk row.
chunked_blog_data = build_chunk_dataframe(full_blog_corpus, meta_blog_corpus)
nunique_blog_data = clean_non_unique(chunked_blog_data)
nunique_blog_data

In [None]:
# Persist the chunked blog data; the DataFrame index is not written.
nunique_blog_data.to_csv("data/nlp/blog_corpus/blog_as_csv_preprocessed.csv", index=False)

# Mail data processing

In [None]:
# Read the raw mail corpus from its CSV export and display it.
print('Load data mail_as_csv.csv')
mail_corpus = pd.read_csv("data/nlp/enron_mail_20150507/mail_as_csv.csv")
mail_corpus

In [None]:
import re
def clean_text(text):
    """Unescape and strip boilerplate lines from one raw mail string.

    Steps, in order:
      1. Escaped newlines (one or more backslashes followed by ``n``,
         optionally preceded by an escaped ``r``) become real newlines, and
         escaped tabs become real tabs.
      2. After stripping, the first 15 lines and the last line are dropped
         (presumably the mail header block -- verify against the data).
      3. Remaining ``X-...:`` lines are replaced by ``<s>``, ``From:`` lines
         are removed, and escaped apostrophes are unescaped.

    Args:
        text: raw mail text containing escaped control characters.

    Returns:
        The cleaned mail text.
    """
    unescaped = re.sub(r'(\\+r)?(\\+n)+', '\n', text)
    unescaped = re.sub(r'\\+t', '\t', unescaped)

    lines = unescaped.strip().split('\n')
    body = '\n'.join(lines[15:-1])

    # Applied sequentially: each substitution sees the previous one's output.
    substitutions = (
        (r'X-.+:.*\n', '<s>'),
        (r'From:.*\n', ''),
        (r"\\'", "'"),
    )
    for pattern, replacement in substitutions:
        body = re.sub(pattern, replacement, body)
    return body

# Store the cleaned version of every mail in a new 'clean_text' column.
mail_corpus['clean_text'] = mail_corpus.text.apply(clean_text)

In [None]:
# Positional rename of the four columns; presumably the raw text column
# becomes 'old_text' and the 'clean_text' column becomes the new 'text' --
# verify the column order before relying on this.
mail_corpus.columns = ['user', 'old_text', 'id', 'text']
mail_corpus.text = mail_corpus.text.apply(lambda x: x.strip())
# Concatenate all mails that share an id into one document, separated by
# "</s>" (RoBERTa's separator / end-of-sequence token). The former separator
# '<\s>' was a literal backslash-s (an invalid escape sequence) and is not one
# of the tokenizer's special tokens.
clean_mail_corpus = mail_corpus[['id', 'text']].groupby("id").agg(lambda x: '</s>'.join(x))

# Chunk every mail document (no metadata to merge), then keep only the ids
# that occur in more than one chunk row.
chunked_mail_data = build_chunk_dataframe(clean_mail_corpus, None)
nunique_mail_data = clean_non_unique(chunked_mail_data)
nunique_mail_data

In [None]:
# Persist the chunked mail data; the DataFrame index is not written.
nunique_mail_data.to_csv("data/nlp/enron_mail_20150507/mail_as_csv_preprocessed.csv", index=False)

# Book data processing

In [4]:
# Read the raw book corpus from its CSV export and display it.
print('Load data book_as_csv.csv')
book_corpus = pd.read_csv("data/nlp/gutenberg/book_as_csv.csv")
book_corpus

Load data book_as_csv.csv


Unnamed: 0,text,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,id_2
0,\n\n\n\nTHE HOUSE ON THE BORDERLAND\n\nWilliam...,PG10002,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],666,{'Science fiction'},book_0
1,\n\n\n\nThis file was produced from images gen...,PG10003,"My First Years as a Frenchwoman, 1876-1879","Waddington, Mary King",1833.0,1923.0,['en'],43,"{'France -- Social life and customs', 'France ...",book_1
2,and the Online Distributed Proofreading Team.\...,PG10004,The Warriors,"Lindsay, Anna Robertson Brown",1864.0,1948.0,['en'],27,{'Christianity'},book_2
3,and PG Distributed Proofreaders\n\n\n\n\n ...,PG10005,A Voyage to the Moon: With Some Account of the...,"Tucker, George",1775.0,1861.0,['en'],58,"{'Space flight to the moon -- Fiction', 'Scien...",book_3
4,\n\n\n\nLA FIAMMETTA\n\nBY\n\nGIOVANNI BOCCACC...,PG10006,La Fiammetta,"Boccaccio, Giovanni",1313.0,1375.0,['en'],43,{'Fiction'},book_4
...,...,...,...,...,...,...,...,...,...,...
39309,"\n\n\n\n\n\n\n\n\n\n""TIS SIXTY YEARS SINCE""\n\...",PG9996,"""'Tis Sixty Years Since"": Address of Charles F...","Adams, Charles Francis",1835.0,1915.0,['en'],12,"{'Philosophy, Modern'}",book_2030
39310,Distributed Proofreaders\n\n\n\n\n\n\n\n\n\n\n...,PG9997,"France and England in North America, Part III:...","Parkman, Francis",1823.0,1893.0,['en'],34,{'Canada -- History -- To 1763 (New France)'},book_90
39311,\n\n\n\nPOEMS\n\nBY\n\nMATILDA BETHAM.\n\n\n18...,PG9998,Poems,"Betham, Matilda",1776.0,1852.0,['en'],23,{'Poetry'},book_493
39312,\n\n\n\n\n\n\n\n\n\n[Illustration: Letter from...,PG9999,"Harriet, the Moses of Her People","Bradford, Sarah H. (Sarah Hopkins)",1818.0,1912.0,['en'],103,"{'Underground Railroad', 'African Americans --...",book_1687


In [5]:
import re
def clean_text(text, skip_chars=512):
    """Collapse blank lines in a book text and cut off its leading characters.

    Args:
        text: raw book text. Values that are not strings are passed over.
        skip_chars: number of characters removed from the start of the
            blank-line-collapsed text (defaults to 512, the value that was
            previously hard-coded).

    Returns:
        The cleaned text, or ``None`` when ``text`` is not a string.
    """
    # isinstance also accepts str subclasses such as numpy.str_, which the
    # former exact-type comparison silently mapped to None.
    if not isinstance(text, str):
        return None
    # Runs of two or more newlines become a single newline.
    collapsed = re.sub(r'\n\n+', '\n', text)
    return collapsed[skip_chars:]
# Discard every row that has a missing value in any column (in place), then
# store the cleaned version of each book in a new 'clean_text' column.
book_corpus.dropna(inplace=True)
book_corpus["clean_text"] = book_corpus["text"].apply(clean_text)

book_corpus

Unnamed: 0,text,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,id_2,clean_text
0,\n\n\n\nTHE HOUSE ON THE BORDERLAND\n\nWilliam...,PG10002,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],666,{'Science fiction'},book_0,"\nOf the wind in the dark.\n Hush and hark, w..."
1,\n\n\n\nThis file was produced from images gen...,PG10003,"My First Years as a Frenchwoman, 1876-1879","Waddington, Mary King",1833.0,1923.0,['en'],43,"{'France -- Social life and customs', 'France ...",book_1,T VERSAILLES\n III. M. WADDINGTON AS MINISTER ...
2,and the Online Distributed Proofreading Team.\...,PG10004,The Warriors,"Lindsay, Anna Robertson Brown",1864.0,1948.0,['en'],27,{'Christianity'},book_2,nd of President McKinley. There\nhas been the ...
3,and PG Distributed Proofreaders\n\n\n\n\n ...,PG10005,A Voyage to the Moon: With Some Account of the...,"Tucker, George",1775.0,1861.0,['en'],58,"{'Space flight to the moon -- Fiction', 'Scien...",book_3,1827\n CONTENTS.\n C...
4,\n\n\n\nLA FIAMMETTA\n\nBY\n\nGIOVANNI BOCCACC...,PG10006,La Fiammetta,"Boccaccio, Giovanni",1313.0,1375.0,['en'],43,{'Fiction'},book_4,ned with his son when the child was seven year...
...,...,...,...,...,...,...,...,...,...,...,...
39309,"\n\n\n\n\n\n\n\n\n\n""TIS SIXTY YEARS SINCE""\n\...",PG9996,"""'Tis Sixty Years Since"": Address of Charles F...","Adams, Charles Francis",1835.0,1915.0,['en'],12,"{'Philosophy, Modern'}",book_2030,"bmit, I have given the\nwords ""'Tis Sixty Year..."
39310,Distributed Proofreaders\n\n\n\n\n\n\n\n\n\n\n...,PG9997,"France and England in North America, Part III:...","Parkman, Francis",1823.0,1893.0,['en'],34,{'Canada -- History -- To 1763 (New France)'},book_90,otives and even the incidents have been but\np...
39311,\n\n\n\nPOEMS\n\nBY\n\nMATILDA BETHAM.\n\n\n18...,PG9998,Poems,"Betham, Matilda",1776.0,1852.0,['en'],23,{'Poetry'},book_493,"to premise, is, that the\ntale in the Old She..."
39312,\n\n\n\n\n\n\n\n\n\n[Illustration: Letter from...,PG9999,"Harriet, the Moses of Her People","Bradford, Sarah H. (Sarah Hopkins)",1818.0,1912.0,['en'],103,"{'Underground Railroad', 'African Americans --...",book_1687,"ES OF HER PEOPLE, may seem a little\nambitious..."


In [6]:
# Positional rename: the raw 'text' column becomes 'old_text' and the
# 'clean_text' column becomes the new 'text'.
book_corpus.columns = ['old_text', 'id', 'title', 'author', 'authoryearofbirth',
                        'authoryearofdeath', 'language', 'downloads', 'subjects', 'id_2',
                        'text']
book_corpus.text = book_corpus.text.apply(lambda x: x.strip())
# Concatenate all texts that share an id into one document, separated by
# "</s>" (RoBERTa's separator / end-of-sequence token). The former separator
# '<\s>' was a literal backslash-s (an invalid escape sequence) and is not one
# of the tokenizer's special tokens.
clean_book_corpus = book_corpus[['id', 'text']].groupby("id").agg(lambda x: '</s>'.join(x))
# Only the first 100 books are chunked below.
s = clean_book_corpus.head(100)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
print("size:")
book_corpus.info(memory_usage='deep')
print("size:")
s.info(memory_usage='deep')
print("metadata:")
print(book_corpus.drop(['old_text', 'text'], axis=1))

# NOTE(review): the recorded run of this cell reported 40.4 GB for book_corpus
# and ended in "OSError: [Errno 12] Cannot allocate memory". Presumably the
# worker processes could not be started with that much memory in use --
# consider releasing 'old_text' before this call; confirm before changing.
chunked_book_data = build_chunk_dataframe(s)
nunique_book_data = clean_non_unique(chunked_book_data)
nunique_book_data

size:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39258 entries, 0 to 39313
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   old_text           39258 non-null  object 
 1   id                 39258 non-null  object 
 2   title              39258 non-null  object 
 3   author             39258 non-null  object 
 4   authoryearofbirth  39258 non-null  float64
 5   authoryearofdeath  39258 non-null  float64
 6   language           39258 non-null  object 
 7   downloads          39258 non-null  int64  
 8   subjects           39258 non-null  object 
 9   id_2               39258 non-null  object 
 10  text               39258 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 40.4 GB
None
size:
<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, PG1 to PG10145
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text  

OSError: [Errno 12] Cannot allocate memory

In [None]:
# Display the current column labels of the book corpus.
book_corpus.columns

In [None]:
# Persist the chunked book data; the DataFrame index is not written.
nunique_book_data.to_csv("data/nlp/gutenberg/book_as_csv_preprocessed.csv", index=False)