In [1]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# Upper bound on the number of rows taken from the start of the parquet file.
ROWS_COUNT = 10_000

# Location of the input dataset.
INPUT_DATA = "./resources/train-00000-of-00001.parquet"

# Pull only the first record batch (at most ROWS_COUNT rows) instead of
# reading the whole file.
pf = ParquetFile(INPUT_DATA)
batch_iter = pf.iter_batches(batch_size=ROWS_COUNT)
parquet_rows = next(batch_iter)

# Wrap the single batch in a Table so it can be converted to a DataFrame.
data_df = pa.Table.from_batches([parquet_rows]).to_pandas()

In [2]:
# Preview the first rows of the loaded DataFrame.
data_df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [3]:
import spacy
# Load the spaCy "en_core_web_md" pipeline; `nlp` is the callable the cells
# below use to turn raw text into tokenized documents.
nlp = spacy.load("en_core_web_md")

In [4]:
# Sample text for the paragraph-splitting demo: three short paragraphs
# separated (and surrounded) by runs of newlines and spaces.
TEXT1 = (
    "\n\n\n\n  pharagraph 1   \n\n\n\n\n"
    "pharagraph 2\n"
    "pharagraph 3\n\n"
)

def sanitize_text(text):
    """Replace literal escape sequences in ``text`` and trim the result.

    The text may contain escape sequences spelled out as two characters
    (a backslash followed by another character). They are rewritten in
    this order:

    * backslash-n immediately followed by backslash-r -> one newline
    * backslash-n -> one newline
    * backslash-apostrophe -> apostrophe
    * backslash-double-quote -> double quote

    Args:
        text: The raw string to clean.

    Returns:
        The rewritten string with leading and trailing whitespace removed.
    """
    # Order matters: the longer backslash-n/backslash-r pair has to be
    # handled before the bare backslash-n would consume its first half.
    replacements = (
        ('\\n\\r', '\n'),
        ('\\n', '\n'),
        ("\\'", "'"),
        ('\\"', '"'),
    )

    cleaned = text
    for escaped, plain in replacements:
        cleaned = cleaned.replace(escaped, plain)

    # One strip at the end removes both the original surrounding whitespace
    # and any newline that a replacement produced at either edge.
    return cleaned.strip()

def get_paragraphs(document):
    """Split a tokenized document into paragraphs at line-break tokens.

    A token is treated as a paragraph separator when it is whitespace
    (``token.is_space``) and its text contains at least one newline. The
    separator token itself is not part of any paragraph.

    Empty paragraphs are skipped. They would otherwise be produced for an
    empty document or for a document that starts or ends with a separator,
    and callers that look at the first token of a paragraph would fail on
    them.

    Args:
        document: A sliceable sequence of tokens (a spaCy ``Doc`` in this
            notebook). Each token must expose ``is_space``, ``text`` and
            ``i``; ``token.i`` is used directly as a slice index, so it
            has to be the token's position within ``document``.

    Yields:
        Non-empty slices of ``document``, one per paragraph, in order.
    """
    start = 0

    for token in document:
        if token.is_space and "\n" in token.text:
            paragraph = document[start:token.i]
            if len(paragraph) > 0:
                yield paragraph
            # Continue after the separator token.
            start = token.i + 1

    # Whatever follows the last separator (or the whole document when there
    # was no separator at all).
    last_paragraph = document[start:]
    if len(last_paragraph) > 0:
        yield last_paragraph

def demo_get_paragraphs():
    """Print every paragraph found in the sanitized ``TEXT1`` sample, one per line."""
    sample_doc = nlp(sanitize_text(TEXT1))

    for paragraph_span in get_paragraphs(sample_doc):
        print(paragraph_span)

demo_get_paragraphs()

pharagraph 1
pharagraph 2
pharagraph 3


In [5]:
# Sample inputs for the first-character check below.
# Starts with a letter:
PHARAGRAPH1 = "asdsdad"
# Starts with spaces:
PHARAGRAPH2 = "   asdsdad"
# Starts with a digit:
PHARAGRAPH3 = "1asdsdad"

def starts_not_letter(pharahraph_doc):
    """Tell whether a paragraph begins with a character that is not a letter.

    Args:
        pharahraph_doc: A sequence of tokens (a spaCy ``Doc`` or ``Span``
            in this notebook); only the ``text`` of the first token is read.

    Returns:
        ``True`` when the first character of the first token is not
        alphabetic according to ``str.isalpha``; ``False`` when it is.
        A paragraph with no tokens has no first character, so it returns
        ``False`` instead of raising ``IndexError``.
    """
    # An empty paragraph (e.g. produced from an empty text) has nothing to
    # inspect; report it as "does not start with a non-letter".
    if len(pharahraph_doc) == 0:
        return False

    first_text = pharahraph_doc[0].text
    if not first_text:
        return False

    return not first_text[0].isalpha()

def demo_starts_not_letter(pharagraph):
    """Run ``starts_not_letter`` on the given raw text and print the boolean result."""
    print(starts_not_letter(nlp(pharagraph)))

demo_starts_not_letter(PHARAGRAPH1)

False


In [None]:
# Collect every paragraph, across all loaded texts, whose first character is
# not a letter.
pharagraphs_without_letter = []

# Sanitize lazily so the texts can be streamed straight into the pipeline.
sanitized_texts = (sanitize_text(text) for text in data_df['text'].values)

# nlp.pipe processes the texts in batches, which is considerably faster than
# calling nlp() once per text; documents come back in input order.
for text_doc in nlp.pipe(sanitized_texts):
    for pharagraph in get_paragraphs(text_doc):
        # Skip empty paragraphs (e.g. from an empty text): they have no first
        # token to inspect.
        if len(pharagraph) > 0 and starts_not_letter(pharagraph):
            pharagraphs_without_letter.append(pharagraph)

In [None]:
# Show the first five collected paragraphs, one per line.
preview = pharagraphs_without_letter[:5]
for pharagraph in preview:
    print(pharagraph)

In [None]:
# Lemmas of all nouns that are not stop words, taken from the collected
# paragraphs in order of appearance (duplicates are kept).
words = [
    token.lemma_
    for pharagraph in pharagraphs_without_letter
    for token in pharagraph
    if token.pos_ == 'NOUN' and not token.is_stop
]


In [None]:
import pandas as pd
from collections import Counter

# Count how often each collected word occurs and drop words seen only once.
raw_counts = Counter(words)
word_freq = Counter({word: n for word, n in raw_counts.items() if n > 1})

# One row per remaining word, most frequent first, with a fresh 0..n-1 index.
word_freq_df = (
    pd.DataFrame(word_freq.items(), columns=['word', 'count'])
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

# Display the ten most frequent words.
word_freq_df[:10]

In [None]:
# Destination of the exported word-frequency table.
OUTPUT_FILE_PATH = "./resources/pharagraph_word_frequency.tsv"

# Write the table as tab-separated UTF-8 text, leaving out the index column.
word_freq_df.to_csv(
    OUTPUT_FILE_PATH,
    sep='\t',
    index=False,
    encoding='utf-8',
)

In [None]:
# (rows, columns) of the frequency table; the row count is the number of
# distinct words that were kept.
word_freq_df.shape