# Create regression training set

This iterates through COHA, opening each file while noting its publication date. Then it divides the text by occurrences of the string "@ @ @ @ @ @ @ @ @ @." It then sentence-tokenizes the resulting segments, and tries to create a sequence of more than 128 Roberta tokens starting with the second sentence.

In [3]:
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize

# import the RoBERTa tokenizer
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
def find_sequence(text):
    if len(text) < 128:
        return ""

    # split the text into sentences
    sentences = sent_tokenize(text)
    if len(sentences) < 2:
        return ""
    
    current_idx = 1  # skip the first sentence, because it may not be complete
    sequence = sentences[current_idx]
    tokenlen = len(tokenizer(sequence)['input_ids'])
    while tokenlen < 128 and current_idx < len(sentences)-1:
        current_idx += 1
        sequence += " " + sentences[current_idx]
        tokenlen = len(tokenizer(sequence)['input_ids'])
    
    if tokenlen < 128:
        return ""
    else:
        return sequence
    

In [12]:
def clean_coha(text):
    '''
    Clean the COHA text. There are some odd aspects to this data, notably that
    contractions are represented as separate words "ca n't." Also punctuation is
    always surrounded by ( spaces ) . This function will remove the spaces around
    punctuation and recombine contractions.
    '''

    # remove spaces around punctuation
    text = text.replace(" ,", ",").replace(" .", ".").replace(" ?", "?").replace(" !", "!").replace(" ;", ";").replace(" :", ":").replace(" )", ")").replace("( ", "(")
    text = text.replace(" n't", "n't").replace(" 'll", "'ll")
    text = text.replace(" 've ", "'ve ").replace(" 're ", "'re ").replace(" 'd ", "'d ").replace(" 's ", "'s ").replace(" 'm ", "'m ")
    return text

In [13]:
files = os.listdir('coha')
files = [f for f in files if f.endswith('.txt')]

sequences = []
pubdates = []

for file in files:
    print(file, len(sequences))
    pubdate = int(file.split('_')[1])

    with open('coha/' + file, 'r') as f:
        text = f.read()
        text = clean_coha(text)
    
    # split the text at occurrences of '@ @ @ @ @ @ @ @ @ @'
    text_segments = text.split('@ @ @ @ @ @ @ @ @ @')

    for seg in text_segments:
        seq = find_sequence(seg)
        if seq:
            sequences.append(seq)
            pubdates.append(pubdate)

df = pd.DataFrame({'text': sequences, 'label': pubdates})


mag_1865_528750.txt 0
mag_1977_291350.txt 39
news_1953_715250.txt 41
news_1916_682750.txt 47
fic_1979_10650.txt 48
mag_1891_572250.txt 1249
mag_1972_491250.txt 1279
mag_1899_499750.txt 1311
news_1963_723550.txt 1344
mag_1960_267550.txt 1346
mag_1928_163250.txt 1349
mag_1946_160050.txt 1351
mag_2000_408850.txt 1354
mag_1988_337150.txt 1363
mag_1945_159350.txt 1365
fic_1999_44350.txt 1366
news_1989_669750.txt 1380
mag_1935_154650.txt 1386
fic_1946_781950.txt 1389
fic_1873_5050.txt 1716
nf_2003_776750.txt 2129
mag_1923_467550.txt 2143
news_1909_781450.txt 2172
mag_1950_232250.txt 2178
news_1948_711050.txt 2180
mag_1977_287650.txt 2184
mag_1976_290750.txt 2187
mag_1986_333850.txt 2189
news_1917_683350.txt 2192
news_2006_636150.txt 2195
mag_2004_365050.txt 2203
news_1983_669250.txt 2204
news_1965_724750.txt 2216
news_1999_653850.txt 2220
news_1979_734350.txt 2229
news_1981_669150.txt 2233
mag_1975_494950.txt 2239
news_1969_727550.txt 2263
mag_1987_335450.txt 2267
mag_1923_220050.txt 2270
fi

In [16]:
df.to_csv('coha_training_data.tsv', index=False, sep='\t')

In [14]:
df.head()

Unnamed: 0,text,label
0,"YOU know, dear M., it is said that in times of...",1865
1,"These, like swearing and smoking, are strictly...",1865
2,For who can doubt that the senses are entitled...,1865
3,Is it strange that even the moss-covered Carli...,1865
4,After breakfast one will be as expectant as if...,1865


In [15]:
# Iterate through df, and count words in each text.
# Count only words that include an apostrophe.

from collections import Counter
import string

allcounts = Counter()

for idx, row in df.iterrows():
    text = row['text']
    words = text.split()
    words = [w for w in words if "'" in w]
    c = Counter(words)
    allcounts += c

allcounts.most_common(100)

[("'", 6131),
 ("don't", 1603),
 ("I'm", 1426),
 ("didn't", 1049),
 ("I'll", 740),
 ("it's", 660),
 ("It's", 635),
 ("can't", 580),
 ("I've", 518),
 ("wasn't", 491),
 ("couldn't", 448),
 ("I'd", 438),
 ("That's", 429),
 ("wouldn't", 413),
 ("that's", 406),
 ("he's", 386),
 ("you're", 375),
 ("won't", 374),
 ("Don't", 334),
 ("hadn't", 289),
 ("isn't", 285),
 ("doesn't", 281),
 ("He's", 270),
 ("You're", 245),
 ("he'd", 243),
 ("ain't", 233),
 ("man's", 211),
 ("father's", 203),
 ("there's", 198),
 ("she's", 172),
 ("'s,", 168),
 ("There's", 168),
 ("she'd", 165),
 ("What's", 161),
 ("'S", 161),
 ("you'll", 157),
 ("you've", 143),
 ("you'd", 137),
 ("haven't", 133),
 ("'s.", 131),
 ("they're", 130),
 ("what's", 125),
 ("mother's", 118),
 ("we're", 118),
 ("Let's", 115),
 ("one's", 110),
 ("She's", 104),
 ("we've", 101),
 ("aren't", 98),
 ("We're", 97),
 ("we'll", 96),
 ("They're", 92),
 ("weren't", 88),
 ("o'clock", 88),
 ("Jack's", 87),
 ("God's", 82),
 ("',", 78),
 ("woman's", 78),
 (