In [1]:
import pandas as pd
import glob
import lxml.html
import random
import itertools
import math
import nltk
import string
from collections import Counter
from nltk.corpus import wordnet as w
# Minimum number of whitespace-separated tokens a line must contain to be kept
# by the munge_* filters below.
MIN_SENT_LENGTH = 10
# Number of rows sampled into each output CSV. A falsy value (0 / None) skips
# the sampling step, so every row is written.
ENTRY_NUMBER = 5000
# Seed handed to DataFrame.sample so the sampled subsets are repeatable.
random_state = 114514

### 1. gpt-3.5-turbo

In [5]:
def munge_turbo(files):
    """Build the gpt-3.5-turbo dataset from plain-text dumps and save it as CSV.

    Each file is read line by line; lines are stripped and kept only when they
    are non-empty and contain at least ``MIN_SENT_LENGTH`` whitespace-separated
    tokens. Every kept line becomes one row labelled ``generated = 1``.

    Args:
        files: Iterable of paths to text files.

    Returns:
        The DataFrame (columns ``text`` and ``generated``) that was written to
        ``turbo_new.csv``. When ``ENTRY_NUMBER`` is truthy the frame is a random
        sample of that many rows, keeping the original row labels as index.

    Raises:
        ValueError: from ``pd.concat`` when ``files`` is empty, or from
            ``DataFrame.sample`` when fewer than ``ENTRY_NUMBER`` rows survive
            the filter.
    """
    dfs = []
    for file in files:
        # The context manager closes the handle as soon as the file is read,
        # instead of leaving it open until garbage collection.
        with open(file) as fh:
            text_list = [line.strip() for line in fh]
        text_list = [
            t for t in text_list
            if t != "" and len(t.split()) >= MIN_SENT_LENGTH
        ]
        df = pd.DataFrame({'text': text_list,
                           'generated': 1})
        dfs.append(df)
    gpt = pd.concat(dfs, ignore_index=True)
    if ENTRY_NUMBER:
        # Report the pre-sampling size so the reader can see how much was dropped.
        print(gpt.shape)
        gpt = gpt.sample(ENTRY_NUMBER, random_state=random_state)
    gpt.to_csv('turbo_new.csv', index=False)
    return gpt
# Process every .txt file in the turbo generator's data directory; the returned
# frame is the cell's displayed output.
munge_turbo(glob.glob('../turbo_generator/data/*.txt'))

(8168, 2)


Unnamed: 0,text,generated
5195,Instead of building alliances and promoting de...,1
2219,The death of Jeffrey Epstein in a Manhattan ja...,1
7864,One of the most significant benefits of 5G is ...,1
4412,This request comes after Flynn pleaded guilty ...,1
2345,The motivation for inventing the brick-laying ...,1
...,...,...
6488,"So, what does it mean to be more honest? It st...",1
7810,1. Normalize Failure - Share your own failures...,1
7014,"In 2012, Iranian hackers attacked the computer...",1
803,Buttigieg’s position on the importance of mili...,1


In [7]:
def munge_gpt3(files):
    """Build the scraped GPT-3 dataset from plain-text dumps and save it as CSV.

    Each file is read line by line; lines are stripped and kept only when they
    are non-empty and contain at least ``MIN_SENT_LENGTH`` whitespace-separated
    tokens. Every kept line becomes one row labelled ``generated = 1``.

    Args:
        files: Iterable of paths to text files.

    Returns:
        The DataFrame (columns ``text`` and ``generated``) that was written to
        ``scraped_gpt3.csv``. When ``ENTRY_NUMBER`` is truthy the frame is a
        random sample of that many rows, keeping the original row labels as
        index.

    Raises:
        ValueError: from ``pd.concat`` when ``files`` is empty, or from
            ``DataFrame.sample`` when fewer than ``ENTRY_NUMBER`` rows survive
            the filter.
    """
    dfs = []
    for file in files:
        # The context manager closes the handle as soon as the file is read,
        # instead of leaving it open until garbage collection.
        with open(file) as fh:
            text_list = [line.strip() for line in fh]
        text_list = [
            t for t in text_list
            if t != "" and len(t.split()) >= MIN_SENT_LENGTH
        ]
        df = pd.DataFrame({
            'text': text_list,
            'generated': 1,
        })
        dfs.append(df)

    gpt = pd.concat(dfs, ignore_index=True)
    if ENTRY_NUMBER:
        # Report the pre-sampling size so the reader can see how much was dropped.
        print(gpt.shape)
        gpt = gpt.sample(ENTRY_NUMBER, random_state=random_state)
    gpt.to_csv('scraped_gpt3.csv', index=False)
    return gpt
# Process every .txt file in the GPT scraper's data directory; the returned
# frame is the cell's displayed output.
munge_gpt3(glob.glob('../gpt_scraper/data/*.txt'))

(17385, 2)


Unnamed: 0,text,generated
4229,"On January 19, 2020, The New York Times publis...",1
13352,"The episode that is set to air on April 29, 20...",1
14615,The Kansas City Chiefs advanced to the AFC Cha...,1
201,One of the largest grants given by the NEH is ...,1
4549,"At the same time, there are still many barrier...",1
...,...,...
6029,"Regardless of the specific issues at play, it ...",1
11844,The New Mexico lawsuit seeks various forms of ...,1
4867,Gray’s legacy extends beyond his own works to ...,1
7837,"For Cassius, honoring his brother’s memory is ...",1


In [36]:
def munge_gpt(keyword, chunk = True):
    """Collect the ``text`` column of matching CSVs into one labelled dataset.

    Reads every file matching ``gpt_data/{keyword}*.csv``, stringifies and
    strips each ``text`` value, and labels all rows ``generated = 1``.

    Args:
        keyword: Filename prefix to glob for; also the stem of the output CSV.
        chunk: When true, each text is split on newlines and only pieces that
            are non-empty and have at least ``MIN_SENT_LENGTH`` whitespace
            tokens are kept. When false, texts are used whole and unfiltered.

    Returns:
        The DataFrame written to ``{keyword}.csv``; sampled down to
        ``ENTRY_NUMBER`` rows when that constant is truthy.

    NOTE(review): the pattern is a prefix match, so a keyword such as
    ``small-117M`` would also pick up files whose names start with
    ``small-117M-k40`` -- confirm this is intended given the actual file names.
    """
    frames = []
    for csv_path in glob.glob(f'gpt_data/{keyword}*.csv'):
        source = pd.read_csv(csv_path)
        texts = [str(entry).strip() for entry in source['text']]
        if chunk:
            # Flatten the per-document line lists, then drop short fragments.
            pieces = itertools.chain.from_iterable(
                doc.split('\n') for doc in texts
            )
            texts = [
                piece for piece in pieces
                if piece != "" and len(piece.split()) >= MIN_SENT_LENGTH
            ]
        frames.append(pd.DataFrame({'text': texts, 'generated': 1}))

    combined = pd.concat(frames, ignore_index=True)
    if ENTRY_NUMBER:
        combined = combined.sample(ENTRY_NUMBER, random_state=random_state)
    combined.to_csv(f'{keyword}.csv', index=False)
    return combined

In [37]:
# Write one `<keyword>.csv` per keyword prefix found under gpt_data/.
# Only the last call's return value is displayed as the cell output.
munge_gpt('small-117M-k40')
munge_gpt('small-117M')
munge_gpt('medium-345M-k40')
munge_gpt('medium-345M')
munge_gpt('large-762M-k40')
munge_gpt('large-762M')
munge_gpt('xl-1542M-k40')
munge_gpt('xl-1542M')

Unnamed: 0,text,generated
165307,The Federal Home and Community-Based Services ...,1
1969,Snake & Pistol The monkey mask is in a hole to...,1
170212,I am still waiting to hear back on two of the ...,1
183081,"We have not been able to solve this case, but ...",1
72960,McKenna is getting more benefits than just wit...,1
...,...,...
41502,This section concerns content related to Warcr...,1
131318,The U.S. stationing of an anti-missile defense...,1
105242,The 2099 panel is one of the highlights of thi...,1
45921,Jackson County officials are interested in own...,1


In [38]:
BLOG_FILE_NUM = 0 # max number of blog XML files get_blog_dfs loads; 0 means none, since we will be using scraped data only
def parse_blog_xml(file):
    """Extract the text of every ``<post>`` element in a blog file.

    The file is parsed with ``lxml.html``; for each ``post`` element the
    literal marker ``urlLink`` is removed from its text and the result is
    stripped of surrounding whitespace.

    Args:
        file: Path (or file-like object) accepted by ``lxml.html.parse``.

    Returns:
        A list of cleaned post strings, in document order.

    NOTE(review): a post whose ``.text`` is ``None`` would raise
    ``AttributeError`` here -- confirm the corpus never contains such posts.
    """
    document = lxml.html.parse(file)
    cleaned = []
    for post in document.findall('.//post'):
        body = post.text.replace('urlLink', '')
        cleaned.append(body.strip())
    return cleaned

def get_blog_dfs(fake=False):
    """Load blog posts into one DataFrame per XML file.

    Only the first ``BLOG_FILE_NUM`` paths matching ``blogs/*.xml`` are read.

    Args:
        fake: When true, each post gets a random 0/1 ``generated`` label
            (1 when ``random.random() < 0.5``); otherwise every post is
            labelled 0.

    Returns:
        A list of DataFrames with columns ``text`` and ``generated``.
    """
    frames = []
    for path in glob.glob('blogs/*.xml')[:BLOG_FILE_NUM]:
        posts = parse_blog_xml(path)
        if fake:
            # One draw per post, in post order.
            labels = [int(random.random() < 0.5) for _ in posts]
        else:
            labels = 0
        frames.append(pd.DataFrame({'text': posts,
                                    'generated': labels}))
    return frames

In [39]:
def process_human(fake=False):
    """Build the human-written dataset from the scraped NYT text files.

    Every ``../nyt_scraper/data/*.txt`` file is read line by line. The first
    line of each file is dropped (the title), the rest are stripped and kept
    only when non-empty and at least ``MIN_SENT_LENGTH`` whitespace tokens
    long -- the same threshold the generated-text munge functions use.

    Args:
        fake: When false, every row is labelled ``generated = 0`` and the
            result is written to ``human.csv``. When true, each row gets a
            random 0/1 label (1 when ``random.random() < 0.5``) and the result
            is written to ``fake.csv``.

    Returns:
        The DataFrame that was written; sampled down to ``ENTRY_NUMBER`` rows
        when that constant is truthy.

    Raises:
        ValueError: from ``pd.concat`` when no files match, or from
            ``DataFrame.sample`` when fewer than ``ENTRY_NUMBER`` rows survive
            the filter.
    """
    dfs = []
    nyt_scraped_files = glob.glob('../nyt_scraper/data/*.txt')
    for file in nyt_scraped_files:
        # The context manager closes the handle as soon as the file is read,
        # instead of leaving it open until garbage collection.
        with open(file) as fh:
            text_list = [line.strip() for line in fh]
        text_list = text_list[1:]  # skip titles
        text_list = [
            t for t in text_list
            if t != "" and len(t.split()) >= MIN_SENT_LENGTH
        ]
        # NOTE(review): these labels come from the unseeded global `random`
        # module, so fake.csv labels differ between runs even though the row
        # sample below is seeded.
        generated = [1 if random.random() < 0.5 else 0 for _ in text_list] if fake else 0
        df = pd.DataFrame({'text': text_list,
                           'generated': generated})
        dfs.append(df)

    human = pd.concat(dfs, ignore_index=True)
    outfile = 'fake.csv' if fake else 'human.csv'
    if ENTRY_NUMBER:
        human = human.sample(ENTRY_NUMBER, random_state=random_state)
    human.to_csv(outfile, index=False)
    return human

In [40]:
# First call writes human.csv (all rows labelled 0); second call writes fake.csv
# (random labels).
# NOTE(review): the second assignment rebinds `human` to the randomly labelled
# frame, so after this cell the genuine human frame only exists in human.csv.
human = process_human(False)
human = process_human(True)