In [1]:
# set up OpenAI
import openai

# read api key from file
with open('resources/api-keys.txt', 'r') as f:
    api_key = f.read()

openai.api_key = api_key

In [34]:
import pandas as pd

# Prepocessing libraries

from langdetect import detect
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
"""
Language detection
"""
def detect_lang(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
"""
Auto-correction using GPT-3
"""
def gpt_autocorrect(text, t_max):
    prompt_string = "You will be receiving a story. Your task to correct any spelling mistakes that might be present in the story. Here is the story: " + text

    r = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt_string,
        temperature=0.3,
        max_tokens=t_max,
        n=1,
        stop=None,
        timeout=10,
    )

    return r['choices'][0]['text']

stop_words = set(stopwords.words('english')) # set of stopwords
punc = set(string.punctuation) # set of all special characters
lemma = WordNetLemmatizer() # lemmatizer


"""
Preprocessing function
"""
def clean(text, max_tokens):
    print("Correcting spelling mistakes...")
    text = gpt_autocorrect(text, max_tokens)
    print("Preprocessing...")
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if token not in punc]
    tokens = [lemma.lemmatize(token) for token in tokens]

    processed_text = ' '.join(tokens)
    return processed_text

Simple test for `gpt_autocorrect()`

In [19]:
correction = gpt_autocorrect("i am perrty bad at speling and grammer, i fel so awfel abt it", 20)
print(correction)



I am pretty bad at spelling and grammar, I feel so awful about it.


In [13]:
# read the data
file = open("data/stories.csv", "r")
stories_array = []

for line in file:
    stories_array.append(line)

file.close()

data = pd.DataFrame(stories_array, columns=['story'])

Demo: `gpt_autocorrect()` and `clean()` on a story

In [36]:
print(data['story'][3494])
print(gpt_autocorrect(data['story'][3494], 500))
print("\n")
print(clean(data['story'][3494], 500))

I am an investigative journalist and did a research on the Sugar Mummy scam circus in Singapore. They all operate the same way. No one is what they say they are. I contacted 6 of the agents on Locanto and other sites via WhatsApp and they were all scammers. They might change names but one thing is for 100% sure. You will be scammed! Basically they have a pre-paid phone card with a generic profile photo. They all asure you they are not scammers. After giving them you name, age and civil status they will ask for 300-500 SGDs for a fee. They only accept bank transfer. Then when you have payed this they ask for 1400-1900 SGD for further fees and insurance. They promise you a BMW and a monthly salary of at least 10500 SGD and so on. My conclusion is "DON´T PAY ANYTHING" They are all scammers/fraudsters/liers. Don´t fall for any sweet talk or promises, you will be fooled and no sugar mummy is at the end of the rainbow. No matter who they say they are or that they have lots of clients that re

#### Data preprocessing

In [67]:
# add language column
data["language"] = data["story"].apply(detect_lang)

# filter out non-english stories
data = data[data["language"] == "en"]

# clean the stories
data["story"] = data["story"].apply(clean)

Unnamed: 0,story,language


In [90]:
# get story with highest number of tokens (for GPT-3)
max_tokens = data["story"].apply(lambda story: len(word_tokenize(story))).max()

662

In [95]:
sum = data["story"].apply(lambda story: len(word_tokenize(story))).sum()
sum

571115