In [1]:
import pandas as pd
import numpy as np

import unicodedata
import re
import json
import time
import os
import requests

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [3]:
df.isnull().sum()

id                   0
url_legal         2004
license           2004
excerpt              0
target               0
standard_error       0
dtype: int64

In [4]:
df.shape

(2834, 6)

In [5]:
df.url_legal.value_counts()

https://www.africanstorybook.org/                                                                                          118
https://www.africanstorybook.org/#                                                                                          46
https://simple.wikipedia.org/wiki/Voltage                                                                                    2
https://en.wikipedia.org/wiki/Open-source_software                                                                           1
https://simple.wikipedia.org/wiki/Blu-ray_Disc                                                                               1
                                                                                                                          ... 
https://freekidsbooks.org/wp-content/uploads/2019/10/Freekidsbooks-Area-Apprenticeship-and-Workplace-Mathematics-10.pdf      1
https://en.wikipedia.org/wiki/Electrostatic_generator                                                          

### Takeaways:
- set id as the index
- should remove url_legal andlicense
- do a basic clean on excerpt
    - remove accented characters
    - remove special characters
    - tokenization
    - lemmatization
    - remove stopwords
- late on web scrape from https://www.africanstorybook.org/   

In [6]:
# Set id as index
df.set_index('id', inplace = True)

In [7]:
df.head(1)

Unnamed: 0_level_0,url_legal,license,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009


In [8]:
# Drop url_legal and license columns
df.drop(columns=['url_legal', 'license'], inplace=True)

In [9]:
df.head(1)

Unnamed: 0_level_0,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009


In [10]:
def basic_clean(string):
    '''Takes in string
    makes everything lowercase
    removes incosistent text
    only keeps anything a-z, 0-9, ' and white space'''
    # make everything lowercase
    string = string.lower()
    # removes incosistencies in the text
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    # set what to keep
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    # return new cleaned string
    return string

df['cleaned_excerpt'] = df.excerpt.apply(basic_clean)

In [11]:
df.head(1)

Unnamed: 0_level_0,excerpt,target,standard_error,cleaned_excerpt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...


In [12]:
def tokenize(string):
    '''Takes in the string provided by basic_clean funciton
    creates a tokenizer
    uses the tokenizerr on the cleaned string'''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    # return tokenized string
    return string

df['tokenized_excerpt'] = df.excerpt.apply(basic_clean)

In [13]:
df.head(1)

Unnamed: 0_level_0,excerpt,target,standard_error,cleaned_excerpt,tokenized_excerpt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...,when the young people returned to the ballroom...


In [14]:
def stem(string):
    '''In string from the basic_clean and tokenize fucntion
    creaters the porter stemmer
    applies the porter stemmer to every word in the string provided
    joing the list of words back into a string'''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    # Apply the stemmer to each word in our string
    stems = [ps.stem(word) for word in string.split()]
    # Join the list of words into the string
    string_stemmed = ' '.join(stems)
    # return string_stemmed
    return string_stemmed

df['stemmed_excerpt'] = df.tokenized_excerpt.apply(stem)

In [15]:
df.head(1)

Unnamed: 0_level_0,excerpt,target,standard_error,cleaned_excerpt,tokenized_excerpt,stemmed_excerpt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...,when the young people returned to the ballroom...,when the young peopl return to the ballroom it...


In [16]:
def lemmatize(string):
    '''Takes in string from basic_clean and tokenize funcitons
    creates a lematizer
    uses the lematizer on each word in the string
    merges the list of words back into string format
    and returns the now lematized string'''
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each word using split
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # Join the list into a string
    string_lemmatized = ' '.join(lemmas)
    # return lemmatized string
    return string_lemmatized

df['lemma_excerpt'] = df.tokenized_excerpt.apply(stem)

In [17]:
df.head(1)

Unnamed: 0_level_0,excerpt,target,standard_error,cleaned_excerpt,tokenized_excerpt,stemmed_excerpt,lemma_excerpt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...,when the young people returned to the ballroom...,when the young peopl return to the ballroom it...,when the young peopl return to the ballroom it...


In [18]:
def remove_stopwords(string, exclude_words=[], extra_words=[]):
    '''takes in string from basic clean and tokenize fucntions
    takes in a list of words to exclude from the stopword list
    take sin a list of words to include in the stopword list
    makes the list of stopwords
    removes words listed from stopword list
    add words listed to stopword list
    remove words from stopword list from the string
    join words back to string format
    return new string'''
    # set stopword list 
    stopword_list = stopwords.words('english')
    # remove exclude_words list from stopword list
    stopword_list = set(stopword_list) - set(exclude_words)
    # add extra_wrods list to stopword list
    stopword_list = stopword_list.union(set(extra_words))
    # remove stopword list words from string
    string = string.split()
    # set filtered words value
    filtered_words = [word for word in string if word not in stopword_list]
    # join words back into string format 
    string = ' '.join(filtered_words)
    # return new string
    return string

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caitlyncarney/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
document = df.tokenized_excerpt.iloc[0]
remove_stopwords(document, extra_words=['who', 'and'])

'young people returned ballroom presented decidedly changed appearance instead interior scene winter landscape floor covered snowwhite canvas laid smoothly rumpled bumps hillocks like real snow field numerous palms evergreens decorated room powdered flour strewn tufts cotton like snow also diamond dust lightly sprinkled glittering crystal icicles hung branches end room wall hung beautiful bearskin rug rugs prizes one girls one boys game girls gathered one end room boys one end called north pole south pole player given small flag plant reaching pole would easy matter traveller obliged wear snowshoes'

In [None]:
extra_words=['who', 'and']
df['no_stopwords_stem'] = df.stemmed_excerpt.apply(remove_stopwords, extra_words=extra_words)
df.head()

In [None]:
extra_words=['who', 'and']
df['no_stopwords_lemma'] = df.lemma_excerpt.apply(remove_stopwords, extra_words=extra_words)
df.head()