In [68]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
import nltk
from nltk.tokenize import word_tokenize
import spacy
import en_core_web_sm

%matplotlib inline

### Import our text data

In [59]:
wild = open(r"C:\Users\Greg\Documents\callofthewild1.txt", encoding='utf-8')
wild = wild.read()

In [60]:
wild



### Let's replace the line break(\n) and replace it with a space.

In [61]:
# removing \n and replacing it with a black space
wild = wild.replace('\n', ' ').replace('\r', '')
wild



In [62]:
# printing our stop words, this are common words that appear frequently in all kinds of books, publications and on the web.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [63]:
nlp = en_core_web_sm.load()
wild_doc = nlp(wild)

In [64]:
# let's what we are working with and that our text data is setup correctly
print("The alice_doc object is a {} object.".format(type(wild_doc)))
print("It is {} tokens long".format(len(wild_doc)))
print("The first three tokens are '{}'".format(wild_doc[:3]))
print("The type of each token is {}".format(type(wild_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 37517 tokens long
The first three tokens are '﻿Buck did not'
The type of each token is <class 'spacy.tokens.token.Token'>


In [69]:
# let's make each individual word actually an individual "datapoint" per say
tokens = word_tokenize(wild)

In [70]:
print(tokens[:100])

['\ufeffBuck', 'did', 'not', 'read', 'the', 'newspapers', ',', 'or', 'he', 'would', 'have', 'known', 'that', 'trouble', 'was', 'brewing', ',', 'not', 'alone', 'for', 'himself', ',', 'but', 'for', 'every', 'tide-water', 'dog', ',', 'strong', 'of', 'muscle', 'and', 'with', 'warm', ',', 'long', 'hair', ',', 'from', 'Puget', 'Sound', 'to', 'San', 'Diego', '.', 'Because', 'men', ',', 'groping', 'in', 'the', 'Arctic', 'darkness', ',', 'had', 'found', 'a', 'yellow', 'metal', ',', 'and', 'because', 'steamship', 'and', 'transportation', 'companies', 'were', 'booming', 'the', 'find', ',', 'thousands', 'of', 'men', 'were', 'rushing', 'into', 'the', 'Northland', '.', 'These', 'men', 'wanted', 'dogs', ',', 'and', 'the', 'dogs', 'they', 'wanted', 'were', 'heavy', 'dogs', ',', 'with', 'strong', 'muscles', 'by', 'which', 'to']


In [71]:
# let's remove punctuation as we are more interested in the words
words = [word for word in tokens if word.isalpha()]
print(words[:100])

['did', 'not', 'read', 'the', 'newspapers', 'or', 'he', 'would', 'have', 'known', 'that', 'trouble', 'was', 'brewing', 'not', 'alone', 'for', 'himself', 'but', 'for', 'every', 'dog', 'strong', 'of', 'muscle', 'and', 'with', 'warm', 'long', 'hair', 'from', 'Puget', 'Sound', 'to', 'San', 'Diego', 'Because', 'men', 'groping', 'in', 'the', 'Arctic', 'darkness', 'had', 'found', 'a', 'yellow', 'metal', 'and', 'because', 'steamship', 'and', 'transportation', 'companies', 'were', 'booming', 'the', 'find', 'thousands', 'of', 'men', 'were', 'rushing', 'into', 'the', 'Northland', 'These', 'men', 'wanted', 'dogs', 'and', 'the', 'dogs', 'they', 'wanted', 'were', 'heavy', 'dogs', 'with', 'strong', 'muscles', 'by', 'which', 'to', 'toil', 'and', 'furry', 'coats', 'to', 'protect', 'them', 'from', 'the', 'frost', 'Buck', 'lived', 'at', 'a', 'big', 'house']


In [73]:
# now we will filter out the stopwords that we previously printed earlier.
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words)



In [74]:
# now let's import PorterStemmer so that we can get the stems of our words.
from nltk.stem.porter import PorterStemmer

In [75]:
porter = PorterStemmer()
stemming = [porter.stem(word) for word in tokens]
print(stemming)

['\ufeffbuck', 'did', 'not', 'read', 'the', 'newspap', ',', 'or', 'he', 'would', 'have', 'known', 'that', 'troubl', 'wa', 'brew', ',', 'not', 'alon', 'for', 'himself', ',', 'but', 'for', 'everi', 'tide-wat', 'dog', ',', 'strong', 'of', 'muscl', 'and', 'with', 'warm', ',', 'long', 'hair', ',', 'from', 'puget', 'sound', 'to', 'san', 'diego', '.', 'becaus', 'men', ',', 'grope', 'in', 'the', 'arctic', 'dark', ',', 'had', 'found', 'a', 'yellow', 'metal', ',', 'and', 'becaus', 'steamship', 'and', 'transport', 'compani', 'were', 'boom', 'the', 'find', ',', 'thousand', 'of', 'men', 'were', 'rush', 'into', 'the', 'northland', '.', 'these', 'men', 'want', 'dog', ',', 'and', 'the', 'dog', 'they', 'want', 'were', 'heavi', 'dog', ',', 'with', 'strong', 'muscl', 'by', 'which', 'to', 'toil', ',', 'and', 'furri', 'coat', 'to', 'protect', 'them', 'from', 'the', 'frost', '.', 'buck', 'live', 'at', 'a', 'big', 'hous', 'in', 'the', 'sun-kiss', 'santa', 'clara', 'valley', '.', 'judg', 'miller', '’', 's', '

### Conclusion
We have now completed the first couple steps required to clean up text data and prepare it for modeling. Using some simple techniques like tokenization and some more complex like stemming we can now do some data exploration to see the use frequency of words in London's novel. Next we will find the Term Frequency(TF) values for our data as well the TF-TDF. This will allow us to setup our model for the next step.