In [1]:
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import os
import re
from string import punctuation
DIRNAME = 'blogs/'

In [2]:
def make_wordlist(filelist):
    '''
    Takes a list of filenames with XML content, opens these, and provides 
    a wordlist for all these posts.
    '''
    corpus = ''
    for eachfile in filelist:
        xmltext = open(eachfile, encoding='utf8', errors = 'replace').read()
        soup = BeautifulSoup(xmltext, features = 'xml')
        corpus += '/n'.join(x.text.lower() for x in soup.findAll('post'))
    wordlist = word_tokenize(corpus, language='english', preserve_line=True)
    return wordlist

In [3]:
blogtweens = [(DIRNAME + fn) for fn in os.listdir(DIRNAME) if re.search(r'\.1[1-2]\.', fn)]
blogteens = [(DIRNAME + fn) for fn in os.listdir(DIRNAME) if re.search(r'\.1[3-9]\.', fn)]
blogtwenties = [(DIRNAME + fn) for fn in os.listdir(DIRNAME) if re.search(r'\.2\d\.', fn)]
blogthirties = [(DIRNAME + fn) for fn in os.listdir(DIRNAME) if re.search(r'\.3\d\.', fn)]

In [4]:
teenwords = make_wordlist(blogteens[:10])

In [5]:
stemmer = PorterStemmer()
stemmed_teens = [stemmer.stem(x) for x in teenwords]

In [18]:
from nltk import download
download('punkt')

[nltk_data] Downloading package punkt to /home/justin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
list(zip(teenwords, stemmed_teens))

[('i', 'i'),
 ('dont', 'dont'),
 ('no', 'no'),
 ('y', 'y'),
 ('i', 'i'),
 ('put', 'put'),
 ('that', 'that'),
 ('as', 'as'),
 ('the', 'the'),
 ('title', 'titl'),
 ('i', 'i'),
 ('just', 'just'),
 ('couldnt', 'couldnt'),
 ('think', 'think'),
 ('of', 'of'),
 ('anything.', 'anything.'),
 ('if', 'if'),
 ('any', 'ani'),
 ('of', 'of'),
 ('my', 'my'),
 ('mates', 'mate'),
 ('r', 'r'),
 ('reading', 'read'),
 ('this', 'thi'),
 ('plz', 'plz'),
 ('e-mail', 'e-mail'),
 ('me', 'me'),
 ('as', 'as'),
 ('my', 'my'),
 ('in', 'in'),
 ('box', 'box'),
 ('has', 'ha'),
 ('been', 'been'),
 ('empty', 'empti'),
 ('for', 'for'),
 ('yonks.', 'yonks.'),
 ('im', 'im'),
 ('soo', 'soo'),
 ('tierd', 'tierd'),
 ('at', 'at'),
 ('the', 'the'),
 ('moment', 'moment'),
 ('i', 'i'),
 ('got', 'got'),
 ('no', 'no'),
 ('sleep', 'sleep'),
 ('last', 'last'),
 ('nite', 'nite'),
 ('as', 'as'),
 ('i', 'i'),
 ('was', 'wa'),
 ('whatching', 'whatch'),
 ('rush', 'rush'),
 ('hour', 'hour'),
 ('2.', '2.'),
 ('got', 'got'),
 ('ta', 'ta'),
 (

In [26]:
sample = 'I opened a can of peas. Yes, he can run for office.'
[word_tokenize(sent, language='english', preserve_line=True) for sent in sent_tokenize(str(teenwords), language='english')]

[['[',
  "'",
  'i',
  "'",
  ',',
  "'dont",
  "'",
  ',',
  "'no",
  "'",
  ',',
  "'",
  'y',
  "'",
  ',',
  "'",
  'i',
  "'",
  ',',
  "'put",
  "'",
  ',',
  "'that",
  "'",
  ',',
  "'as",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'title",
  "'",
  ',',
  "'",
  'i',
  "'",
  ',',
  "'just",
  "'",
  ',',
  "'couldnt",
  "'",
  ',',
  "'think",
  "'",
  ',',
  "'of",
  "'",
  ',',
  "'anything",
  '.'],
 ["'",
  ',',
  "'if",
  "'",
  ',',
  "'any",
  "'",
  ',',
  "'of",
  "'",
  ',',
  "'my",
  "'",
  ',',
  "'mates",
  "'",
  ',',
  "'",
  'r',
  "'",
  ',',
  "'reading",
  "'",
  ',',
  "'this",
  "'",
  ',',
  "'plz",
  "'",
  ',',
  "'",
  'e-mail',
  "'",
  ',',
  "'me",
  "'",
  ',',
  "'as",
  "'",
  ',',
  "'my",
  "'",
  ',',
  "'in",
  "'",
  ',',
  "'box",
  "'",
  ',',
  "'has",
  "'",
  ',',
  "'been",
  "'",
  ',',
  "'empty",
  "'",
  ',',
  "'for",
  "'",
  ',',
  "'yonks",
  '.'],
 ["'",
  ',',
  "'im",
  "'",
  ',',
  "'soo",
  "'",
  ',',
  "'tierd",
  "'",


In [27]:
pos_tagged = [pos_tag(word_tokenize(sent, language='english', preserve_line=True)) for sent in sent_tokenize(str(teenwords), language='english')]

In [28]:
pos_tagged

[[('[', 'NN'),
  ("'", "''"),
  ('i', 'NN'),
  ("'", "''"),
  (',', ','),
  ("'dont", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'no", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'", "''"),
  ('y', 'NN'),
  ("'", "''"),
  (',', ','),
  ("'", "''"),
  ('i', 'NN'),
  ("'", "''"),
  (',', ','),
  ("'put", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'that", 'WP'),
  ("'", "''"),
  (',', ','),
  ("'as", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'the", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'title", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'", "''"),
  ('i', 'NN'),
  ("'", "''"),
  (',', ','),
  ("'just", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'couldnt", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'think", "''"),
  ("'", "''"),
  (',', ','),
  ("'of", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'anything", 'VBG'),
  ('.', '.')],
 [("'", "''"),
  (',', ','),
  ("'if", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'any", "''"),
  ("'", 'POS'),
  (',', ','),
  ("'of", 'NNP'),
  ("'", 'POS'),
  (',', 

In [30]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(eachword) for eachword in word_tokenize(str(teenwords), language='english')]

In [31]:
lemmatized

['[',
 "'",
 'i',
 "'",
 ',',
 "'dont",
 "'",
 ',',
 "'no",
 "'",
 ',',
 "'",
 'y',
 "'",
 ',',
 "'",
 'i',
 "'",
 ',',
 "'put",
 "'",
 ',',
 "'that",
 "'",
 ',',
 "'as",
 "'",
 ',',
 "'the",
 "'",
 ',',
 "'title",
 "'",
 ',',
 "'",
 'i',
 "'",
 ',',
 "'just",
 "'",
 ',',
 "'couldnt",
 "'",
 ',',
 "'think",
 "'",
 ',',
 "'of",
 "'",
 ',',
 "'anything",
 '.',
 "'",
 ',',
 "'if",
 "'",
 ',',
 "'any",
 "'",
 ',',
 "'of",
 "'",
 ',',
 "'my",
 "'",
 ',',
 "'mates",
 "'",
 ',',
 "'",
 'r',
 "'",
 ',',
 "'reading",
 "'",
 ',',
 "'this",
 "'",
 ',',
 "'plz",
 "'",
 ',',
 "'",
 'e-mail',
 "'",
 ',',
 "'me",
 "'",
 ',',
 "'as",
 "'",
 ',',
 "'my",
 "'",
 ',',
 "'in",
 "'",
 ',',
 "'box",
 "'",
 ',',
 "'has",
 "'",
 ',',
 "'been",
 "'",
 ',',
 "'empty",
 "'",
 ',',
 "'for",
 "'",
 ',',
 "'yonks",
 '.',
 "'",
 ',',
 "'im",
 "'",
 ',',
 "'soo",
 "'",
 ',',
 "'tierd",
 "'",
 ',',
 "'at",
 "'",
 ',',
 "'the",
 "'",
 ',',
 "'moment",
 "'",
 ',',
 "'",
 'i',
 "'",
 ',',
 "'got",
 "'",
 ',',
 "'no",
 "'