# Years Data Laundromat

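This notebook cleans and lemmatizes the raw .txt file for each chapter of *The Years*: each file is tokenized, stopwords and punctuation are removed, and the remaining words are lemmatized and written to a matching `_lemmatized.txt` file.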

### Imports and Whatnot

In [6]:
import sys
import os
from pprint import pprint as pp
import string

# import nltk
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nbformat

### Setting up corpus and cleaning data 

In [7]:
# Chapter labels, in the order the files are processed; each label is the
# stem of a text file inside data/.
chapter_names = (
  '1880',
  '1891',
  '1907',
  '1908',
  '1910',
  '1911',
  '1913',
  '1914',
  '1917',
  '1918',
  'present',
)

# paths of the raw chapter files
corpus = ['data/' + name + '.txt' for name in chapter_names]

# paths of the lemmatized counterpart of each chapter file
lemmatized_corpus = ['data/' + name + '_lemmatized.txt' for name in chapter_names]

# fetching the nltk resources used below (download order is unchanged)
for resource_name in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
  nltk.download(resource_name)

# lemmatizer shared by every chapter, and the English stopword list as a set
lemmatizer = WordNetLemmatizer()
default_stopwords_set = set(stopwords.words('english'))

# removing stopwords and punctuation

# ASCII punctuation plus typographic quotes, the em dash, and the ellipsis
# character, none of which are part of string.punctuation.
extended_punctuation = set(string.punctuation) | {'“', '”', '’', '‘', '—', '…', '\'', '`'}


def remove_stopwords(words):
  """Return the tokens of `words` that are not in `default_stopwords_set`.

  The membership test is done on the lower-cased token; tokens that are
  kept retain their original casing.
  """
  return [word for word in words if word.lower() not in default_stopwords_set]


def remove_punctuation(words):
  """Return the tokens of `words` that are not made up solely of punctuation.

  A token is dropped when every one of its characters is in
  `extended_punctuation`. Testing character by character (rather than
  testing the whole token against `string.punctuation`) means that
  typographic marks such as curly quotes and em dashes are removed, and so
  are multi-character punctuation tokens like '...', '--', or the `` and ''
  quote tokens a tokenizer may emit. Empty tokens are dropped as well.
  Tokens that merely contain punctuation ("n't", "well-known") are kept.
  """
  return [
    word for word in words
    # all() over an empty token is True, so '' is filtered out too
    if not all(char in extended_punctuation for char in word)
  ]

# lemmatizing each file in corpus

for source_path in corpus:
  # Explicit UTF-8: the chapters contain typographic quotes and dashes, so
  # relying on the platform's default encoding is not portable.
  with open(source_path, 'r', encoding='utf-8') as f:
    file = f.read()

  # tokenizing each file into words, then removing stopwords and punctuation
  words = word_tokenize(file)
  words = remove_stopwords(words)
  words = remove_punctuation(words)

  # lemmatizing each word individually
  # NOTE(review): lemmatize() is called without a part-of-speech argument, so
  # it falls back to its default POS; confirm that is intended, since
  # pos_tag is imported but not used here.
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

  # joining the lemmatized words back into a string
  lemmatized_file = ' '.join(lemmatized_words)

  # writing the lemmatized text next to the source file; only the final
  # extension is touched ('data/1880.txt' -> 'data/1880_lemmatized.txt')
  root, extension = os.path.splitext(source_path)
  with open(root + '_lemmatized' + extension, 'w', encoding='utf-8') as f:
    f.write(lemmatized_file)

# collecting the tokens of every lemmatized file; despite its name,
# word_dictionary is a list holding one token list per file
word_dictionary = []

for file_path in lemmatized_corpus:
  # same encoding as used when the file was written above
  with open(file_path, 'r', encoding='utf-8') as f:
    file_content = f.read()
  word_dictionary.append(word_tokenize(file_content))
print(word_dictionary)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joshua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/joshua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


