In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv('PoetryFoundationData.csv')

In [None]:
data

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,
...,...,...,...,...,...
13849,13,\r\r\n 1-800-FEAR\r\r\n ...,\r\r\nWe'd like to talk with you about ...,Jody Gladding,"Living,Social Commentaries,Popular Culture"
13850,14,\r\r\n The Death of Atahual...,\r\r\n\r\r\n,William Jay Smith,
13851,15,\r\r\n Poet's Wish\r\r\n ...,\r\r\n\r\r\n,William Jay Smith,
13852,0,\r\r\n 0\r\r\n,\r\r\n Philosophic\r\r\nin its comple...,Hailey Leithauser,"Arts & Sciences,Philosophy"


# Data Cleaning and Pre-processing

In [None]:
data.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
Title,0
Poem,0
Poet,0
Tags,955


## Removing Tags since it has missing values (might handle later)
### Working With Poet, Poem and Title first  

In [None]:
# Preview the frame without the saved-index column and without Tags.
# The result is only displayed, not assigned, so `data` itself keeps every column.
data.drop(columns=['Unnamed: 0', 'Tags'])

Unnamed: 0,Title,Poem,Poet
0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting
1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu
2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser
3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri
4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek
...,...,...,...
13849,\r\r\n 1-800-FEAR\r\r\n ...,\r\r\nWe'd like to talk with you about ...,Jody Gladding
13850,\r\r\n The Death of Atahual...,\r\r\n\r\r\n,William Jay Smith
13851,\r\r\n Poet's Wish\r\r\n ...,\r\r\n\r\r\n,William Jay Smith
13852,\r\r\n 0\r\r\n,\r\r\n Philosophic\r\r\nin its comple...,Hailey Leithauser


In [None]:
data['Poet'].value_counts().head()

Unnamed: 0_level_0,count
Poet,Unnamed: 1_level_1
William Shakespeare,85
"Alfred, Lord Tennyson",73
Emily Dickinson,51
William Wordsworth,51
Rae Armantrout,49


In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import re

import nltk
from nltk.corpus import stopwords

# Download the NLTK resources needed for tokenizing and stopword filtering.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Matches every character that is neither an ASCII letter nor whitespace.
regex = re.compile(r'[^a-zA-Z\s]')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import string

# Both lookups are kept as sets so the per-token membership tests stay constant-time.
punct_set = set(string.punctuation)
stop_words_set = set(stopwords.words('english'))



In [None]:
# Replace every character matched by `regex` with a space, then tokenize each poem.
regex_tokenized_poems = data['Poem'].map(lambda text: regex.sub(' ', text)).tolist()
nltk_tokenized_poems = list(map(word_tokenize, regex_tokenized_poems))

Using nested list comprehensions to loop over the tokens and:
 1. remove stopwords
 2. remove punctuation
 3. convert to lowercase

 **next step -> lemmatization**

(I use lemmatization because it gives me real
dictionary words.)

In [None]:
# Lowercase each token BEFORE the stopword check: the stopword entries are
# lowercase, so testing the raw token lets capitalised forms ('They', 'It')
# slip through and end up lowercased in the result.
poems = [
    [
        lowered
        for lowered in map(str.lower, poem)
        if lowered not in punct_set and lowered not in stop_words_set
    ]
    for poem in nltk_tokenized_poems
]
print(poems[0])

['dog', 'bone', 'stapler', 'cribbage', 'board', 'garlic', 'press', 'window', 'loose', 'lacks', 'suction', 'lacks', 'grip', 'bungee', 'cord', 'bootstrap', 'dog', 'leash', 'leather', 'belt', 'window', 'sash', 'cords', 'they', 'frayed', 'they', 'broke', 'feather', 'duster', 'thatch', 'straw', 'empty', 'bottle', 'elmer', 'glue', 'window', 'loud', 'hinges', 'clack', 'open', 'clack', 'shut', 'stuffed', 'bear', 'baby', 'blanket', 'single', 'crib', 'newel', 'window', 'split', 'it', 'dividing', 'two', 'velvet', 'moss', 'sagebrush', 'willow', 'branch', 'robin', 'wing', 'window', 'pane', 'less', 'it', 'frame', 'air']


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in the WordNet corpus, so make sure it is
# downloaded; importing here keeps the cell runnable on a fresh kernel.
nltk.download('wordnet')
lem = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Lemmatize token by token, keeping one token list per poem.
poems_lemmatized = [list(map(lem.lemmatize, poem)) for poem in poems]
print(poems_lemmatized[0])

['dog', 'bone', 'stapler', 'cribbage', 'board', 'garlic', 'press', 'window', 'loose', 'lack', 'suction', 'lack', 'grip', 'bungee', 'cord', 'bootstrap', 'dog', 'leash', 'leather', 'belt', 'window', 'sash', 'cord', 'they', 'frayed', 'they', 'broke', 'feather', 'duster', 'thatch', 'straw', 'empty', 'bottle', 'elmer', 'glue', 'window', 'loud', 'hinge', 'clack', 'open', 'clack', 'shut', 'stuffed', 'bear', 'baby', 'blanket', 'single', 'crib', 'newel', 'window', 'split', 'it', 'dividing', 'two', 'velvet', 'moss', 'sagebrush', 'willow', 'branch', 'robin', 'wing', 'window', 'pane', 'le', 'it', 'frame', 'air']


'corpus'

Next, predict the missing tags and see how the model performs.

In [None]:
# A function that performs all of the preprocessing steps done above:

In [None]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer
import nltk
import re
from nltk.corpus import stopwords
import string


# Download every NLTK resource first, so the corpus lookups below cannot fail
# when this cell is the first one to touch them.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

lem = WordNetLemmatizer()
stop_words_set = set(stopwords.words('english'))  # set: constant-time membership tests
punct_set = set(string.punctuation)  # set: constant-time membership tests

# Matches every character that is not an ASCII letter (whitespace included).
regex = re.compile(r'[^a-zA-Z]')

def preprocess_poem(poems):
  """Clean raw poem texts and return them as space-joined lemma strings.

  For each poem: every character matched by the module-level ``regex`` is
  replaced by a space, the text is tokenized with ``word_tokenize``, tokens are
  lowercased, punctuation and stopword tokens are dropped, and the remaining
  tokens are lemmatized with ``lem``.

  Args:
    poems: Iterable of raw poem strings.

  Returns:
    A list with one cleaned string per input poem, in the same order.
  """
  finalized = []
  for poem in poems:
    lowered_tokens = (token.lower() for token in word_tokenize(regex.sub(' ', poem)))
    # Filter on the lowercased form: the stopword entries are lowercase, so
    # testing the raw token would let capitalised stopwords ('Then', 'It') through.
    kept = [
        token for token in lowered_tokens
        if token not in punct_set and token not in stop_words_set
    ]
    finalized.append(' '.join(lem.lemmatize(token) for token in kept))
  return finalized


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Vectorization (TF-IDF)
#### There are many other ways (might consider checking them as well)
(BOW, Word Embeddings like Skip-Gram, CBOW, FastText, GloVe)

In [None]:
# Reload the raw CSV and keep only the poem text and its tags.
data = pd.read_csv('PoetryFoundationData.csv').drop(columns=['Unnamed: 0', 'Poet', 'Title'])
data

Unnamed: 0,Poem,Tags
0,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",
1,"\r\r\nThe old cupola glinted above the clouds,...",
2,\r\r\nLook for me under the hood\r\r\nof that ...,
3,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",
4,\r\r\nWhen I push your button\r\r\nyou fly off...,
...,...,...
13849,\r\r\nWe'd like to talk with you about ...,"Living,Social Commentaries,Popular Culture"
13850,\r\r\n\r\r\n,
13851,\r\r\n\r\r\n,
13852,\r\r\n Philosophic\r\r\nin its comple...,"Arts & Sciences,Philosophy"


In [None]:
# Missing-value counts per column before the drop.
print(data.isna().sum())

# Discard, in place, every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

# Same report again to confirm nothing is missing any more.
print('\n' + 'After dropping NAs')
print(data.isna().sum())

Poem      0
Tags    955
dtype: int64

After dropping NAs
Poem    0
Tags    0
dtype: int64


In [None]:
data

Unnamed: 0,Poem,Tags
6,\r\r\nInvisible fish swim this ghost ocean now...,"Living,Time & Brevity,Relationships,Family & A..."
7,\r\r\nDon’t bother the earth spirit who lives ...,"Religion,The Spiritual,Mythology & Folklore,Fa..."
9,"\r\r\nHour in which I consider hydrangea, a sa...","Living,Parenthood,The Body,The Mind,Nature,Tre..."
16,\r\r\nmy father’s body is a map\r\r\na record ...,"The Body,Family & Ancestors"
17,\r\r\nit has long been forgotten this practice...,"Infancy,Parenthood,The Body"
...,...,...
13835,"\r\r\nDear Writers, I’m compiling the first in...","Relationships,Gay, Lesbian, Queer,Arts & Scien..."
13848,\r\r\nThe Wise Men will unlearn your name.\r\r...,"Living,Death,Growing Old,Time & Brevity,Nature..."
13849,\r\r\nWe'd like to talk with you about ...,"Living,Social Commentaries,Popular Culture"
13852,\r\r\n Philosophic\r\r\nin its comple...,"Arts & Sciences,Philosophy"


In [None]:
# Renumber the rows 0..n-1 so row positions line up with the list built below.
data.reset_index(drop=True, inplace=True)
preprocessed_poems = preprocess_poem(data['Poem'])
# Show a small sample only; echoing the whole list would dump every poem into the output.
preprocessed_poems[:3]

['invisible fish swim ghost ocean described wave sand water worn rock soon fish learn walk then human come ashore paint dream dying stone then later much later ocean floor punctuated chevy truck carrying dreamer decendants going store',
 'don bother earth spirit life she working story it oldest story world delicate changing if see watching invite coffee give warm bread obligated stay listen but ordinary story you endure earthquake lightning death love blinding beauty it story compelling may never want leave trap see stone finger that one ever escaped',
 'hour i consider hydrangea salt sand plant varietal question varietal diet every mother i know pound feel like i lost i lost yes sense possible beauty grown external i externalize beauty beauty occurs surface plant sun darkens skin child small beautiful i see obvious everything beautiful his hand swell bite spread insect venom small he appears feel nothing he smash skull floor he scream i hold lap kitchen floor front open freezer pressi

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
vectorized_poems = vectorizer.fit_transform(preprocessed_poems)
vectorized_poems

<12899x90205 sparse matrix of type '<class 'numpy.float64'>'
	with 1482771 stored elements in Compressed Sparse Row format>

In [None]:
vectorized_poems.shape

(12899, 90205)

### Multi-class classification (?)
Let's first treat this as an ordinary single-label classification problem, using each poem's full comma-separated tag string as its class (we expect this to fail, but it gives us a baseline).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Hold out 20% of the poems for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_poems, data['Tags'], test_size=0.2, random_state=42
)
# Seed the classifier as well, so repeated runs train the same model.
classifier = LinearSVC(random_state=42)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((10319, 90205), (10319,), (2580, 90205), (2580,))

In [None]:
classifier.fit(X_train, y_train)



In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report

In [None]:
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [None]:
accuracy_score(y_test, y_pred)

0.023255813953488372

# Poor model! The approach must change
## Next: try multi-label classification (each poem carries several tags, so each tag should be predicted separately)