In [1]:
import numpy as np
import pandas as pd
import scipy

import spacy
import re
from collections import Counter

import nltk
from nltk.corpus import brown, stopwords
from nltk import word_tokenize

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn import ensemble
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score

# Importing and Parsing Data

In [2]:
# This imports the txt file and names the column 'genre'
longform = pd.read_csv("cats.txt", sep='\n', header=None)
longform.columns = ['genre']

# Grabs first 4 characters of a string
def get_keys(txt):
    return txt[:4]

# Grabs all but first 4 characters of a string
def drop_column_names(txt):
    return txt[4:]

# Function takes in a dirty, longform DataFrame and pops it back out cleaned
# and split into two columns
def longform_cleaning(df):
    df['keys'] = df['genre'].apply(lambda x: get_keys(x))

    df['genre'] = df['genre'].apply(lambda x: drop_column_names(x))

    df['genre'] = df['genre'].apply(lambda x: x.strip())
    
    return df

labels_df = longform_cleaning(longform)

In [3]:
d = {}
list_of_dfs = []
for i in brown.categories():
    d[str(i)] = labels_df[labels_df['genre'] == i]
    if len(d[str(i)]) > 19:
        list_of_dfs.append(d[str(i)][0:20])
        
labels_df = pd.concat(list_of_dfs).reset_index()
labels_df = labels_df.drop(columns=['index'])

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [5]:
# Puts all the article words and punctuation into dataframe column
article_col = []
for article in labels_df['keys']:
    article_col.append(text_cleaner(' '.join(brown.words(fileids=[article]))))

labels_df['article_words'] = article_col

In [6]:
labels_df['word_tokens'] = labels_df['article_words'].apply(lambda x: word_tokenize(x))
labels_df['first_500_words'] = labels_df['word_tokens'].apply(lambda x: x[0:500])

In [7]:
# Puts all the article words and punctuation into dataframe column
article_col500 = []
for article in labels_df['first_500_words']:
    article_col500.append(text_cleaner(' '.join(article)))

labels_df['article_words500'] = article_col500

In [8]:
nlp = spacy.load('en')

def nlp_it(df):
    spacy_articles = []
    for article in df['article_words500'].copy():
        spacy_articles.append(nlp(article))
    df['spacy_articles'] = spacy_articles
    return df
    

# BoW Feature Generation

In [9]:
# Utility function to create a list of the words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the words appearing more than once.
    common_words = []
    for item in Counter(allwords).most_common(500):
        if item[1] > 1:
            print(item[0])
            common_words.append(item[0])
    return common_words

def bow_features(text, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text'] = text['spacy_articles']
    df['genre'] = text['genre']
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 10 == 0:
            print("Processing row {}".format(i))
            
    return df

In [10]:
def bow_it(df):
    common_word_lists = []
    for article in df['spacy_articles']:
        common_word_lists.append(bag_of_words(article))
    flat_list = [item for sublist in common_word_lists for item in sublist]
    common_words = list(set().union(flat_list))
    return common_words

In [11]:
def bag_of_features(df):
    df = nlp_it(df)
    common_words = bow_it(df)
    the_texts = df.loc[:, ['spacy_articles', 'genre']].copy()
    word_counts = bow_features(the_texts, common_words)
    return word_counts

# tf-idf Feature Generation

In [12]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the articles
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,
                             norm=u'l2', #accounts for length, worth trying without
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once. Prevents divide-by-zero errors
                            )

In [13]:
# Take in DataFrame and returns tf-idf features
def tfidf_it(df):
    vecked = vectorizer.fit_transform(df['article_words500'])
    vecked = normalize(vecked)
    vecked_norm_df = pd.DataFrame(data=vecked.toarray())
    return vecked_norm_df

In [14]:
# Takes in DataFrame and returns tf-idf features based on the already-trained model
def tfidf_validation(df):
    vecked = vectorizer.transform(df['article_words500'])
    vecked = normalize(vecked)
    vecked_norm_df = pd.DataFrame(data=vecked.toarray())
    return vecked_norm_df

# Executing and Saving Feature Generation

In [15]:
bow_features = bag_of_features(labels_df)

-PRON-
ann
day
the
budd
work
meadow
fence
see
morgan
n't
night
sleep
leave
country
find
think
little
time
winchester
lean
shovel
when
south
walk
house
woman
``
-PRON-
n't
man
clayton
gavin
try
go
be
say
leave
big
charlie
chair
rock
life
shake
pain
's
lip
do
stick
just
mean
fight
fair
clear
want
come
-PRON-
``
mike
dean
fiske
susan
kiss
the
inside
julia
turn
say
will
talk
horse
try
ride
show
rifle
pistol
bayonet
belt
man
join
feel
evidently
guard
hand
n't
``
-PRON-
war
be
say
the
large
party
time
sioux
people
mr.
manuel
go
n't
come
jason
look
wrong
montero
shout
fort
there
cook
then
little
that
oso
like
and
old
knife
's
get
see
cheyennes
nations
letter
white
year
``
-PRON-
be
say
n't
fire
wilson
big
m
tell
take
quirt
let
wrist
burn
right
the
and
appreciate
sure
leave
morning
long
time
but
place
smile
carwood
know
girl
-PRON-
hall
man
counter
the
clerk
time
go
afternoon
job
when
authority
hang
speak
baldness
neck
underneath
person
slip
scrap
notice
feel
's
``
-PRON-
man
the
's
barton
ran

right
the
stevens
$
sing
strike
hoosegow
inmate
demand
week
fringe
benefit
state
day
convict
cell
iron
bar
agreement
cent
lock
federal
time
union
approval
death
harold
old
brother
baseball
dollar
aristotle
contemplate
bust
homer
heel
shoe
in
the
style
find
texture
white
summer
leather
toe
squared
place
color
throat
straw
weave
design
motor
important
light
unlined
crushed
high
popular
dressy
pump
shade
pastel
hue
tintable
crisp
natural
lacey
open
finish
black
casual
comfort
blue
at
electric
soon
brush
small
clean
-PRON-
old
homecoming
day
path
feel
autumn
``
grad
opponent
course
say
christian
throw
lion
usually
happy
stroll
smell
recall
fall
undergraduate
moment
come
pass
suddenly
meet
recognize
ask
``
development
tax
industrial
the
community
-PRON-
east
greenwich
area
density
industry
providence
problem
suggest
planning
this
land
meet
indicate
try
new
attempt
balance
medium
low
residential
modest
cost
way
exemption
matter
1960
rate
in
attract
grow
municipal
purpose
``
short
lousy
peace

-PRON-
city
the
give
attacker
destroy
rocket
ballistic
advantage
opponent
nation
destruction
mean
energy
long
complete
``
military
national
win
war
know
's
schnabel
the
music
``
great
but
symphonic
-PRON-
schubert
trout
quintet
student
musical
present
player
double
bass
conventional
give
performance
rendition
interpretation
reading
strike
approach
listen
version
playing
teacher
master
trill
class
dog
long
junior
year
mrs.
``
westminster
juniors
show
finals
entry
-PRON-
jr.
ring
competition
far
present
speaker
william
h.
speak
welcome
state
know
feel
experience
betsey
win
walk
when
fun
way
want
right
attitude
take
boat
water
lake
people
body
find
mile
trailer
gear
time
united
states
today
recreational
use
nation
sea
operate
the
area
river
result
kind
provide
create
texoma
engineers
reservoir
bring
mean
$
load
trip
``
lock
signal
tappet
bar
lever
point
clear
the
move
pull
switch
notch
dog
turnout
throw
mechanical
interlock
frame
track
arrangement
in
set
prevent
operator
connect
if
short


man
's
captain
voyage
the
discovery
sea
find
time
able
hudson
company
1610
arctic
water
north
seventeen
month
board
go
what
great
year
highly
second
picture
determine
american
half
this
east
in
muscovy
dutch
red
selkirk
great
the
river
settlement
douglas
settle
valley
's
southward
scots
fort
north
empire
hudson
bay
company
mile
assiniboine
october
year
group
york
factory
american
``
man
1812
little
swiss
mercenary
war
settler
plot
late
1818
``
-PRON-
a
write
like
letter
humor
rich
find
man
gray
another
report
house
expression
hungry
horse
corner
god
receive
girl
home
love
wife
forgit
yank
american
nationalism
folklore
america
popularity
century
history
action
point
literature
the
proportion
emphasis
influence
world
twentieth
historian
fact
international
legend
contemporary
occur
country
hope
-PRON-
group
national
identification
apply
spread
dominion
palm
pine
united
states
year
society
course
personal
's
-PRON-
``
o'banion
the
right
arrangement
torrio
irish
's
gun
pocket
left
kill
poli

negro
beat
store
burn
anne
arundel
secret
-PRON-
submarine
's
dreadnought
united
states
sub
the
trial
british
bail
london
atomic
navy
build
nuclear
steal
testify
year
american
hold
court
end
a
miss
gee
mystery
man
moscow
couple
in
krogers
week
dot
plead
80
grant
``
old
``
-PRON-
man
street
but
the
youth
come
church
miss
sound
smell
wrap
or
call
be
country
unusual
go
hand
give
dead
n't
look
think
torino
-PRON-
maggie
baby
``
's
n't
stuart
watch
take
people
head
in
food
lug
father
thing
soap
find
eugenia
say
good
``
-PRON-
tolley
go
will
be
's
picture
yes
laban
everybody
frank
n't
read
tenant
this
idea
jenny
laugh
mamma
stay
kizzie
let
want
what
room
tell
marry
dress
god
man
woman
-PRON-
``
henrietta
doaty
feel
's
leave
find
good
know
adelia
mama
charles
wish
run
away
perhaps
think
there
hetty
small
papa
enormously
safe
poor
mother
hand
go
choice
no
temptation
-PRON-
japanese
man
girl
tommy
the
momoyama
come
miyagi
prefecture
main
island
people
ancestor
white
eye
tall
high
land
request
d

In [16]:
bow_features['article_words500'] = labels_df['article_words500'].copy()

In [19]:
bow_train, bow_valid = train_test_split(bow_features, random_state=24, stratify=bow_features['genre'])

bow_train = bow_train.reset_index(drop=True)
bow_valid = bow_valid.reset_index(drop=True)

tfidf_features = tfidf_it(bow_train) # fit_transform
tfidf_valid = tfidf_validation(bow_valid) # transform

tfbow_train = pd.concat([bow_train, tfidf_features], ignore_index=False, axis=1)
tfbow_valid = pd.concat([bow_valid, tfidf_valid], ignore_index=False, axis=1)

- bow_train = BoW for training set
- bow_valid = BoW for the validation set
- tfidf_features = tf-idf for the training set
- tfidf_valid = tf-idf for the validation set
- tfbow_train = both for training set
- tfbow_valid = both for validation set

In [20]:
# Bag of Words Features (still contains some non-feature columns)
bow_train.to_csv("bow_train.csv")
bow_valid.to_csv("bow_valid.csv")

# tf-idf Features
tfidf_features.to_csv("tfidf_features.csv")
tfidf_valid.to_csv("tfidf_valid.csv")

# Bag of Words & tf-idf Features
tfbow_train.to_csv("tfbow_train.csv")
tfbow_valid.to_csv("tfbow_valid.csv")

# Clustering

In [21]:
y = bow_train['genre']

### Select K-Best, K=400

In [22]:
kbest = SelectKBest(chi2, k=400)

bow_400_best = kbest.fit_transform(bow_train.drop(columns=['genre', 'text', 'article_words500']), y)
tfidf_400_best = kbest.fit_transform(tfidf_features, y)
tfbow_400_best = kbest.fit_transform(tfbow_train.drop(columns=['genre', 'text', 'article_words500']), y)

In [23]:
best_features_list = ['BoW Features', 'tf-idf Features', 'tf-idf & BoW Features']
bf_dict = {'BoW Features' : bow_400_best,
            'tf-idf Features' : tfidf_400_best,
            'tf-idf & BoW Features' : tfbow_400_best}

## K-Means

In [24]:
kmeans = KMeans(n_clusters=11, random_state=24)
for feature_list in best_features_list:
    y_pred = kmeans.fit_predict(normalize(bf_dict[feature_list]))
    print('K-Means Adjusted Rand Score for {}:'.format(feature_list), adjusted_rand_score(y, y_pred))

K-Means Adjusted Rand Score for BoW Features: 0.1097160859041838
K-Means Adjusted Rand Score for tf-idf Features: 0.1406419396169388
K-Means Adjusted Rand Score for tf-idf & BoW Features: 0.1097160859041838


## Mini Batch K-Means

In [25]:
mbkmeans = MiniBatchKMeans(n_clusters=11, random_state=24, batch_size=1000)
for feature_list in best_features_list:
    y_pred = mbkmeans.fit_predict(normalize(bf_dict[feature_list]))
    print('Mini Batch K-Means Adjusted Rand Score for {}:'.format(feature_list), adjusted_rand_score(y, y_pred))

Mini Batch K-Means Adjusted Rand Score for BoW Features: 0.09627366355274389
Mini Batch K-Means Adjusted Rand Score for tf-idf Features: 0.15979784193296342
Mini Batch K-Means Adjusted Rand Score for tf-idf & BoW Features: 0.09627366355274389


## K-Means & Mini Batch Summary
- tf-idf seems to generate more accurate clusters than BoW
- The clusters are highly inconsistent from one run to the next if not for specifying 'random_state'
- Using more features is better to a certain point. Performance drops beyond 450 features.

## Spectral Clustering

In [26]:
sc = SpectralClustering(n_clusters=11, random_state=24)
for feature_list in best_features_list:
    y_pred = sc.fit_predict(normalize(bf_dict[feature_list]))
    print('Spectral Clustering Adjusted Rand Score for {}:'.format(feature_list), adjusted_rand_score(y, y_pred))

Spectral Clustering Adjusted Rand Score for BoW Features: 0.0999118660363753
Spectral Clustering Adjusted Rand Score for tf-idf Features: 0.07231433381590929
Spectral Clustering Adjusted Rand Score for tf-idf & BoW Features: 0.0999118660363753


## Affinity Propogation

In [27]:
ap = AffinityPropagation(damping=.999999)
for feature_list in best_features_list:
    y_pred = ap.fit_predict(normalize(bf_dict[feature_list]))
    print('Affinity Propogation Adjusted Rand Score for {}:'.format(feature_list), adjusted_rand_score(y, y_pred))

Affinity Propogation Adjusted Rand Score for BoW Features: 0.04893242509308154
Affinity Propogation Adjusted Rand Score for tf-idf Features: 0.0
Affinity Propogation Adjusted Rand Score for tf-idf & BoW Features: 0.04893242509308154




## Spectral Clustering & Affinity Propogation Summary
- Spectral clustering was comparably effective to k-means. I will have to try that out.
- Affinity Propogation seems the wrong method for this task

## Saving best clusters for use in other Notebook

In [28]:
# Mini Batch K-Means w/ tf-idf for the Training set
best_clusters_train = pd.DataFrame(mbkmeans.fit_predict(normalize(tfidf_400_best)))
best_clusters_train.columns = ['mbkmeans_tfidf_clusters']
best_clusters_train.to_csv('best_clusters_train.csv')

In [30]:
# Mini Batch K-Means w/ tf-idf for the Validation set
tfidf_400_valid = kbest.fit_transform(tfidf_valid, bow_valid['genre'])
best_clusters_valid = pd.DataFrame(mbkmeans.predict(normalize(tfidf_400_valid)))
best_clusters_valid.columns = ['mbkmeans_tfidf_clusters']
best_clusters_valid.to_csv('best_clusters_valid.csv')

- bow_train = BoW for training set
- bow_valid = BoW for the validation set
- tfidf_features = tf-idf for the training set
- tfidf_valid = tf-idf for the validation set
- tfbow_train = both for training set
- tfbow_valid = both for validation set

- best_clusters_train = clusters for the training set
- best_clusters_valid = clusters for the validation set