In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import spacy

In [3]:
# Load spaCy's English pipeline once; `nlp` is reused by every parsing step
# further down (single-document parse and the nlp.pipe batch loops).
nlp = spacy.load('en')

In [4]:
# Read the review table and keep the text of the row labelled 0
# as the working sample for the inspection cells below.
df = pd.read_csv('content.csv')
sample = df['content'][0]

In [5]:
# Run the spaCy pipeline on the sample review; the following cells read
# sentences, entities and per-token attributes from this parsed document.
parsed_review = nlp(sample)

In [6]:
# Walk the sentence spans spaCy detected and print each one, numbered from 1.
for position, sent in enumerate(parsed_review.sents, start=1):
    print('Sentence {}:'.format(position))
    print(sent)
    print("")

Sentence 1:
“Trip-hop” eventually became a ’90s punchline, a music-press shorthand for “overhyped hotel lounge music.

Sentence 2:
” But today, the much-maligned subgenre almost feels like a secret precedent.

Sentence 3:
Listen to any of the canonical Bristol-scene albums of the mid-late ’90s, when the genre was starting to chafe against its boundaries, and you’d think the claustrophobic, anxious 21st century started a few years ahead of schedule.

Sentence 4:
Looked at from the right angle, trip-hop is part of an unbroken chain that runs from the abrasion of ’80s post-punk to the ruminative pop-R&B-dance fusion of the moment. 

Sentence 5:
The best of it has aged far more gracefully (and forcefully) than anything recorded in the waning days of the record industry’s pre-filesharing monomania has any right to.

Sentence 6:
Tricky rebelled against being attached at the hip to a scene he was already looking to shed and decamped for Jamaica to record a more aggressive, bristling-energy mu

In [7]:
# List every named entity span with its predicted label, numbered from 1.
for position, ent in enumerate(parsed_review.ents, start=1):
    print('Entity {}:'.format(position), ent, '-', ent.label_)
    print('')

Entity 1: today - DATE

Entity 2: Bristol - GPE

Entity 3: 21st century - DATE

Entity 4: Jamaica - GPE

Entity 5: ’96 - GPE

Entity 6: two decades - DATE

Entity 7: two weeks - DATE

Entity 8: Portishead’s - ORG

Entity 9: Beth Gibbons - PERSON

Entity 10: Geoff Barrow - PERSON

Entity 11: The Conversation’s Gene Hackman - WORK_OF_ART

Entity 12: Mezzanine - PERSON

Entity 13: third - ORDINAL

Entity 14: Bristol - GPE

Entity 15: Tricky’s - PERSON

Entity 16: Portishead’s - PERSON

Entity 17: Mezzanine - PERSON

Entity 18: “Inertia Creeps - ORG

Entity 19: two - CARDINAL

Entity 20: four - CARDINAL

Entity 21: Robert “3D - PERSON

Entity 22: Del Naja—is - PERSON

Entity 23: Sarah Jay - PERSON

Entity 24: Risingson - PERSON

Entity 25: Grant “ - PERSON

Entity 26: Marshall - PERSON

Entity 27: Mezzanine - NORP

Entity 28: first - ORDINAL

Entity 29: Horace Andy  - PERSON

Entity 30: three - CARDINAL

Entity 31: early-’70s - CARDINAL

Entity 32: Angel” - PERSON

Entity 33: 1973 - DATE



In [8]:
# Tokens next to their coarse part-of-speech tags.
# `token_text` is kept as a separate list because later cells reuse it.
token_text = [tok.orth_ for tok in parsed_review]
token_pos = [tok.pos_ for tok in parsed_review]

token_df = pd.DataFrame({'token_text': token_text,
                         'part_of_speech': token_pos},
                        columns=['token_text', 'part_of_speech'])
token_df.head()

Unnamed: 0,token_text,part_of_speech
0,“,PUNCT
1,Trip,NOUN
2,-,PUNCT
3,hop,NOUN
4,”,PUNCT


In [9]:
# Normalisation view: each token beside its lemma and its orthographic shape
# (e.g. 'Trip' -> 'Xxxx').
token_lemma = [tok.lemma_ for tok in parsed_review]
token_shape = [tok.shape_ for tok in parsed_review]

normalization_rows = list(zip(token_text, token_lemma, token_shape))
nor_df = pd.DataFrame(normalization_rows,
                      columns=['token_text', 'token_lemma', 'token_shape'])
nor_df.head(20)

Unnamed: 0,token_text,token_lemma,token_shape
0,“,"""",“
1,Trip,trip,Xxxx
2,-,-,-
3,hop,hop,xxx
4,”,"""",”
5,eventually,eventually,xxxx
6,became,become,xxxx
7,a,a,x
8,’90s,’90s,’ddx
9,punchline,punchline,xxxx


In [10]:
# Token-level entity view: entity type plus the inside/outside/begin tag
# for every token of the sample review.
token_entity_type = [tok.ent_type_ for tok in parsed_review]
token_entity_iob = [tok.ent_iob_ for tok in parsed_review]

entity_rows = list(zip(token_text, token_entity_type, token_entity_iob))
ent_df = pd.DataFrame(entity_rows,
                      columns=['token_text', 'entity_type', 'inside_outside_begin'])
ent_df.head(20)

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,“,,O
1,Trip,,O
2,-,,O
3,hop,,O
4,”,,O
5,eventually,,O
6,became,,O
7,a,,O
8,’90s,,O
9,punchline,,O


In [11]:
# (spaCy token attribute, column label) pairs, in display order
attribute_columns = [('orth_', 'text'),
                     ('prob', 'log_proba'),
                     ('is_stop', 'stop?'),
                     ('is_punct', 'punctuation?'),
                     ('is_space', 'whitespace?'),
                     ('like_num', 'number?'),
                     ('is_oov', 'out of vocab.?')]

# one tuple of attribute values per token of the sample review
token_attributes = [tuple(getattr(token, attribute)
                          for attribute, label in attribute_columns)
                    for token in parsed_review]

df1 = pd.DataFrame(token_attributes,
                   columns=[label for attribute, label in attribute_columns])

def fill_in(x):
    """
    helper function to turn a truthy / falsy value
    into the label 'Yes' / 'No'
    """
    return 'Yes' if x else 'No'

# Render the flag columns as 'Yes' / blank so the table is easier to scan.
flag_columns = slice('stop?', 'out of vocab.?')
yes_or_blank = df1.loc[:, flag_columns].applymap(lambda flag: u'Yes' if flag else u'')
df1.loc[:, flag_columns] = yes_or_blank
df1.head(30)

Unnamed: 0,text,log_proba,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,“,-9.795314,,Yes,,,
1,Trip,-13.672223,,,,,
2,-,-5.468655,,Yes,,,
3,hop,-10.939725,,,,,
4,”,-9.812149,,Yes,,,
5,eventually,-9.494384,,,,,
6,became,-9.81051,Yes,,,,
7,a,-3.929788,Yes,,,,
8,’90s,-18.391684,,,,,
9,punchline,-12.871519,,,,,


In [12]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence



In [13]:
def punct_space(token):
    """
    helper predicate: truthy when the token is
    nothing but punctuation or whitespace
    """
    is_punct = token.is_punct
    return is_punct if is_punct else token.is_space

def line_review(filename):
    """
    generator function: yields the file's lines one at a time
    (each line is one review), turning every escaped line break
    (backslash + 'n') back into a real newline
    """
    escaped_newline, real_newline = '\\n', '\n'

    reader = codecs.open(filename, encoding='utf_8')
    with reader:
        for raw_line in reader:
            yield raw_line.replace(escaped_newline, real_newline)
            
def lemmatized_sentence_corpus(filename):
    """
    generator function: streams the reviews in `filename` through
    spaCy and yields one string per sentence, made of the
    space-joined lemmas of its non-punctuation, non-whitespace tokens
    """
    parsed_reviews = nlp.pipe(line_review(filename),
                              batch_size=10000, n_threads=4)

    for doc in parsed_reviews:
        for sent in doc.sents:
            lemmas = [tok.lemma_ for tok in sent if not punct_space(tok)]
            yield u' '.join(lemmas)

In [14]:
import os
# raw review text, one review per line
review_text_path = 'review_text.txt'

In [15]:
# lemmatized sentences, one per line (written and read by the cells below)
unigram_sentences = 'unigram_sentences_all.txt'

In [16]:
import codecs

# Lemmatizing the whole corpus is slow, so the sentence file acts as a cache:
# build it only when it is missing (delete the file to force a rebuild).
# The previous `if 0 == 1:` guard never ran, so a fresh checkout had no file
# for the LineSentence reader in the next cell.
if not os.path.exists(unigram_sentences):
    with codecs.open(unigram_sentences, 'w', encoding='utf-8') as f:
        for sentence in lemmatized_sentence_corpus(review_text_path):
            f.write(sentence + '\n')

In [17]:
# Wrap the lemmatized sentence file in gensim's LineSentence; it is iterated
# several times below (preview, phrase-model training, bigram transform).
unigram_sentences_parser = LineSentence(unigram_sentences)

In [18]:
import itertools as it
# Preview the first ten lemmatized sentences, separated by blank lines.
for tokens in it.islice(unigram_sentences_parser, 10):
    print(u' '.join(tokens) + u'\n')

trip hop eventually become a ’90s punchline a music press shorthand for overhyped hotel lounge music

but today the much malign subgenre almost feel like a secret precedent

listen to any of the canonical bristol scene album of the mid late ’90s when the genre be start to chafe against its boundary and you’d think the claustrophobic anxious 21st century start a few year ahead of schedule

look at from the right angle trip hop be part of an unbroken chain that run from the abrasion of ’80s post punk to the ruminative pop r&b dance fusion of the moment

the best of it have age far more gracefully and forcefully than anything record in the waning day of the record industry ’s pre filesharing monomania have any right to

tricky rebel against be attach at the hip to a scene he be already look to shed and decamp for jamaica to record a more aggressive bristle energy mutation of his style in ’96 the name pre millennium tension be the only obvious thing that tell you it ’s two decade old rathe

In [19]:
# where the trained bigram phrase model is persisted
bigram_model_filepath = 'bigram_model_all'

In [20]:
# Train a phrase model on the unigram sentences; token pairs it accepts are
# later joined with "_" (e.g. "trip_hop" in the preview further down).
bigram_model = Phrases(unigram_sentences_parser)
# Persist the trained model so it can be reloaded without retraining.
bigram_model.save(bigram_model_filepath)

In [21]:
# Reload the phrase model from the file written by the previous cell, so the
# remaining cells work from the on-disk artifact.
bigram_model = Phrases.load(bigram_model_filepath)

In [22]:
# bigram-transformed sentences, one per line
bigram_sentences_filepath = 'bigram_sentences_all.txt'

In [23]:
# Apply the bigram phrase model sentence by sentence and write each
# transformed sentence on its own line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf-8') as f:
    for unigram_sentence in unigram_sentences_parser:
        f.write(u' '.join(bigram_model[unigram_sentence]) + '\n')



In [24]:
# Stream the bigram-transformed sentences back from disk; reused below for
# the preview, trigram-model training and the trigram transform.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [25]:
# Preview the first ten bigram-transformed sentences, separated by blank lines.
for tokens in it.islice(bigram_sentences, 10):
    print(u' '.join(tokens) + u'\n')

trip_hop eventually_become a ’90s punchline a music press shorthand_for overhyped hotel_lounge music

but today the much_malign subgenre almost feel_like a secret precedent

listen to any of the canonical bristol scene album of the mid late_’90s when the genre be start to chafe_against its boundary and you’d_think the claustrophobic anxious 21st_century start a few_year ahead of schedule

look_at from the right_angle trip_hop be part of an unbroken chain that run from the abrasion of ’80s post_punk to the ruminative pop r&b dance fusion of the moment

the best of it have age far more gracefully and forcefully than_anything record in the waning_day of the record industry ’s pre filesharing monomania have any right to

tricky rebel_against be attach at the hip to a scene he be already look to shed and decamp for jamaica to record a more aggressive bristle energy mutation of his style in ’96 the name pre_millennium tension be the only obvious thing that tell you it ’s two_decade old rathe

In [26]:
# where the trained trigram phrase model is persisted
trigram_model_filepath = 'trigram_model_all'

In [27]:
# Second phrase pass: train on the already bigram-joined sentences so that
# existing phrases can merge with a neighbouring token or phrase.
trigram_model = Phrases(bigram_sentences)

# Persist the trained model ...
trigram_model.save(trigram_model_filepath)

# ... and reload it, so the following cells use the on-disk artifact.
trigram_model = Phrases.load(trigram_model_filepath)

In [28]:
# trigram-transformed sentences, one per line
trigram_sentences_filepath = 'trigram_sentences_all.txt'

In [None]:
# Apply the trigram phrase model to every bigram sentence and write each
# transformed sentence on its own line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for tokens in bigram_sentences:
        f.write(u' '.join(trigram_model[tokens]) + '\n')

In [29]:
# Stream the trigram-transformed sentences back from disk for the preview below.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [30]:
# Preview the first ten trigram-transformed sentences, separated by blank lines.
for tokens in it.islice(trigram_sentences, 10):
    print(u' '.join(tokens) + '\n')

trip_hop eventually_become a ’90s punchline a music press shorthand_for overhyped hotel_lounge music

but today the much_malign subgenre almost feel_like a secret precedent

listen to any of the canonical bristol scene album of the mid late_’90s when the genre be start to chafe_against its boundary and you’d_think the claustrophobic anxious 21st_century start a few_year ahead of schedule

look_at from the right_angle trip_hop be part of an_unbroken chain that run from the abrasion of ’80s post_punk to the ruminative pop r&b dance fusion of the moment

the best of it have age far_more gracefully and forcefully than_anything record in the waning_day of the record_industry ’s pre filesharing monomania have any right to

tricky rebel_against be attach at the hip to a scene he be already look to shed and decamp for jamaica to record a more_aggressive bristle energy mutation of his style in ’96 the name pre_millennium tension be the only obvious thing that tell_you it ’s two_decade old rathe

In [31]:
# fully transformed reviews (lemmatized, phrase-joined, stop words removed),
# one review per line
trigram_reviews_filepath = 'trigram_transformed_reviews.txt'

In [58]:
# Flag the possessive clitic "'s" as a stop word in the spaCy vocabulary.
# NOTE(review): the review-transform cell below filters on membership in
# spacy.en.STOPWORDS, not on token.is_stop, so this flag does not appear to
# influence that filter — confirm whether it is still needed.
nlp.vocab["'s"].is_stop = True

In [None]:
# Transform every review end to end and write one transformed review per line:
#   lemmatize -> drop punctuation/whitespace -> bigram phrases -> trigram
#   phrases -> drop stop words.
# The loop variable is deliberately NOT called `parsed_review`: that name holds
# the sample document parsed near the top of the notebook, and overwriting it
# here would silently change what the inspection cells show when re-run.
stop_words = spacy.en.STOPWORDS  # looked up once, outside the loop

with codecs.open(trigram_reviews_filepath, 'w', encoding='utf-8') as f:
    for review_doc in nlp.pipe(line_review(review_text_path), batch_size=10000, n_threads=4):
        unigram_review = [token.lemma_ for token in review_doc if not punct_space(token)]

        # phrase models are applied in the order they were trained
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # stop words are removed only after phrase detection, so joined
        # phrases such as "sound_like" survive the filter
        kept_terms = [term for term in trigram_review if term not in stop_words]

        f.write(u' '.join(kept_terms) + '\n')



In [None]:
# Show one review (zero-based line 11 of each file) before and after the
# transformation.
review_index = 11

print(u'Original:' +u'\n')
for original_review in it.islice(line_review(review_text_path), review_index, review_index + 1):
    print(original_review)

print(u'-------' + u'\n')
print(u'Transformed:' + u'\n')
with codecs.open(trigram_reviews_filepath, encoding='utf-8') as f:
    for transformed_review in it.islice(f, review_index, review_index + 1):
        print(transformed_review)

In [34]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=PendingDeprecationWarning)
    import pyLDAvis
    import pyLDAvis.gensim

    from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import _pickle as pickle

In [35]:
import os
# where the gensim dictionary built from the transformed reviews is persisted
trigram_dictionary_filepath = 'trigram_dict_all.dict'

In [49]:
# Stream the transformed reviews (one whitespace-tokenised review per line).
trigram_reviews = LineSentence(trigram_reviews_filepath)

# Learn the vocabulary by iterating over every review.
trigram_dictionary = Dictionary(trigram_reviews)

# Drop terms that are too rare or too common to be useful for topic
# modelling (thresholds passed as no_below=10, no_above=0.4).
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)

# Remove leftover apostrophe fragments. Dictionary.filter_tokens() expects
# integer token ids, not token strings: passing the strings themselves matches
# nothing and leaves the fragments in the vocabulary. Translate tokens to ids
# first, skipping any that filter_extremes() already removed.
unwanted_tokens = (u"'", u"'s", u"it_'")
unwanted_ids = [trigram_dictionary.token2id[token]
                for token in unwanted_tokens
                if token in trigram_dictionary.token2id]
trigram_dictionary.filter_tokens(bad_ids=unwanted_ids)

# Reassign ids so they are contiguous after the removals.
trigram_dictionary.compactify()

# Persist, then reload so later cells use the on-disk artifact.
trigram_dictionary.save(trigram_dictionary_filepath)

trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [50]:
# where the serialized bag-of-words corpus is stored
trigram_bow_filepath = 'trigram_bow_corpus_all.mm'

In [51]:
def trigram_bow_generator(filepath):
    """
    generator function: reads the reviews in `filepath` one line at a
    time and yields the bag-of-words representation that the
    module-level `trigram_dictionary` produces for each of them
    """
    reviews = LineSentence(filepath)

    for tokens in reviews:
        bag_of_words = trigram_dictionary.doc2bow(tokens)
        yield bag_of_words

In [52]:
# Serialize one bag-of-words vector per review to trigram_bow_filepath ...
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_reviews_filepath))

# ... then reopen that file as a corpus object for the LDA training cell.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [53]:
# where the trained LDA model is persisted
lda_model_filepath = 'lda_model_all'

In [54]:
# LDA starts from a random initialisation, so without a fixed seed the topic
# numbering changes on every run, while a later cell inspects a hard-coded
# topic number. Pin the seed to keep topic ids as repeatable as the
# multi-worker trainer allows.
LDA_RANDOM_STATE = 42

with warnings.catch_warnings():
    # silence library warnings emitted during training
    warnings.simplefilter('ignore')

    # 50 topics over the serialized bag-of-words corpus, using 3 workers
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=50,
                       id2word=trigram_dictionary,
                       workers=3,
                       random_state=LDA_RANDOM_STATE)

# Persist, then reload so later cells use the on-disk artifact.
lda.save(lda_model_filepath)

lda = LdaMulticore.load(lda_model_filepath)

In [55]:
def explore_topic(topic_number, topn=25):
    """
    print the highest-weighted terms of one topic of the
    module-level `lda` model as a two-column table

    topic_number: index of the topic to display
    topn: how many terms to list (default 25)
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # honour the caller's `topn`; it used to be ignored in favour of a
    # hard-coded 25
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))



In [56]:
# Print the top terms of topic 29 (the number is an arbitrary topic index of
# the trained model, not a meaningful label).
explore_topic(topic_number=29)

term                 frequency

he_'                 0.003
’s                   0.003
love                 0.003
they_'re             0.002
little               0.002
there_'              0.002
try                  0.002
voice                0.002
pop                  0.002
set                  0.002
hear                 0.002
mix                  0.002
know                 0.002
lyric                0.002
kind                 0.002
sing                 0.002
style                0.002
hip_hop              0.002
rock                 0.002
beat                 0.002
sound_like           0.002
moment               0.002
place                0.001
year                 0.001
turn                 0.001


In [57]:
# Echo the dictionary path as the cell's output.
trigram_dictionary_filepath

'trigram_dict_all.dict'