In [1]:
import json
from collections import Counter, defaultdict
import scipy.sparse as sp
import numpy as np
import pandas as pd
import nltk
import re
import os
import copy
import codecs
from sklearn import feature_extraction
import string

In [58]:
# English stop-word list from NLTK, held as a set.
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available locally - confirm.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Load NLTK's SnowballStemmer and bind an English instance to the variable `stemmer`;
# tokenize_and_stem() calls stemmer.stem() on every token it keeps.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [10]:
def load_df_from_json(path='lyrics_json_09-18/'):
    """Load every lyrics JSON file under ``path`` into a single DataFrame.

    Each file is parsed with ``json.load`` and converted to a DataFrame, and
    rows containing missing values are dropped.  The part of the file name
    after the last ``_`` (and before the first ``.``) is parsed as a timestamp
    and stored in a ``time`` column for every row coming from that file.

    Parameters
    ----------
    path : str, optional
        Directory holding the JSON files.  Defaults to the directory this
        notebook originally hard-coded.  A trailing slash is not required.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames, in sorted file-name order.
        The per-file row indices are kept as-is (no re-indexing).

    Raises
    ------
    ValueError
        If ``path`` contains no loadable files.
    """
    frames = []
    # os.listdir() returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible across runs and machines.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Hidden entries (.DS_Store, .ipynb_checkpoints, ...) and
        # sub-directories are not data files and would crash the loader.
        if name.startswith('.') or not os.path.isfile(full_path):
            continue
        date = name.split('.')[0].split('_')[-1]
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(full_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
        frames.append(
            pd.DataFrame(records).dropna().assign(time=pd.Timestamp(date))
        )
    if not frames:
        # pd.concat([]) raises a ValueError too, but with an opaque message.
        raise ValueError('No lyrics JSON files found in %r' % (path,))
    return pd.concat(frames)

In [51]:
def preprocess(lyric):
    """Normalise a raw lyric string for downstream tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. remove bracketed annotations such as ``[Chorus]`` (non-greedy, so
         each ``[...]`` on a line is removed separately);
      3. strip the contractions ``'s`` and ``'ve`` and then delete any
         remaining apostrophes, so ``don't`` becomes ``dont``;
      4. replace every character in ``string.punctuation`` (including ``-``)
         with a single space.

    Typographic apostrophes (U+2018 / U+2019) are folded to the ASCII
    apostrophe first; otherwise words like ``don’t`` would skip step 3 and
    keep the quote character, because it is not in ``string.punctuation``.

    Parameters
    ----------
    lyric : str
        Raw lyric text.

    Returns
    -------
    str
        The cleaned text.  Whitespace is not collapsed, so removed
        punctuation may leave runs of spaces.
    """
    lyric = lyric.lower()
    lyric = re.sub(r'\[.*?\]', '', lyric)  # remove [*] pattern

    # Treat curly single quotes exactly like the ASCII apostrophe.
    lyric = lyric.replace('\u2019', "'").replace('\u2018', "'")

    # Order matters: the bare apostrophe must be removed last, otherwise the
    # "'s" / "'ve" suffixes would no longer be recognisable.
    for fragment in ("'s", "'ve", "'"):
        lyric = lyric.replace(fragment, '')

    # Every remaining punctuation mark (hyphens included) becomes a space.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return lyric.translate(translator)

def words2sentence(words):
    """Join an iterable of word strings into one space-separated string."""
    separator = ' '
    sentence = separator.join(words)
    return sentence

In [55]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems.

    The text is first split into sentences and each sentence into words, so
    that punctuation ends up as its own token.  Tokens that contain no ASCII
    letter (numbers, bare punctuation) are discarded; every remaining token
    is stemmed with the module-level ``stemmer``.

    Returns a list of stems in the order the tokens appear in ``text``.
    """
    contains_letter = re.compile('[a-zA-Z]').search
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one letter.
            if contains_letter(word):
                stems.append(stemmer.stem(word))
    return stems

In [None]:
# Load the raw lyrics once and keep them untouched; all further work is done
# on an independent deep copy named `billboard`.
raw_data = load_df_from_json()
billboard = raw_data.copy(deep=True)
# print(billboard)

In [65]:
# billboard['lyrics'] = billboard['lyrics'].apply(lambda x: preprocess(x))
# # billboard['lyrics'] = billboard['lyrics'].apply(lambda x: [w for w in x if w not in stopwords])
# # billboard['lyrics'] = billboard['lyrics'].apply(lambda x: words2sentence(x))
# print(billboard.head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(billboard['lyrics']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()
print(terms)