In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import feather
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from multiprocessing import Pool
from nltk.stem import WordNetLemmatizer
import json

In [2]:
dat = pickle.load(open("./grouped_w_lang.pkl", 'rb'))

In [3]:
dat = dat.query('lang == "en"')

In [4]:
stops = set(stopwords.words('english'))

In [None]:
# from IPython import display
# tokens = []
# w = 0
# for t in dat['title']:
#     if t is None:
#         tokens += [[]]
#     else:
#         tokens += [word_tokenize(t)]
#     display.clear_output(wait=True)
#     w += 1
#     print("at sentence {0}".format(w))    

def cond_tokenize(t):
    if t is None:
        return []
    else:
        return word_tokenize(t)

p = Pool(8)
tokens = list(p.imap(cond_tokenize, dat['title']))
p.close()

In [5]:
# pickle.dump(tokens, open("tokens_larger.pkl", "wb"))
tokens = pickle.load(open("tokens_larger.pkl", "rb"))

In [6]:
lem = WordNetLemmatizer()

In [7]:
pure_tokens = [" ".join([lem.lemmatize(w.lower()) for w in sent if w.lower() not in stops and re.match("\w+", w)])\
               for sent in tokens]

In [8]:
pure_tokens[:10]

['carlos santana present cindy blackman mexico city',
 "taoist monk tian xin brazil wudang sanfeng t'ai chi exercise",
 'makeup guy actor tv host camera expert',
 'pet rescue saga level 539 2 star boost',
 'concealed conclusion stage 2c theme western child thc version',
 'grfs goblin gun set review',
 'get free money diamond hayday game gem clash clan',
 'lovely bone 2009 um olhar paraíso behind scene',
 'kingdom hearts- behemoth expert',
 'grand theft auto 5 walkthrough info feedback needed']

### Test

In [9]:
vectorizer = TfidfVectorizer(min_df=1, max_features=5000)
tdidf = vectorizer.fit_transform(pure_tokens)

In [11]:
tsvd = TruncatedSVD(n_components=500)

In [12]:
fitted = tsvd.fit_transform(tdidf)

In [14]:
component_maxes = np.apply_along_axis(np.argmax, 1, tsvd.components_)

In [15]:
rev = {v:k for k, v in vectorizer.vocabulary_.items()}

In [16]:
print([rev[x] for x in component_maxes])

['video', 'part', 'part', 'trailer', 'live', 'review', 'dance', 'new', 'new', 'hd', '2012', '2013', 'world', '2014', '2011', 'love', 'de', 'tutorial', 'day', 'school', 'game', '2010', 'dj', 'game', 'show', 'minecraft', 'make', 'song', 'music', 'guitar', 'one', 'highlight', 'war', 'play', 'com', 'gameplay', 'band', 'best', '2009', 'best', 'first', 'final', '2015', '2015', 'gta', 'christmas', 'unboxing', 'wmv', 'call', 'episode', 'get', 'life', 'life', 'test', 'piano', 'super', '12', 'night', 'tour', 'time', 'hair', 'city', 'car', 'concert', 'battle', 'movie', 'iphone', 'movie', 'tribute', 'tv', 'tv', 'go', 'feat', 'rock', '11', 'la', 'speed', 'year', 'man', 'ft', 'heart', 'girl', 'free', 'story', 'red', '15', 'mod', 'interview', 'house', 'theme', 'house', 'version', 'star', 'theme', 'vs', 'club', 'blue', 'park', 'montage', 'remix', 'king', 'art', '13', '13', 'football', 'boy', 'fire', 'football', 'road', 'wedding', 'light', 'walkthrough', 'full', '2007', 'back', 'home', 'style', 'style'

In [17]:
np.sum(tsvd.explained_variance_ratio_)

0.39506954546126372

In [30]:
# for i in range(20):
#     print([rev[x] for x in tsvd.components_[i].argsort()[-10:][::-1]])

In [19]:
feature_names = range(500)
fs, ps = f_regression(fitted, np.log(dat['views']), center=False)
reg = LinearRegression(n_jobs=4)
reg.fit(fitted, np.log(dat['views']))
views_stat = pd.DataFrame({"feature_id": feature_names,
                          "coef": reg.coef_, 
                          "p": ps}).sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]

In [20]:
views_stat.query('p < 0.01').head(10)

Unnamed: 0,feature_id,coef,p
0,0,1.438066,0.0
15,15,1.407569,3.049929e-103
129,129,1.406193,3.70673e-144
168,168,1.38011,6.338185e-43
17,17,1.281349,2.091352e-133
44,44,1.196041,0.0
9,9,1.16019,0.0
292,292,1.151984,6.796401000000001e-160
367,367,1.093176,8.070253e-09
79,79,1.047609,3.023156e-10


In [21]:
views_stat.query('p < 0.01').tail(10)

Unnamed: 0,feature_id,coef,p
12,12,-0.872305,0.0
25,25,-0.949599,9.470310999999998e-38
11,11,-0.967625,0.0
206,206,-0.981528,3.8618679999999996e-192
321,321,-0.984397,8.204481e-06
320,320,-0.995474,1.133607e-45
189,189,-1.041085,3.306457e-11
43,43,-1.120607,4.719222e-41
10,10,-1.194272,0.0
38,38,-1.321328,1.976512e-18


In [22]:
ranks = views_stat.query('p < 0.01')
for i in range(10):
    print([rev[x] for x in tsvd.components_[ranks['feature_id'].iloc[i]].argsort()[-10:][::-1]])

['video', 'music', 'official', 'hd', 'trailer', 'live', 'part', 'new', 'lyric', '2013']
['love', 'song', 'tutorial', 'dj', '2012', 'minecraft', 'world', '2014', 'story', 'make']
['lyric', 'big', 'demo', 'little', 'dream', 'clip', 'album', 'sound', '14', 'music']
['halo', 'moment', 'funny', 'look', 'battlefield', 'pack', 'kid', 'opening', 'like', 'reach']
['tutorial', 'minecraft', 'make', 'de', 'day', 'mod', 'live', 'play', 'makeup', 'hair']
['gta', 'car', 'city', 'get', 'online', 'mod', 'san', 'black', 'free', 'andreas']
['hd', 'new', 'dance', 'gameplay', 'live', 'black', '1080p', 'ops', 'full', 'pc']
['install', 'moon', 'fly', 'champion', 'stop', 'magic', 'got', 'fan', 'child', 'use']
['bird', 'head', 'hq', 'winter', 'young', 'bmw', 'session', 'epic', 'angry', 'ost']
['ft', 'feat', 'girl', 'boy', 'version', 'free', 'de', 'lyric', 'album', 'full']


In [23]:
for i in range(10):
    print([rev[x] for x in tsvd.components_[ranks['feature_id'].iloc[-(i+1)]].argsort()[-10:][::-1]])

['2009', 'final', 'fantasy', 'christmas', 'battle', 'tour', 'piano', '12', 'one', '2008']
['2012', 'new', 'hd', 'part', 'review', 'cover', '2011', 'official', 'highlight', 'school']
['2015', 'video', 'tour', '2009', 'trailer', 'team', 'play', 'test', 'lyric', 'wmv']
['battlefield', 'class', 'good', 'bad', 'championship', 'kid', 'track', 'modern', 'warfare', 'company']
['mountain', 'london', 'airport', 'international', 'landing', '18', 'beat', 'green', 'bike', 'tank']
['spring', 'server', 'mountain', 'pvp', 'sport', 'book', 'stage', 'wow', 'warrior', 'ball']
['rally', 'ride', 'mp4', 'last', 'championship', 'crash', 'round', 'preview', 'cup', 'gold']
['2013', '2014', 'highlight', 'part', 'hd', 'cover', 'new', 'show', 'review', 'school']
['minecraft', '2010', 'highlight', 'make', 'mod', 'love', 'cover', 'drum', 'video', 'black']
['world', '2014', 'hd', '2011', 'review', 'cup', 'warcraft', 'part', 'cover', 'new']


In [28]:
ranks = views_stat.query('p < 0.01')
for i in range(10):
    sf = tsvd.components_[ranks['feature_id'].iloc[i]]
    tot = np.sum(sf)
    cum = 0
    rev_sort = sf.argsort()[::-1]
    j = 0
    while cum < 0.9 * tot:
        cum += sf[rev_sort[j]]
        j += 1
    l = [rev[x] for x in rev_sort[:(j+1)]]
    print(l[:min(len(l), 20)])

['video', 'music', 'official', 'hd', 'trailer', 'live', 'part', 'new', 'lyric', '2013', '2012', '2014', 'love', 'review', 'dance', 'ft', 'cover', 'feat', 'game', 'world']
['love', 'song']
['lyric', 'big', 'demo', 'little', 'dream', 'clip', 'album', 'sound', '14', 'music', 'sonic', 'fishing', '2007', 'full', 'english', 'hero', 'style', 'track', 'john', 'air']
['halo', 'moment', 'funny', 'look', 'battlefield', 'pack']
['tutorial', 'minecraft', 'make']
['gta', 'car', 'city', 'get', 'online', 'mod', 'san', 'black', 'free', 'andreas', 'test', 'christmas', 'iv', 'super', 'episode', 'life', 'funny', '2008', 'story', 'moment']
['hd']
['install', 'moon', 'fly', 'champion', 'stop', 'magic', 'got', 'fan', 'child', 'use', 'ultimate', 'fashion', 'spring', 'motion', 'face', 'ca', 'mountain', 'using', 'lesson', 'original']
['bird', 'head', 'hq']
['ft', 'feat']


In [29]:
ranks = views_stat.query('p < 0.01')
for i in range(10):
    sf = tsvd.components_[ranks['feature_id'].iloc[-(i+1)]]
    tot = np.sum(sf)
    cum = 0
    rev_sort = sf.argsort()[::-1]
    j = 0
    while cum < 0.9 * tot:
        cum += sf[rev_sort[j]]
        j += 1
    l = [rev[x] for x in rev_sort[:(j+1)]]
    print(l[:min(len(l), 20)])

['2009', 'final', 'fantasy', 'christmas', 'battle']
['2012']
['2015']
['battlefield', 'class', 'good', 'bad', 'championship', 'kid']
['mountain', 'london', 'airport', 'international', 'landing', '18', 'beat', 'green', 'bike', 'tank', 'www', 'ipad']
['spring', 'server', 'mountain', 'pvp', 'sport']
['rally', 'ride', 'mp4', 'last', 'championship', 'crash', 'round', 'preview', 'cup', 'gold', 'match', 'glitch', 'dead', 'island', 'warfare', 'modern', 'racing', 'national', 'kid', 'god']
['2013']
['minecraft']
['world']


### Bigrams and beyond

In [9]:
vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(1, 3))
tdidf = vectorizer.fit_transform(pure_tokens)
tsvd = TruncatedSVD(n_components=100)
fitted = tsvd.fit_transform(tdidf)
rev = {v:k for k, v in vectorizer.vocabulary_.items()}

In [10]:
np.sum(tsvd.explained_variance_ratio_)

0.12557242673663371

In [11]:
feature_names = range(100)
fs, ps = f_regression(fitted, np.log(dat['views']), center=False)
reg = LinearRegression(n_jobs=4)
reg.fit(fitted, np.log(dat['views']))
views_stat = pd.DataFrame({"feature_id": feature_names,
                          "coef": reg.coef_, 
                          "p": ps}).sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]

In [23]:
ranks = views_stat.query('p < 0.01')
for i in range(10):
    sf = tsvd.components_[ranks['feature_id'].iloc[i]]
    tot = np.sum(sf ** 2)
    cum = 0
    rev_sort = sf.argsort()[::-1]
    j = 0
    while cum < 0.9 * tot:
        cum += sf[rev_sort[j]] ** 2
        j += 1
    l = [rev[x] for x in rev_sort[:(j+1)]]
    print(l[:min(len(l), 20)])

['minecraft', 'de', 'love', 'tutorial', 'make', '2013', 'dj', 'mod', 'play', 'song', 'la', 'piano', 'com', 'remix', 'wmv', 'hair', 'makeup', 'let', 'get', 'feat']
['trailer', 'cover', 'part', 'guitar', 'official', 'guitar cover', 'official trailer', 'hd', 'drum', 'drum cover', 'movie', 'trailer hd', 'piano', 'piano cover', 'movie trailer', 'official video', 'official trailer hd', 'gameplay', 'walkthrough', 'theatrical']
['직캠', '東方', 'ᴴᴰ', 'обзор', 'на', 'для', 'zx']
['10', 'top', 'make', 'top 10', 'song', 'guitar', 'official', 'de', '11', 'guitar cover', '12', 'review', 'official music', 'official music video', 'official video', 'school', 'world', 'week', '10 10', 'high']
['rock', 'heart', 'go', 'feat', 'drum', 'top', 'kingdom', 'piano', 'kingdom heart', 'drum cover', 'theme', 'hero', 'guitar', 'pro', 'house', 'version', 'back', 'full', 'amv', 'solo']
['test', 'car', 'hair', 'life', 'night', 'team', 'drive', 'top', 'speed', 'natural', 'test drive', 'highlight', 'gameplay', 'natural hai

In [25]:
ranks = views_stat.query('p < 0.01')
for i in range(10):
    sf = tsvd.components_[ranks['feature_id'].iloc[-(i+1)]]
    tot = np.sum(sf ** 2)
    cum = 0
    rev_sort = (sf ** 2).argsort()[::-1]
    j = 0
    while cum < 0.9 * tot:
        cum += sf[rev_sort[j]] ** 2
        j += 1
    l = [rev[x] for x in rev_sort[:(j+1)]]
    print(l[:min(len(l), 20)])

['2012', 'hd', '2014', '2013', 'school', 'black', 'trailer', 'high', 'world', 'minecraft']
['2011', '2014', 'de', 'minecraft', 'tutorial', 'make', 'world', '2012', 'dj', '2010', 'love', 'game', 'la', 'play', 'hd', 'live']
['2009', 'gameplay', 'gta', 'war', 'best', 'team', 'star', 'wmv', '2015', 'car', 'final', 'call', 'first', 'get', 'duty', 'call duty', 'one', 'online', 'game', 'star war']
['band', '2009', 'call', 'duty', 'call duty', 'black', 'gameplay', 'rock', 'song', 'show', 'highlight', 'live', 'one', 'battle', 'final', 'black ops', 'ops', '2010', 'piano', '2008']
['2013', 'school', '2011', 'high', 'hd', 'minecraft', 'high school', 'tutorial', '2012', 'make', 'love', 'black']
['band', 'call', 'duty', 'call duty', 'black', 'war', 'star', 'gameplay', '2009', 'best', 'black ops', 'star war', 'ops', 'warfare', 'ghost', 'gta', 'modern', 'rock', '2015', 'modern warfare']
['2011', 'day', 'world', 'de', 'school', 'high', '10', 'love', 'high school', 'highlight', 'game', 'live', 'top', '2

### Prepare for output

In [41]:
pos = []
for i in range(20):
    pos += [[rev[x] for x in tsvd.components_[i].argsort()[-10:][::-1]]]

In [42]:
pos_coef = []
for i in range(20):
    pos_coef += [[x for x in 
           tsvd.components_[i][(tsvd.components_[i] ** 2).argsort()[-10:][::-1]]]]

In [46]:
pos_words = pd.DataFrame(pos).stack()

In [47]:
pos_coefs = pd.DataFrame(pos_coef).stack()

In [94]:
pos_df = pd.DataFrame({'name': pos_words, 'size': pos_coefs}).reset_index()\
    .rename(columns={'level_0': 'component', 'level_1': 'order'})
pos_df['comp_perc'] = tsvd.explained_variance_ratio_[pos_df['component']]
pos_df.head()

Unnamed: 0,component,order,name,size,comp_perc
0,0,0,직캠,0.583569,0.004503
1,0,1,東方,0.460416,0.004503
2,0,2,ᴴᴰ,0.409127,0.004503
3,0,3,обзор,0.333282,0.004503
4,0,4,на,0.227081,0.004503


In [101]:
cList = []
groupDict = pos_df.groupby('component').apply(lambda g: g.drop('component', axis=1)\
                                                .to_dict(orient='records')).to_dict()
def remove_key(d, k):
    r = d
    del r[k]
    return r

for key, value in groupDict.items():
    cList.append(dict(name=str(key), size=value[0]['comp_perc'], 
                      children=list([remove_key(d, 'comp_perc') for d in value])))
finalJSON = dict(name='component', children=cList)
finalJSONs = json.dumps(finalJSON)

In [102]:
finalJSON

{'children': [{'children': [{'name': '직캠',
     'order': 0,
     'size': 0.5835688684436965},
    {'name': '東方', 'order': 1, 'size': 0.4604163355986042},
    {'name': 'ᴴᴰ', 'order': 2, 'size': 0.40912675168141216},
    {'name': 'обзор', 'order': 3, 'size': 0.33328244105944127},
    {'name': 'на', 'order': 4, 'size': 0.22708087562960658},
    {'name': 'для', 'order': 5, 'size': 0.2260788840629073},
    {'name': 'zx', 'order': 6, 'size': 0.10518902177776925},
    {'name': 'zumba', 'order': 7, 'size': 0.07161322409773604},
    {'name': 'zouk', 'order': 8, 'size': 0.05771191368255122},
    {'name': 'zoom', 'order': 9, 'size': 0.04446930955924287}],
   'name': '0',
   'size': 0.004503262470385291},
  {'children': [{'name': 'part', 'order': 0, 'size': 0.5004755618212557},
    {'name': 'live', 'order': 1, 'size': 0.28056436850621375},
    {'name': 'cover', 'order': 2, 'size': 0.20109360637310356},
    {'name': 'hd', 'order': 3, 'size': 0.1991901718505181},
    {'name': 'trailer', 'order': 4, 

In [100]:
json.dump(finalJSON, open("./word_cloud/lsa.json", "w"))

### Streamline

In [11]:
vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(2, 2))
tdidf = vectorizer.fit_transform(np.array(pure_tokens))
tsvd = TruncatedSVD(n_components=500)
fitted = tsvd.fit_transform(tdidf)
rev = {v:k for k, v in vectorizer.vocabulary_.items()}

feature_names = range(500)
fs, ps = f_regression(fitted, np.log(dat['views']), center=False)
reg = LinearRegression(n_jobs=4)
reg.fit(fitted, np.log(dat['views']))
views_stat = pd.DataFrame({"feature_id": feature_names,
                          "coef": reg.coef_, 
                          "p": ps}).query('p < 0.01')\
    .sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]

sid = views_stat[:20]['feature_id']    
    
pos = []
for i in range(20):
    pos += [[rev[x] for x in (tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]

pos_coef = []
for i in range(20):
    pos_coef += [[x for x in 
           tsvd.components_[sid][i][(tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]]

pos_words = pd.DataFrame(pos).stack()
pos_coefs = pd.DataFrame(pos_coef).stack()
pos_df = pd.DataFrame({'name': pos_words, 'size': pos_coefs}).reset_index()\
    .rename(columns={'level_0': 'component', 'level_1': 'order'})
pos_df['comp_perc'] = tsvd.explained_variance_ratio_[sid][pos_df['component']]

cList = []
groupDict = pos_df.groupby('component').apply(lambda g: g.drop('component', axis=1)\
                                                .to_dict(orient='records')).to_dict()
def remove_key(d, k):
    r = d
    del r[k]
    return r

for key, value in groupDict.items():
    cList.append(dict(name=str(key), size=value[0]['comp_perc'], 
                      children=list([remove_key(d, 'comp_perc') for d in value])))
finalJSON = dict(name='component', children=cList)

json.dump(finalJSON, open("./word_cloud/lsa_fame_all.json", "w"))

In [12]:
for n in range(11):

    sub_index = dat[str(n)] == 1
    vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(2, 2))
    tdidf = vectorizer.fit_transform(np.array(pure_tokens)[np.where(sub_index)[0]])
    tsvd = TruncatedSVD(n_components=500)
    fitted = tsvd.fit_transform(tdidf)
    rev = {v:k for k, v in vectorizer.vocabulary_.items()}

    feature_names = range(500)
    fs, ps = f_regression(fitted, np.log(dat['views'][sub_index]), center=False)
    reg = LinearRegression(n_jobs=4)
    reg.fit(fitted, np.log(dat['views'][sub_index]))
    views_stat = pd.DataFrame({"feature_id": feature_names,
                              "coef": reg.coef_, 
                              "p": ps}).query('p < 0.01')\
    .sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]
    
    sid = views_stat[:20]['feature_id']

    pos = []
    for i in range(20):
        pos += [[rev[x] for x in (tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]

    pos_coef = []
    for i in range(20):
        pos_coef += [[x for x in 
               tsvd.components_[sid][i][(tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]]

    pos_words = pd.DataFrame(pos).stack()
    pos_coefs = pd.DataFrame(pos_coef).stack()
    pos_df = pd.DataFrame({'name': pos_words, 'size': pos_coefs}).reset_index()\
        .rename(columns={'level_0': 'component', 'level_1': 'order'})
    pos_df['comp_perc'] = tsvd.explained_variance_ratio_[sid][pos_df['component']]

    cList = []
    groupDict = pos_df.groupby('component').apply(lambda g: g.drop('component', axis=1)\
                                                    .to_dict(orient='records')).to_dict()
    def remove_key(d, k):
        r = d
        del r[k]
        return r

    for key, value in groupDict.items():
        cList.append(dict(name=str(key), size=value[0]['comp_perc'], 
                          children=list([remove_key(d, 'comp_perc') for d in value])))
    finalJSON = dict(name='component', children=cList)

    json.dump(finalJSON, open("./word_cloud/lsa_fame_{0}.json".format(n), "w"))

### Shame

In [13]:
vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(2, 2))
tdidf = vectorizer.fit_transform(np.array(pure_tokens))
tsvd = TruncatedSVD(n_components=500)
fitted = tsvd.fit_transform(tdidf)
rev = {v:k for k, v in vectorizer.vocabulary_.items()}

feature_names = range(500)
fs, ps = f_regression(fitted, np.log(dat['views']), center=False)
reg = LinearRegression(n_jobs=4)
reg.fit(fitted, np.log(dat['views']))
views_stat = pd.DataFrame({"feature_id": feature_names,
                          "coef": reg.coef_, 
                          "p": ps}).query('p < 0.01')\
    .sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]

sid = views_stat[-20:]['feature_id']    
    
pos = []
for i in range(20):
    pos += [[rev[x] for x in (tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]

pos_coef = []
for i in range(20):
    pos_coef += [[x for x in 
           tsvd.components_[sid][i][(tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]]

pos_words = pd.DataFrame(pos).stack()
pos_coefs = pd.DataFrame(pos_coef).stack()
pos_df = pd.DataFrame({'name': pos_words, 'size': pos_coefs}).reset_index()\
    .rename(columns={'level_0': 'component', 'level_1': 'order'})
pos_df['comp_perc'] = tsvd.explained_variance_ratio_[sid][pos_df['component']]

cList = []
groupDict = pos_df.groupby('component').apply(lambda g: g.drop('component', axis=1)\
                                                .to_dict(orient='records')).to_dict()
def remove_key(d, k):
    r = d
    del r[k]
    return r

for key, value in groupDict.items():
    cList.append(dict(name=str(key), size=value[0]['comp_perc'], 
                      children=list([remove_key(d, 'comp_perc') for d in value])))
finalJSON = dict(name='component', children=cList)

json.dump(finalJSON, open("./word_cloud/lsa_shame_all.json", "w"))

In [14]:
for n in range(11):

    sub_index = dat[str(n)] == 1
    vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(2, 2))
    tdidf = vectorizer.fit_transform(np.array(pure_tokens)[np.where(sub_index)[0]])
    tsvd = TruncatedSVD(n_components=500)
    fitted = tsvd.fit_transform(tdidf)
    rev = {v:k for k, v in vectorizer.vocabulary_.items()}

    feature_names = range(500)
    fs, ps = f_regression(fitted, np.log(dat['views'][sub_index]), center=False)
    reg = LinearRegression(n_jobs=4)
    reg.fit(fitted, np.log(dat['views'][sub_index]))
    views_stat = pd.DataFrame({"feature_id": feature_names,
                              "coef": reg.coef_, 
                              "p": ps}).query('p < 0.01')\
    .sort_values('coef', ascending=False)[['feature_id', 'coef', 'p']]
    
    sid = views_stat[-20:]['feature_id']

    pos = []
    for i in range(20):
        pos += [[rev[x] for x in (tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]

    pos_coef = []
    for i in range(20):
        pos_coef += [[x for x in 
               tsvd.components_[sid][i][(tsvd.components_[sid][i] ** 2).argsort()[-10:][::-1]]]]

    pos_words = pd.DataFrame(pos).stack()
    pos_coefs = pd.DataFrame(pos_coef).stack()
    pos_df = pd.DataFrame({'name': pos_words, 'size': pos_coefs}).reset_index()\
        .rename(columns={'level_0': 'component', 'level_1': 'order'})
    pos_df['comp_perc'] = tsvd.explained_variance_ratio_[sid][pos_df['component']]

    cList = []
    groupDict = pos_df.groupby('component').apply(lambda g: g.drop('component', axis=1)\
                                                    .to_dict(orient='records')).to_dict()
    def remove_key(d, k):
        r = d
        del r[k]
        return r

    for key, value in groupDict.items():
        cList.append(dict(name=str(key), size=value[0]['comp_perc'], 
                          children=list([remove_key(d, 'comp_perc') for d in value])))
    finalJSON = dict(name='component', children=cList)

    json.dump(finalJSON, open("./word_cloud/lsa_shame_{0}.json".format(n), "w"))