In [1]:
from collections import defaultdict
from gensim import corpora
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents
import numpy as np
import pandas as pd
import random
import re
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/chunk.csv', nrows=100, lineterminator='\n', usecols=['company_name', 'industry', 'text'])

In [3]:
industries = set(df.industry)

In [4]:
def clean(text):
    txt = text
    txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)  # leave only ascii characters
    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\xa0', ' ')  # remove special text operators
    txt = re.sub(' +', ' ', txt)  # remove multiple spaces
    txt = txt[:1000]  # truncate according to text max length
    return txt

In [5]:
def tfidf(df):
    print('making industry documents... ', end='')
    group = df.groupby(['industry'])['text'].apply(lambda x: ','.join(x)).reset_index()
    texts = group.text
    industries = group.industry.to_list()
    texts = [clean(text).split() for text in texts]
    print('done.')
    print('making dictionary... ', end='')
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary.dict')
    print('done.')
    print('making corpus... ', end='')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('corpus.mm', corpus)
    print('done.')
    return TfidfModel(corpus), corpus, dictionary, industries

In [6]:
model, corpus, dictionary, industries = tfidf(df)

making industry documents... done.
making dictionary... done.
making corpus... done.


In [None]:
dictionary

In [265]:
[c[0] for c in corpus[industry_to_index['construction']]]

[0,
 29,
 40,
 41,
 48,
 60,
 65,
 69,
 73,
 84,
 86,
 102,
 104,
 111,
 118,
 123,
 176,
 298,
 387,
 438,
 440,
 538,
 542,
 547,
 601,
 608,
 647,
 663,
 757,
 845,
 855,
 877,
 898,
 899,
 900,
 901,
 902,
 903,
 904,
 905,
 906,
 907,
 908,
 909,
 910,
 911,
 912,
 913,
 914,
 915,
 916,
 917,
 918,
 919,
 920,
 921,
 922,
 923,
 924,
 925,
 926,
 927,
 928,
 929,
 930,
 931,
 932,
 933,
 934,
 935,
 936,
 937,
 938,
 939,
 940,
 941,
 942,
 943,
 944,
 945,
 946,
 947,
 948,
 949,
 950,
 951,
 952,
 953,
 954,
 955,
 956,
 957,
 958,
 959]

In [287]:
[dictionary[d] for d in random.sample(list(dictionary), 1000) if dictionary[d] in [c[0] for c in corpus[industry_to_index['construction']]]]

[]

In [7]:
industry_to_index = {ind: i for i, ind in enumerate(industries)}

In [217]:
ind_idx = industry_to_index['computer software']
words, probs = [w[0] for w in model[corpus[ind_idx]]], [p[1] for p in model[corpus[ind_idx]]]

In [225]:
k = int(200 - random.uniform(0, 180**(2/3))**(3/2))
print(f'{k=}')
top_k_words = random.choices(words, weights=probs, k=k)
text = ' '.join([dictionary[w] for w in top_k_words])
text

k=169


"Money Started Toggle Skip helping Toggle As stamps When deals LoyalFree 24, Kno good Money Started purchases. now Deals, App time loyalty deals, App Trails Started Soph Money Deals, Download Saving Soph AS Saving app Money LoyalFree Competitions User BIDs, LoyalFree trails Competitions Cities Started LoyalFree Save Deals Win 24, collect Works collect A donates LoyalFree collect Toggle SAVE discover Save Charity BIDs, deals, From Work? You How 1p navigation are 1p Exclusive As UK'S Every Every Well Save Competitions exclusive Time Download You LoyalFree LoyalFree exciting prizes Every Time loyalty When Local prizes YOU LoyalFree Download app stamp LoyalFree LoyalFree Stamp As See LoyalFree User Work? Stamp Enter You Work? LoyalFree On: See Click Soph Feel using LoyalFree Soph charity Deals, deals Win Exclusive Trails app Events Every Enter Click deals, Stamp Kno loyalty when using Local Collect LoyalFree App Cities Started about events 24, events Enjoy Shop schemes charity Skip when st

In [227]:
def generate_tfidf(industry):
    ind_idx = industry_to_index[industry]
    words, probs = [w[0] for w in model[corpus[ind_idx]]], [p[1] for p in model[corpus[ind_idx]]]
    k = int(200 - random.uniform(0, 180**(2/3))**(3/2))
    top_k_words = random.choices(words, weights=probs, k=k)
    text = ' '.join([dictionary[w] for w in top_k_words])
    return text

In [229]:
df['tfidf_text'] = df.industry.apply(lambda ind: generate_tfidf(ind))

In [230]:
df

Unnamed: 0,text,industry,company_name,tfidf_text
0,Maryland-based construction and restoration co...,construction,freedom-restoration1,follow follow Structural Renovation resource 2...
1,Treatment Homes Inc Skip to content Donate Abo...,mental health care,treatment-homes-inc,Do? Can Starfish Homes Funding Parenting Fundi...
2,Leif | Outcomes-Based Education Finance Platfo...,civic & social organization,longterm-education-investment-fund,Increase fixed AgreementsLeif once access stud...
3,Coachs sportifs et coachs de vie - Pour 3 Poin...,civic & social organization,pour-3-points,result fixed Remove alternative fixed upfront ...
4,MMB Healthcare Companies Leadership Contact ...,pharmaceuticals,evergreen-pharma-services-llc,"MMB Teoranta, medical health manufacturing seg..."
...,...,...,...,...
95,"Patriot Properties - Real Estate Investment, P...",real estate,patriotholdings,"Properties HomeWhy Investment, Estate Shopping..."
96,Liberty Tutoring - 1-on-1 Personalised Tutorin...,education management,liberty-tutoring,Columbus message * submitting First Interest E...
97,Hair Fusion | Best Hair Salon in Bakersfield |...,consumer services,hair-fusion,FAQ Bee Installation Total Book AC AC Maintena...
98,Daughters of Hawai‘i Member Resources Queen Em...,museums and institutions,daughters-of-hawaii-gift-shop,risk ventilation A MONDAY GroupsContact Tutank...


In [3]:
%cd ../project

/home/student/project


In [4]:
from band_generator import BandGenerator

bg = BandGenerator('models/gpt2_forward_model', None, 'data/artists_blacklist.pickle', 'data/genres.pickle')



In [12]:
gen, stats = bg.generate_bands(generation_args=dict(top_k=300, num_return_sequences=12, max_length=1024, do_sample=True, temperature=1.75))
gen

 57%|█████▋    | 57/100 [03:26<02:35,  3.62s/it]


[GeneratedBand(band='the swathe boys', genre='Rock', lyrics="Some love each chance down\nLove within a new kind\nHave felt so helpless they both sink below me\n\nOn all green lands [see below] ever wonder who your dreaming boy [see below] looksin at on this life or\nAre, aren't thinking both feel me falling with\nThe next of came\n\n'Cause there no secrets that close can destroy the master one secret thing about the waiting [tell the realin's baby lay there lying on our mignot where are you lovers?]. it had me way above though' through\nThere a secret [tell the realin'm real up?] wasn't there any secret, she see she took all i'm feeling. wait there waited the joy died feel and finally died homein home again.\nHold me true 'not that you'll know] right you know.. You know dont all touch each door my lord must surely not hear while tears. tell real bad\nTell real glad wight aah! uh real ye ye jah wude ye all made no believe for i [do I think?] more believein another hope still makes so ma

In [13]:
stats

GenerationStats(num_iterations=10, num_items_considered=120, num_failed_match=21, num_blacklist_filtered=42, num_seen_filtered=0, num_genre_filter=0, num_short_texts=0, num_text_missing_band=0, num_user_filtered=0, num_returned=57, wall_time=206.43568778038025)

In [8]:
print("(Ron Wood)\nStrange for some time, maybe now\nSame old story\nKindle I could see you're older than he does.\nEach day you'd lie there by his side.\nAlthough the stories have changed, that's alright.\nIt's all just a love story and you leave him blind.\nIt's not the same if you're older and the world wears thin.\nThere's nights that you'll wish you'd never had to go.\nBut you're never there to see them, and you're never there to hold them.\nSo right here now, it's time to lead the way.\nYou said you never knew I loved you at all.\nYou took one look at my eyes. Then he snapped that silence right out of your head,\nNow there it is that you're out there, right where he's hiding just for you.\nBut there is one thing that cannot go unheard, and there is more to the story if you want.\nIt's all just a love story and you leave him blind.\nIt's not the same if you're older and the world wears thin..")

(Ron Wood)
Strange for some time, maybe now
Same old story
Kindle I could see you're older than he does.
Each day you'd lie there by his side.
Although the stories have changed, that's alright.
It's all just a love story and you leave him blind.
It's not the same if you're older and the world wears thin.
There's nights that you'll wish you'd never had to go.
But you're never there to see them, and you're never there to hold them.
So right here now, it's time to lead the way.
You said you never knew I loved you at all.
You took one look at my eyes. Then he snapped that silence right out of your head,
Now there it is that you're out there, right where he's hiding just for you.
But there is one thing that cannot go unheard, and there is more to the story if you want.
It's all just a love story and you leave him blind.
It's not the same if you're older and the world wears thin..
