In [1]:
import json
import spacy
import nltk

## Working directly with raw data from Pushshift.io
This would require writing functions to "query" by itself

In [2]:
data = []
with open('RC_2019-11_10') as f:
    for line in f:
        data.append(json.loads(line))

In [5]:
data[0]

{'all_awardings': [],
 'associated_award': None,
 'author': 'SpookySpino',
 'author_created_utc': None,
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_3ok0x0gk',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'body': "Mario and Luigi sound so happy and wholesome in this game I love it :')",
 'can_gild': True,
 'can_mod_post': False,
 'collapsed': False,
 'collapsed_because_crowd_control': None,
 'collapsed_reason': None,
 'controversiality': 0,
 'created_utc': 1572566400,
 'distinguished': None,
 'edited': False,
 'gilded': 0,
 'gildings': {},
 'id': 'f5z3pn8',
 'is_submitter': False,
 'link_id': 't3_dptjc7',
 'locked': False,
 'no_follow': True,
 'parent_id': 't3_dptjc7',
 'permalink': '/r/LuigisMansion/comments/dptjc7/only_a_minute_in_and_im_in_love_wi

In [7]:
comment = "I have the HD668B's from superlux, the audio is fantastic , they're really comfy and only £30"
comment = "I have the HD668B from Google"
comment = "Apple is looking at buying U.K. startup for $1 billion"

In [8]:
nlp = spacy.load("en_core_web_sm")

In [109]:
doc = nlp(comment)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label, ent.label_)

Apple 0 5 383 ORG
U.K. 27 31 384 GPE
$1 billion 44 54 394 MONEY


## Querying the pushshift api:

In [10]:
import requests
page = requests.get('http://api.pushshift.io/reddit/search/submission/?subreddit=headphones&size=250&q=advice&after=7d')
# Default is 25, max is 100
# Default is in the last n comments
# query (q) is case-insensitive, can be used to match two different words e.g. &q=radiohead+band
# multiple subreddits e.g. ?subreddit=headphones,headphoneadvice
# Sorting can be done by score or date e.g. sort=score:desc

# can also filter for certain fields e.g. &filter=author,score,subreddit

# Advanced: using subreddit aggregation allows us to determine the top subreddits
# mentioning a certain word. Example using "trump"
# https://api.pushshift.io/reddit/search/comment/?q=trump&after=7d&aggs=subreddit&size=0

In [11]:
data = json.loads(page.text)
print(len(data['data']))

30


In [12]:
i = 3
data['data'][i]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'YTRoseRocket',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_2h2oydw5',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1600278254,
 'domain': 'self.headphones',
 'full_link': 'https://www.reddit.com/r/headphones/comments/iu0hl2/looking_for_advice_on_headphones/',
 'gildings': {},
 'id': 'iu0hl2',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '#e6ad62',
 'link_flair_richtext': [],
 'link_flair_template_id': 'f839d0a6-0223-11e9-8aeb-0ec385a65366',
 'link_flair_text': 'Discussion',
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': F

In [13]:
# Retrieving all comments for a submission object can be done using the link id
# https://api.pushshift.io/reddit/submission/comment_ids/6uey5x

### Subreddit Info 
For connecting user-input to a group of subreddits

In [14]:
# subreddit info
# By the way, reddit.com/r/subreddit/about.json (description, subscribers, etc) and 
# reddit.com/r/subreddit/wiki/pages.json (all resources, including wikipages, sidebar
# and other stuff) are probably what you're looking for.

# Method 1) Direct request
sr_info = requests.get('http://reddit.com/r/headphones/about.json', headers = {'User-agent': 'your bot 0.1'})
sr_info_c = json.loads(sr_info.text)
print(sr_info_c['data']['public_description'])

# Method 2) Pushshift API
data_sr = []
stop_at = 10000
with open('data/reddit_subreddits.ndjson') as f:
    for i, line in enumerate(f):
        data_sr.append(json.loads(line))
        if (i>stop_at):
            break

A place for discussion, news, reviews and DIY projects related to portable audio, headphones, headphone amplifiers and DACs.


In [15]:
l = []
for d in data_sr:
    l.append(d['display_name'])

In [108]:
#l

## Get submissions from subreddit, with certain parameters

In [17]:
# Get a related submission and then all comments 
# Say we're looking for advice or recommendations and headphones

# A) Get all posts in the subreddit headphones in the last 100 days which include the words best and advice
h_page = requests.get('http://api.pushshift.io/reddit/search/submission/?subreddit=headphoneadvice&q=best+advice&before=360d')
h_data = json.loads(h_page.text)
print(len(h_data['data']))

In [19]:
# Print title of submissions
for d in h_data['data']:
    print(d['title'])

[PA] Gaming Headset Purchase advice for PC
OpAmp Rolling
[PA] EDM listener IEMs for £100 or less
Need advice: Beyerdynamic DT770, Shure SRH840 or Audio-Technica M50X?
Seeking some advice
Best over-ear headphones under 100$ with good cushioning and durability
[PA] Beyerdynamic Custom vs. DT770 Pro
Inear decision making
Beyerdynamic Custom Series and DT770?
[PA] Buying my first IEM's. 50-100$. Details inside
[PA] I would like something decent to liste to deadmau5
Headset under $100 ? (Razer Kraken Pro / Cooler Master MH752)
Best amp/DAC?
Should I buy an amp for my Dt 880s? and other questions! I appreciate any and all advice.
Best starter audiophile headphones?
[PA] Is the Audio Technica M40x a good pair of headphones for casual listening?
Headphone advice for noise cancelling for long periods
[PA] Best alternative to the Jabra Elite 65T with no master/slave connection
[PA] [CO] [please help!] Starter Looking to Get Advice on Headphones and Amps/DACs, Heavily Considering Sennheiser HD 65

In [100]:
# B) Given a submission, let's get all associated comments:
print(d)
d['id']

{'all_awardings': [], 'allow_live_comments': False, 'author': 'FrankieMcGovs', 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_text': None, 'author_flair_type': 'text', 'author_fullname': 't2_4gpk5pq0', 'author_patreon_flair': False, 'can_mod_post': False, 'contest_mode': False, 'created_utc': 1568509123, 'domain': 'self.HeadphoneAdvice', 'full_link': 'https://www.reddit.com/r/HeadphoneAdvice/comments/d4dsie/may_i_have_your_attention_please_true_wireless/', 'gildings': {}, 'id': 'd4dsie', 'is_crosspostable': True, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': True, 'is_self': True, 'is_video': False, 'link_flair_background_color': '', 'link_flair_richtext': [], 'link_flair_text_color': 'dark', 'link_flair_type': 'text', 'locked': False, 'media_only': False, 'no_follow': True, 'num_comments': 2, 'num_crossposts': 0, 'over_18': False, 'permalink': '/r/HeadphoneAdvice/comments/d4dsie/may_i_have_your_attent

'd4dsie'

In [102]:
a_text = d['selftext']

In [None]:
def get_assoc_comments(d):
    ''' Using a submission JSON, return a new JSON of all comments.'''
    c_page = requests.get('https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(d['id']))
    c_data = json.loads(c_page.text)
    
    cs_page = requests.get('https://api.pushshift.io/reddit/comment/search?ids={}'.format(",".join(c_data['data'])))
    cs_data = json.loads(cs_page.text)
    
    return(cs_data['data'])

nlp = spacy.load("en_core_web_md")
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [93]:
comments = get_assoc_comments(h_data['data'][17]) #d

In [99]:
comments[0]['body']

'Maybe some galaxy buds?'

In [105]:
get_ents(a_text)

TW 50 52 383 ORG
Hifiman 319 326 386 PRODUCT
1more 355 360 383 ORG
NuForce BeFrees 386 401 380 PERSON
6 412 413 397 CARDINAL
Audio-Technica 457 471 383 ORG
150 486 489 394 MONEY
IFS 

 508 514 383 ORG
Lil 523 526 380 PERSON
Lyperchek Teva 532 546 380 PERSON
2020 614 618 397 CARDINAL
iPhone 828 834 383 ORG
11 835 837 397 CARDINAL
Max 842 845 380 PERSON
the Astrotec S80 1016 1032 386 PRODUCT
40 1049 1051 394 MONEY
the Liberty Air 1095 1110 387 EVENT
40 1206 1208 394 MONEY


In [None]:
comments = get_assoc_comments(h_data['data'][5]) #d

In [152]:
prod_orgs = get_prod_orgs(a_text)
# Clean-up
prod_orgs = [p.rstrip() for p in prod_orgs]
Counter(prod_orgs)

TW 50 52 383 ORG
Hifiman 319 326 386 PRODUCT
1more 355 360 383 ORG
NuForce BeFrees 386 401 380 PERSON
6 412 413 397 CARDINAL
Audio-Technica 457 471 383 ORG
150 486 489 394 MONEY
IFS 

 508 514 383 ORG
Lil 523 526 380 PERSON
Lyperchek Teva 532 546 380 PERSON
2020 614 618 397 CARDINAL
iPhone 828 834 383 ORG
11 835 837 397 CARDINAL
Max 842 845 380 PERSON
the Astrotec S80 1016 1032 386 PRODUCT
40 1049 1051 394 MONEY
the Liberty Air 1095 1110 387 EVENT
40 1206 1208 394 MONEY


Counter({'TW': 1,
         'Hifiman': 1,
         '1more': 1,
         'Audio-Technica': 1,
         'IFS': 1,
         'iPhone': 1,
         'the Astrotec S80': 1})

In [149]:
get_prod_orgs("This entire setup only cost me ~$100... Marantz SR5600 - $80 Klipsch Synergy F1s - $10 KLH Subwoofer - $10")

Marantz 40 47 383 ORG
80 58 60 394 MONEY
Klipsch Synergy 61 76 383 ORG
KLH Subwoofer - 87 102 383 ORG
10 104 106 394 MONEY


['Marantz', 'Klipsch Synergy', 'KLH Subwoofer -']

In [130]:
get_prod_orgs_nbors(a_text)

TW 50 52 383 ORG
Hifiman 319 326 386 PRODUCT
1more 355 360 383 ORG
NuForce BeFrees 386 401 380 PERSON
6 412 413 397 CARDINAL
Audio-Technica 457 471 383 ORG
150 486 489 394 MONEY
IFS 

 508 514 383 ORG
Lil 523 526 380 PERSON
Lyperchek Teva 532 546 380 PERSON
2020 614 618 397 CARDINAL
iPhone 828 834 383 ORG
11 835 837 397 CARDINAL
Max 842 845 380 PERSON
the Astrotec S80 1016 1032 386 PRODUCT
40 1049 1051 394 MONEY
the Liberty Air 1095 1110 387 EVENT
40 1206 1208 394 MONEY


[(upcoming, 'TW', models),
 (, 'Hifiman', TWS600),
 (, '1more', stylish),
 (, 'Audio-Technica', -),
 (at, 'IFS \n\n', 
  ),
 (new, 'iPhone', 11),
 (grabbed, 'the Astrotec S80', Astrotec)]

In [128]:
doc = nlp("Give it back! He pleaded.")
span = doc[2:3]
#assert span[1].text == "back"
print(span)
print(span.start)
print(span.end)
print(doc[span.start+1])
print(doc[span.start-1])

back
2
3
!
it


## Grab entities, product-ish

In [146]:
def get_ents(text):
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label, ent.label_)

        
def get_prod_orgs(text):
    
    prod_orgs = []
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label, ent.label_)    
        if(ent.label ==383 or ent.label== 386):
            # If it's an ORG, and the next token is alphanumeric combo (SR800, S300, HD599, etc),
            # then add that to it?
            prod_orgs.append(ent.text)
    return prod_orgs  
        
        
        
def get_ents_sentsplit(text):
    doc = nlp(text)
    sentences = [sent.string.strip() for sent in doc.sents]
    
    for s in sentences:
            doc = nlp(s)
            for ent in doc.ents:
                print(ent.text, ent.start_char, ent.end_char, ent.label_)
                print([(token.idx, token, type(token), token.nbor()) for token in ent])
                
def candidate_products(text):
    doc = nlp(text)
    product_list = []
    likely_product = []
    sentences = [sent.string.strip() for sent in doc.sents]
    
    for s in sentences:
            doc = nlp(s)
            for ent in doc.ents:
                #print(ent.text, ent.start_char, ent.end_char, ent.label_)
                #print([(token.idx, token, type(token), token.nbor()) for token in ent])
                #print([(token, token.nbor()) for token in ent])
                for token in ent:
                    try:
                        product_list.extend([(token, token.nbor())])
                    except:
                        print("IndexError")
                        
                if(ent.label == 380): #Product
                    likely_product.append(ent.text)
    return product_list, likely_product

In [151]:
from collections import Counter
def score_from_prod_list(prod_list):
    # Counter({'b': 2, 'a': 1})
    # Really, should do some string manipulations to ensure we aren't double counting (lower, etc)
    return Counter(prod_list)

In [52]:
entity = doc.ents[0]
print(f"Token on the left: '{doc[entity.start - 1]}'")
print(f"Token on the right: '{doc[entity.end]}'") 

In [53]:
comments[0]['body']

"Not sure if the open air design is acceptable but the Grado SR80e  has great cushioning and outstanding balanced design.  It's kind of the gateway drug of audiophile headphones.  Sound and comfort terrific.  Isolation is poor.  If you can live with that you are in for a treat"

In [54]:
get_ents(comments[0]['body'])

Grado 54 59 380


In [64]:
product_list = candidate_products(comments[0]['body'])

Grado 54 59 PERSON
[(Grado, SR80e)]


In [63]:
comments[1]['body']

"I will disagree with the other poster. A lot of people find the Grados beloe average in comfort. That might not be you, but best to buy them from a place you can return them. For better comfort on the SR80, you might want to upgrade them to the l pads that come on the SR225.\n\nkeep in mind that with Bluetooth, you're paying extra for the electronics versus a wired set of headphones. So typically you can get better sound quality for your money by going with the best wired headphones."

In [62]:
product_list = candidate_products(comments[1]['body'])

Grados 25 31 ORG
[(Grados, beloe)]
SR80 26 30 PRODUCT
[(SR80, ,)]
SR225 94 99 PRODUCT
[(SR225, .)]
Bluetooth 23 32 ORG
[(Bluetooth, ,)]


In [74]:
agg_product_list = []
for c in comments:
    agg_product_list.extend(candidate_products(c['body']))
print(agg_product_list)

# Need to exclude if not English

IndexError
IndexError
[(2020, !), (United, are), (Deano, has), (United, just), (Presets, V2), (V2, [), (OLH, 1000109](https://support.crestron.com), (1000109](https://support.crestron.com, /), (Sonos, modules), (CRPC, port), (Heos, there), (1, month), (100, %), (%, safe), (3080, ,), (3, and), (MSI, the), (TUF, .), (fifth, time), (Finebaum, yet), (MacBook, .), (7b, and), (summer, heat), (this, year), (year, ?), (one, viewer), (lo, mejor), (mejor, te), (te, sirve), (sirve, mas), (mas, buscar), (buscar, en), (uruguay, .), (&, gt;up), (gt;up, to), (2.2, million), (million, lives), (a, nice), (nice, day), (day, ,)]


In [None]:
#for comments = get_assoc_comments(h_data['data'][5]) #d

In [88]:
#h_data here comes from 
#h_page = requests.get('http://api.pushshift.io/reddit/search/submission/?subreddit=headphoneadvice&q=best+advice&before=360d')
#h_data = json.loads(h_page.text)
[h['title'] for h in h_data['data']]

['[PA] Gaming Headset Purchase advice for PC',
 'OpAmp Rolling',
 '[PA] EDM listener IEMs for £100 or less',
 'Need advice: Beyerdynamic DT770, Shure SRH840 or Audio-Technica M50X?',
 'Seeking some advice',
 'Best over-ear headphones under 100$ with good cushioning and durability',
 '[PA] Beyerdynamic Custom vs. DT770 Pro',
 'Inear decision making',
 'Beyerdynamic Custom Series and DT770?',
 "[PA] Buying my first IEM's. 50-100$. Details inside",
 '[PA] I would like something decent to liste to deadmau5',
 'Headset under $100 ? (Razer Kraken Pro / Cooler Master MH752)',
 'Best amp/DAC?',
 'Should I buy an amp for my Dt 880s? and other questions! I appreciate any and all advice.',
 'Best starter audiophile headphones?',
 '[PA] Is the Audio Technica M40x a good pair of headphones for casual listening?',
 'Headphone advice for noise cancelling for long periods',
 '[PA] Best alternative to the Jabra Elite 65T with no master/slave connection',
 '[PA] [CO] [please help!] Starter Looking to Ge

In [165]:
def get_cmted_prods_submissions(submissions_data):
    '''This takes in the JSON data returned from a query to a specific subreddit, 
    gets all associated comments and then returns an aggregate "product-list" from
    all this comment data.'''
    all_likely_products = []
    agg_product_list = []
    total_comments = 0
    for h_d in submissions_data['data']:
        comments = get_assoc_comments(h_d) #d
        total_comments += len(comments)
        for c in comments:
            agg_product_list.extend(get_prod_orgs(c['body']))
            
    print("Comments = {}".format(total_comments))
    
    return agg_product_list

## Testing multi-subredddit product extraction

In [160]:
# Let's grab data from 3 subreddits about headphones:
# Headphoneadvice, audiophile, BudgetAudiophile
subreddits = ['headphoneadvice', 'budgetaudiophile', 'audiophile']
prod_list_from_srs = []
for sr in subreddits:
    h_page = requests.get('http://api.pushshift.io/reddit/search/submission/?subreddit={}&q=best+recommend&before=360d'.format(sr))
    h_data = json.loads(h_page.text)
    agg_prod_list = get_cmted_prods_submissions(h_data)
    
    prod_list_from_srs.extend(agg_prod_list)
    

AKG 58 61 383 ORG
Sennheiser HD6XX + Tin Audio T2 212 243 386 PRODUCT
1/2 95 98 397 CARDINAL
1/3 102 105 397 CARDINAL
hd6xx 137 142 386 PRODUCT
Fidelio 361 368 383 ORG
x2hr 369 373 386 PRODUCT
two 431 434 397 CARDINAL
Beyers 551 557 380 PERSON
Grados 562 568 380 PERSON
the Audio-Technica 766 784 383 ORG
m50x 815 819 397 CARDINAL
Bosshifi 850 858 384 GPE
NVX 968 971 386 PRODUCT
the Brainwavz HM5 989 1006 383 ORG
OP 23 25 380 PERSON
Amazon 0 6 383 ORG
JDS Labs Atom Amp 16 33 383 ORG
second 2 8 396 ORDINAL
Atom 13 17 386 PRODUCT
D10 22 25 383 ORG
Atom 63 67 388 WORK_OF_ART
DAC 141 144 383 ORG
dac 48 51 383 ORG
FPS 125 128 383 ORG
1 12 13 397 CARDINAL
349 33 36 394 MONEY
ÆON Flow Closed 80 95 383 ORG
180 61 64 394 MONEY
the Blon B20 132 144 383 ORG
Black Beauty Series 31 50 387 EVENT
Hifiman Sundara 0 15 380 PERSON
Thieaudio and Monoprice 118 141 383 ORG
Aeon Flow Closed 209 225 383 ORG
the Fostex Planars 236 254 386 PRODUCT
Sundara 33 40 380 PERSON
NAD 22 25 383 ORG
300 153 156 394 MONEY


18 95 97 397 CARDINAL
Sanus-NFC18B-Center-Channel-Black/dp/B001I70XZ6 147 194 383 ORG
VTI DF Series Center Channel Speaker Stand](https://www.amazon.com/VTI-DFC-Center-Channel-Speaker/dp/B001F2GH2M 284 395 388 WORK_OF_ART
gateway&amp;sr=8-15 463 482 386 PRODUCT
Universal TV Stand - Table Top TV Stand for 37 587 633 388 WORK_OF_ART
LCD 642 645 383 ORG
Polk 14 18 388 WORK_OF_ART
Klipsch 6 13 383 ORG
first 249 254 396 ORDINAL
Crown 0 5 383 ORG
4Ω 35 37 397 CARDINAL
2 - one 187 194 397 CARDINAL
the L &amp 199 209 383 ORG
one 217 220 397 CARDINAL
Behringer 243 252 383 ORG
the Dynavoice M65 17 34 9191306739292312949 FAC
Klipsch 0 7 383 ORG
RF-82 II 8 16 386 PRODUCT
SP- 181 184 380 PERSON
Klipsch 283 290 383 ORG
eBay 6 10 383 ORG
CDM7SE 69 75 386 PRODUCT
550 83 86 394 MONEY
600 10 13 394 MONEY
HT 256 258 383 ORG
Elac Debut 0 10 383 ORG
240 51 54 394 MONEY
Amazon.com 64 74 388 WORK_OF_ART
Apple TV 4K 9 20 383 ORG
Sony 22 26 383 ORG
about 20 feet 99 112 395 QUANTITY
HT 138 140 383 ORG
FTFY 0 4 

100% 8 12 393 PERCENT
Wharfedales 21 32 388 WORK_OF_ART
1000 1 5 394 MONEY
Rsl Speedwoofer 19 34 380 PERSON
399 36 39 394 MONEY
Denon AVR-X3500H 141 157 383 ORG
1000 36 40 394 MONEY
3500 10 14 397 CARDINAL
3 year 39 45 391 DATE
3500 10 14 397 CARDINAL
Refurbed 31 39 383 ORG
3 year 45 51 391 DATE
Refurb Denon 0 12 383 ORG
999 26 29 394 MONEY
3 year 35 41 391 DATE
7.2 0 3 397 CARDINAL
Denon avr-x3500h 👍 0 18 383 ORG
Yamaha 126 132 383 ORG
RX-A1080 133 141 386 PRODUCT
Denon 162 167 383 ORG
Denon X series 80 94 386 PRODUCT
Yamaha 98 104 383 ORG
Adventage 105 114 386 PRODUCT
Denon 153 158 383 ORG
Yamaha 209 215 383 ORG
Yamaha 20 26 383 ORG
2080 27 31 397 CARDINAL
now 32 35 391 DATE
Denon 78 83 383 ORG
Brazil 95 101 384 GPE
Denon 139 144 383 ORG
Yamaha 176 182 383 ORG
2080 183 187 397 CARDINAL
Denon 0 5 383 ORG
Marantz 10 17 383 ORG
the Audyssey MultiEQ XT32 68 93 387 EVENT
Denon 18 23 383 ORG
X3500 45 50 383 ORG
Denon 76 81 383 ORG
7.2 42 45 397 CARDINAL
about $200 174 184 394 MONEY
Marantz

2 70 71 397 CARDINAL
8 inch 96 102 395 QUANTITY
craiglist 173 182 383 ORG
200$ 506 510 394 MONEY
100$ 629 633 394 MONEY
up to 200$ 653 663 394 MONEY
AVR 788 791 383 ORG
4k 851 853 397 CARDINAL
avr 895 898 383 ORG
50$. 905 909 397 CARDINAL
avr 916 919 383 ORG
a few years old 934 949 391 DATE
350$ 1033 1037 394 MONEY
350 1094 1097 397 CARDINAL
avr 1285 1288 383 ORG
around 100-200 1577 1591 397 CARDINAL
5.1 1844 1847 397 CARDINAL
faq 1979 1982 383 ORG
today 2132 2137 391 DATE
a year 2413 2419 391 DATE
all day 2495 2502 391 DATE
2.1 2648 2651 397 CARDINAL
one 2686 2689 397 CARDINAL
two 2740 2743 397 CARDINAL
dayton 77 83 383 ORG
one year 42 50 391 DATE
DENON 65 70 383 ORG
160 212 215 394 MONEY
DENON 234 239 383 ORG
230 377 380 394 MONEY
Audyssey Multi EQ 387 404 383 ORG
HEOS 409 413 383 ORG
150 796 799 394 MONEY
5.25 812 816 397 CARDINAL
Fluance 819 826 380 PERSON
two 878 881 397 CARDINAL
100 930 933 394 MONEY
5.25 967 971 397 CARDINAL
Infinity Reference 974 992 388 WORK_OF_ART
139 1051 10

HEDD 68 72 383 ORG
third 78 83 396 ORDINAL
Berlin 115 121 384 GPE
three 190 195 397 CARDINAL
Few days late 0 13 391 DATE
a few pennies 18 31 394 MONEY
Yamaha 60 66 383 ORG
JBL 99 102 383 ORG
bass 140 144 380 PERSON
A3X 20 23 397 CARDINAL
Sub8 25 29 385 LOC
Sub10 53 58 383 ORG
one second 18 28 392 TIME
300-400€ 45 53 394 MONEY
20Hz - 40Hz 211 222 397 CARDINAL
REL 96 99 383 ORG
Rythmik 101 108 380 PERSON
Martin Logan 113 125 380 PERSON
2 87 88 397 CARDINAL
post](https://www.reddit.com/r/audiophile 163 204 383 ORG
Redbook 13 20 388 WORK_OF_ART
24/96 62 67 397 CARDINAL
1k 378 380 394 MONEY
years 86 91 391 DATE
320k 116 120 397 CARDINAL
years ago 130 139 391 DATE
Redbook 491 498 386 PRODUCT
Johnny Audiophile 530 547 380 PERSON
Massdrop 557 565 380 PERSON
150 707 710 394 MONEY
DAC 711 714 383 ORG
DAC 778 781 383 ORG
40 929 931 394 MONEY
China 958 963 384 GPE
100 1067 1070 394 MONEY
Chinese 1413 1420 381 NORP
thousands of dollars 172 192 394 MONEY
Redbook 683 690 388 WORK_OF_ART
1 0 1 397 CAR

first 716 721 396 ORDINAL
HD6XX 1000 1005 381 NORP
Monoprice 23 32 383 ORG
He4xx 107 112 383 ORG
HE4xx 212 217 383 ORG
Monoprice 261 270 383 ORG
One 378 381 397 CARDINAL
HeadphoneAdvice 3 18 383 ORG
HE4xx 51 56 383 ORG
the whole day 471 484 391 DATE
Cherry MX Blues 313 328 388 WORK_OF_ART
Cherry MX Blues 371 386 386 PRODUCT
Alps Blues 436 446 383 ORG
IBM Beam Spring 450 465 383 ORG
thousands 560 569 397 CARDINAL
thousands 617 626 397 CARDINAL
200 755 758 394 MONEY
jack 1154 1158 380 PERSON
6xx 1314 1317 386 PRODUCT
HE4xx 1368 1373 383 ORG
6xx 1816 1819 396 ORDINAL
WH1000XM3 92 101 383 ORG
California 829 839 384 GPE
US 133 135 384 GPE
Europe 153 159 385 LOC
Europe 20 26 385 LOC
US 74 76 384 GPE
Klipsch ProMedia 29 45 383 ORG
the EVGA Nu Audio 54 71 386 PRODUCT
Two 115 118 397 CARDINAL
First 128 133 396 ORDINAL
Second 169 175 396 ORDINAL
first 378 383 396 ORDINAL
Klipsch 428 435 383 ORG
R-51M 442 447 386 PRODUCT
Micca 597 602 386 PRODUCT
Second 645 651 396 ORDINAL
gt;but 1 7 383 ORG
EVGA

In [161]:
# Larger scale test:
len(prod_list_from_srs)

790

In [168]:
print(len(Counter(prod_list_from_srs).most_common()))
print(len(Counter([p.lower() for p in prod_list_from_srs]).most_common()))
print(Counter([p.lower() for p in prod_list_from_srs]).most_common())

446
414
[('dsd', 29), ('dac', 24), ('yamaha', 22), ('klipsch', 20), ('denon', 18), ('amazon', 17), ('mqa', 16), ('avr', 13), ('flac', 12), ('pcm', 11), ('svs', 9), ('apple', 9), ('marantz', 8), ('dayton', 8), ('m1060', 8), ('ebay', 7), ('bluetooth', 7), ('sennheiser', 6), ('kef', 6), ('monoprice', 6), ('tidal', 6), ('deezer', 6), ('post](https://www.reddit.com/r/audiophile', 6), ('digital', 6), ('jvc', 5), ('sony', 5), ('hdmi', 5), ('onkyo', 5), ('spotify', 5), ('amazon hd', 5), ('itunes', 5), ('anc', 4), ('emotiva', 4), ('ht', 4), ('&gt', 4), ('kbps', 4), ('usb', 4), ('he4xx', 4), ('fiio', 3), ('lakers', 3), ('k3', 3), ('elac', 3), ('lcd', 3), ('apple tv', 3), ('polk', 3), ('nvidia', 3), ('plex', 3), ('goodwill', 3), ('dayton audio', 3), ('jbl', 3), ('headphoneadvice', 3), ('airplay2', 3), ('alac', 3), ('redbook', 3), ('audyssey', 3), ('x2hr', 2), ('fps', 2), ('nad', 2), ('kuzma', 2), ('dac/amp', 2), ('k5', 2), ('fyi', 2), ('sr50bt', 2), ('elacs', 2), ('subwoofer', 2), ('crown', 2), (

In [155]:
# Test on small amount of data
len(agg_product_list)

121

In [157]:
# Clean-up
prod_orgs = [p.rstrip() for p in agg_product_list]
Counter(prod_orgs).most_common()

[('T3', 7),
 ('Amp', 4),
 ('DAC', 3),
 ('Sennheiser', 2),
 ('Fiio', 2),
 ('770s', 2),
 ('JD', 2),
 ('K3', 2),
 ('Beyerdynamic', 2),
 ('Kuz', 2),
 ('Asus', 2),
 ('the Aorus Master', 2),
 ('usb dongle', 2),
 ('Sony', 2),
 ('Mountcastle', 1),
 ('The U.S. Census', 1),
 ('OC', 1),
 ('747', 1),
 ('BBUS', 1),
 ('Bumper', 1),
 ('Mt. Baden Powell', 1),
 ('https://en.wikipedia.org/wiki/Operation_Acid_Gambit', 1),
 ('pensabas que era tu hija', 1),
 ('el', 1),
 ('Grados', 1),
 ('SR80', 1),
 ('SR225', 1),
 ('Bluetooth', 1),
 ('JBL', 1),
 ('AMP', 1),
 ('e3000', 1),
 ('Audio Technica', 1),
 ('Triple Drivers', 1),
 ('Shure SE215', 1),
 ('EDM', 1),
 ('808s', 1),
 ('E3000', 1),
 ('MMCX', 1),
 ('IEM', 1),
 ('the T2', 1),
 ('Cloud II', 1),
 ('Schitt Modi 3', 1),
 ('JDS Labs Atom Amp', 1),
 ('Dac/Amp', 1),
 ('amp+dac', 1),
 ('DT', 1),
 ('990s', 1),
 ('Khadas Tone Board', 1),
 ('the HD 58X', 1),
 ('LAN', 1),
 ('FiiO E10k Amp', 1),
 ('FiiO', 1),
 ('fiio dac', 1),
 ('The HD 58X', 1),
 ('Fiio K3', 1),
 ('K5', 

In [90]:
agg_product_list

[(Samsung, buds), (Mifo05, ), (Sony, ch700n), (Soundcore, Liberty), (2, Pro)]

In [84]:
all_likely_products

['Lollllll',
 'Ibasso IT01',
 'Takfir',
 'Jumex Mango Nectar',
 'booch',
 "Y'Shaarj",
 '# CENSORED',
 'Grado',
 'Beyertreble',
 'Beyertreble',
 'Bass',
 'JDS Atom',
 'sounblaster G6',
 'JDS',
 'JDS',
 'X2hr',
 'Planars',
 'Viktor Frankl',
 'Hitler',
 'Hitler',
 'Hitler',
 'Hitler',
 'Hitler',
 'Hitler',
 'Hitler',
 'Collins',
 'Randy',
 'Randy',
 'Chino',
 'pogba',
 'Dior Bobby',
 'Fruits Basket',
 'Schumacher',
 'Audiotechnica AT2020 USB',
 'Micca Origen',
 'Schiit Fulla/Asgard',
 'Scarlett',
 'Android',
 'xbox',
 'xbox',
 'Soundcore Liberty']

In [None]:
# the little ice age (revised) how climate made history 1300-1850 (on eBay, revised much better).

In [47]:
nlp = spacy.load("en_core_web_sm") #sm
# For each sentence
text1 = "120£ Gets you something really good. Just get the Sony MDR-1A on Amazon."
text2 = "It is definitely a step above the Skullcandy Crusher."
text_comp = "120£ Gets you something really good. Just get the Sony MDR-1A on Amazon. It is definitely a step above the Skullcandy Crusher."


text_rnd = "Get the XM3’s. They’re better in pretty much every category, except for seamless connection, but you only need to pair them once. Sony also has really good warranty service if your earbuds decide to stop working. Sony also has a really good app for customizing your sound even more which I don’t think AirPods have"

nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated
doc = nlp(text_rnd)
sentences = [sent.string.strip() for sent in doc.sents]
print(sentences)
#doc = nlp("120£ Gets you something really good. Just get the Sony MDR-1A on Amazon.")
#doc = nlp("It is definitely a step above the Skullcandy Crusher.")
for d in sentences:
    doc = nlp(d)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label)

['Get the XM3’s.', 'They’re better in pretty much every category, except for seamless connection, but you only need to pair them once.', 'Sony also has really good warranty service if your earbuds decide to stop working.', 'Sony also has a really good app for customizing your sound even more which', 'I don’t think AirPods have']
Sony 0 4 383
Sony 0 4 383
AirPods 14 21 388


In [28]:
nlp = spacy.load("en_core_web_md")
doc = nlp("Unless you're overclocking it, the stock cooler is free and will do the job (albeit not too quietly.) If you want a cheap tower cooler that can handle a bit of overclocking (or just running at stock a little quieter) the Gammax 400 is a cheaper 212 Evo equivalent, plus it lights up if you're into that.")


for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label)

212 Evo 245 252 394


In [65]:
python -m spacy download en_core_web_md

Gammax 41 47 380


In [None]:
_         " If you want a cheap tower cooler that can handle a bit of overclocking (or just running at stock a little quieter) the Gammax 400 is a cheaper Evo 212 equivalent, plus it lights up if you're into that.".upper()

In [23]:
data_sr[0]
['display_name'] # e.g. headphones

['display_name']

In [111]:
# Detecting english:
from spacy_langdetect import LanguageDetector

nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
text = 'This is an english text.'
doc = nlp(text)
# document level language detection. Think of it like average language of the document!
print(doc._.language)

ModuleNotFoundError: No module named 'spacy_langdetect'