In [16]:
import random
import pandas as pd
import numpy as np
import datetime

from math import sqrt, log, cos, sin, pi
from collections import Counter
from itertools import accumulate
from bisect import bisect_left
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def multi_char_replace(value, rep_dict):
    
    for k,v in rep_dict.items():
        value=value.replace(k,v)
    return value
    
def string_cleanse(value):
    cullchars = "()[],.'\";:<>!@£$%^&*"
    cull_d = {k:" " for k in cullchars}
    value = multi_char_replace(value, cull_d)
    return value.lower()

def string_include_list(value):
    include="0123456789"
    return "".join([c for c in value if c in include])

# Weighted random sample:
def wsample(population, weights, k=1):
    accum = list(accumulate(weights))
    total = accum[-1]
    sampl = {}
    while len(sampl) < k:
        index        = bisect_left(accum, total * random.random())
        sampl[index] = population[index]
    return list(sampl.values())

In [3]:
# Takes Ages, big file
# Possible future thing around finding SIC-grouped token probabilities.
company_names = pd.read_csv("sample/company_names.csv", encoding="latin")
company_names['_clean_name'] = company_names['CompanyName'].apply(lambda x : string_cleanse(x))
company_names['_clean_sic'] = company_names['SIC'].apply(lambda x : string_include_list(x))

#names,counts = zip(*sorted([(k,v) for k,v in Counter(" ".join([n for n in list(company_names['_clean_name'])]).lower().split()).items()], key=lambda x : x[1])[::-1])
#wsample(names, counts, k=6)



In [4]:
company_names.sample(12)

Unnamed: 0,CompanyName,IncorporationDate,SIC,_clean_name,_clean_sic
3446857,THE PRINCE ARTHUR PUBLIC HOUSE LLP,29/8/2007,,the prince arthur public house llp,
1551161,HAYE BUILD LIMITED,17/9/2008,41202.0,haye build limited,41202.0
2896676,RICHARD WHATMAN CONSULTING LTD,21/6/2017,74909.0,richard whatman consulting ltd,74909.0
1989421,LAND INVESTMENTS (UK) LTD,14/9/2006,68100.0,land investments uk ltd,68100.0
223685,AMEREX SERVICES LIMITED,31/5/1979,24100.0,amerex services limited,24100.0
3431969,THE LOUNGE CAFE LIMITED,28/9/2016,56102.0,the lounge cafe limited,56102.0
2744839,PROACTIVE COST CONSULTANCY LIMITED,28/6/2017,74902.0,proactive cost consultancy limited,74902.0
1145578,EMI MELODIES LIMITED,18/8/1980,58110.0,emi melodies limited,58110.0
1061202,DUKSBAK LIMITED,11/3/2010,82990.0,duksbak limited,82990.0
2375671,MUSWELL BOURNE CONSULTING LIMITED,9/6/2014,82990.0,muswell bourne consulting limited,82990.0


In [5]:
def t_join(vals):
    return " ".join([str(v) for v in vals])

In [6]:
sic_text=company_names.groupby("_clean_sic")["_clean_name"].apply(t_join)

In [7]:
sic_text.index

Index(['', '0111', '01110', '01120', '01130', '01140', '01150', '01160',
       '01190', '01210',
       ...
       '96040', '96090', '97000', '9800', '98000', '98100', '98200', '99000',
       '9999', '99999'],
      dtype='object', name='_clean_sic', length=1020)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
TFVer = TfidfVectorizer()

In [10]:
TFVer.fit(company_names['_clean_name'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
sic_tfidf_m = TFVer.fit_transform(sic_text.values)

In [12]:
sic_tfidf_m[33].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [13]:
sic_text.index[33]

'01629'

In [14]:
code = 1
print(sic_text.index[code])
sorted([(k,v) for k,v in Counter(sic_text[code].split()).items()], key=lambda x : x[1])[::-1][0:30]

0111


[('limited', 2),
 ('harwell', 1),
 ('farms', 1),
 ('grove', 1),
 ('farming', 1),
 ('ettington', 1)]

In [52]:
qq=np.array(sic_tfidf_m.todense())

In [29]:
vocab = sorted(TFVer.vocabulary_)

In [53]:
np.array(qq[0])

array([0., 0., 0., ..., 0., 0., 0.])

In [59]:
for e,r in enumerate(qq):
    print(sic_text.index[e])
    for t in np.argsort(r)[::-1][0:5]:
        print("\t", vocab[t])



	 llp
	 lp
	 partnership
	 limited
	 partners
0111
	 ettington
	 harwell
	 farming
	 farms
	 grove
01110
	 limited
	 farms
	 farming
	 farm
	 sons
01120
	 limited
	 ltd
	 celebspytv
	 jivanji
	 shuxing
01130
	 limited
	 ltd
	 nurseries
	 produce
	 potatoes
01140
	 limited
	 booker
	 tate
	 ltd
	 nessat
01150
	 limited
	 glpms
	 castillia
	 october1973
	 gregorys
01160
	 limited
	 ltd
	 farm
	 jackamax
	 hemptonic
01190
	 limited
	 nurseries
	 ltd
	 turf
	 nursery
01210
	 vineyard
	 limited
	 ltd
	 wines
	 vineyards
01220
	 limited
	 ltd
	 ideafruit
	 passion4africa
	 kadjebi
0123
	 flixton
	 southfield
	 agriculture
	 pig
	 farms
01230
	 limited
	 citrus
	 llwyntew
	 efti
	 kamisli
01240
	 limited
	 fruit
	 farm
	 ltd
	 orchard
01250
	 limited
	 ltd
	 farm
	 fruit
	 trees
01260
	 limited
	 cbbrickwork
	 alivu
	 lengian
	 beasam
01270
	 limited
	 ltd
	 tea
	 blackdown
	 finlay
01280
	 limited
	 ltd
	 herbs
	 botanicals
	 hemp
01290
	 limited
	 nurseries
	 ltd
	 plants
	 nursery
01300
	

	 bellwoven
	 elastics
	 felts
	 tubbs
	 limited
1760
	 beaujersey
	 limited
	 ÿstad
	 flocked
	 flockcourt
1771
	 kuit
	 ladkin
	 hosiery
	 hindley
	 ashley
1772
	 rrk
	 realisations
	 100
	 limited
	 flocaya
18110
	 limited
	 ltd
	 printing
	 media
	 mirror
18121
	 labels
	 limited
	 ltd
	 print
	 labelling
18129
	 limited
	 print
	 ltd
	 printing
	 printers
18130
	 limited
	 ltd
	 media
	 print
	 design
18140
	 limited
	 print
	 bookbinders
	 finishers
	 ltd
18201
	 limited
	 records
	 ltd
	 music
	 amplify
18202
	 limited
	 ltd
	 media
	 films
	 productions
18203
	 limited
	 ltd
	 media
	 digital
	 design
1822
	 whytex
	 diga
	 bentwood
	 limited
	 dunsford
1824
	 02516074
	 limited
	 flockfield
	 flockers
	 flocked
1910
	 tanning
	 wildloop
	 ottawa
	 litherland
	 garston
19100
	 fosbel
	 limited
	 scotash
	 rotoequip
	 coke
1920
	 shelton
	 martin
	 limited
	 ÿstad
	 flocaya
19201
	 limited
	 lubricants
	 oil
	 ltd
	 refinery
19209
	 limited
	 oil
	 ltd
	 uk
	 oils
1930
	 cornwel

	 limited
	 battery
	 batteries
	 ltd
	 power
2722
	 00717814
	 00261074
	 limited
	 ÿstad
	 flockcourt
27310
	 limited
	 ltd
	 cymtec
	 sensornet
	 voogit
27320
	 limited
	 cables
	 ltd
	 cable
	 electronics
2733
	 floform
	 limited
	 flockevents
	 flocked
	 flockcourt
27330
	 limited
	 ltd
	 wiring
	 ionix
	 blah
27400
	 lighting
	 limited
	 ltd
	 led
	 uk
2742
	 00581048
	 liquick
	 213
	 castings
	 sandwell
2744
	 ycol
	 ratcliffs
	 limited
	 mbl
	 birkett
2745
	 nottingham
	 universal
	 engineering
	 limited
	 flock2
2751
	 limited
	 beevor
	 castings
	 02610871
	 tapbarn
27510
	 limited
	 ltd
	 logicor
	 audio
	 dyson
2752
	 798279
	 cronite
	 scomark
	 limited
	 engineering
27520
	 limited
	 stoves
	 fires
	 ltd
	 solaform
27900
	 limited
	 ltd
	 electronics
	 systems
	 electrical
2811
	 limited
	 fabrications
	 engineering
	 sheet
	 metal
28110
	 limited
	 ltd
	 engineering
	 engines
	 turbines
2812
	 01903388
	 limited
	 flockevents
	 flocked
	 flockcourt
28120
	 limited
	 hyd

	 limited
	 ltd
	 utilities
	 services
	 engineering
42910
	 limited
	 ltd
	 services
	 engineering
	 water
42990
	 limited
	 ltd
	 construction
	 engineering
	 civil
43110
	 demolition
	 limited
	 ltd
	 services
	 groundworks
43120
	 limited
	 groundworks
	 ltd
	 construction
	 plant
43130
	 drilling
	 limited
	 ltd
	 services
	 geotechnical
43210
	 electrical
	 limited
	 ltd
	 services
	 contractors
43220
	 plumbing
	 heating
	 limited
	 ltd
	 services
43290
	 limited
	 ltd
	 construction
	 services
	 building
43310
	 plastering
	 limited
	 ltd
	 drylining
	 plasterers
43320
	 joinery
	 limited
	 carpentry
	 ltd
	 construction
43330
	 flooring
	 limited
	 ltd
	 tiling
	 carpets
43341
	 decorators
	 limited
	 decorating
	 ltd
	 painting
43342
	 windows
	 limited
	 glazing
	 ltd
	 glass
43390
	 ltd
	 limited
	 building
	 construction
	 services
43910
	 roofing
	 limited
	 ltd
	 cladding
	 services
43991
	 scaffolding
	 limited
	 ltd
	 scaffold
	 services
43999
	 limited
	 ltd
	 service

	 sc086299
	 limited
	 pies
	 butchers
	 ji
5134
	 shieling
	 whisky
	 scotch
	 holdings
	 the
5136
	 woodfletch
	 limited
	 flocaya
	 flockers
	 flocked
5138
	 norfish
	 shells
	 duthie
	 seafoods
	 ltd
5139
	 ht
	 row
	 front
	 foods
	 walker
5141
	 faraon
	 limited
	 flocaya
	 flocked
	 flockcourt
5142
	 footwear
	 kent
	 limited
	 ÿstad
	 floccity
5143
	 realisations
	 limited
	 flocare
	 flockchain
	 flock2
5144
	 aynsley
	 sons
	 limited
	 flocaya
	 flockcourt
5145
	 parfums
	 petits
	 les
	 london
	 ltd
5146
	 medema
	 caduceus
	 grouping
	 pharmaceuticals
	 economic
5147
	 lefray
	 chairs
	 homestyle
	 retford
	 limited
5154
	 magnets
	 uk
	 limited
	 ÿstad
	 flocash
5157
	 processors
	 hutchinson
	 stones
	 halifax
	 metal
5165
	 tools
	 machine
	 thompson
	 limited
	 flocash
5170
	 chemical
	 timber
	 limited
	 austin
	 bradford
5182
	 ecosoil
	 compaction
	 equipment
	 ltd
	 flocash
5184
	 kaopa
	 uk
	 limited
	 ÿstad
	 floc
5190
	 limited
	 gilhespie
	 hurner
	 stownhall
	 

	 limited
	 ltd
	 financial
	 pension
	 trustee
66300
	 limited
	 ltd
	 capital
	 management
	 investment
6711
	 04169970
	 consolidated
	 limited
	 futures
	 stone
68100
	 limited
	 ltd
	 properties
	 property
	 investments
68201
	 limited
	 properties
	 ltd
	 property
	 investments
68202
	 limited
	 ltd
	 properties
	 centre
	 property
68209
	 limited
	 properties
	 ltd
	 property
	 investments
68310
	 limited
	 ltd
	 property
	 estate
	 agents
68320
	 limited
	 management
	 ltd
	 property
	 company
69101
	 limited
	 legal
	 chambers
	 ltd
	 law
69102
	 limited
	 solicitors
	 legal
	 law
	 ltd
69109
	 limited
	 legal
	 ltd
	 law
	 services
69201
	 limited
	 accountancy
	 ltd
	 accounting
	 accountants
69202
	 limited
	 ltd
	 bookkeeping
	 accounting
	 accountancy
69203
	 tax
	 limited
	 ltd
	 taxation
	 services
70100
	 limited
	 holdings
	 group
	 ltd
	 uk
7011
	 limited
	 properties
	 developments
	 ltd
	 property
7012
	 limited
	 properties
	 investments
	 ltd
	 property
7020
	 li

	 limited
	 medical
	 ltd
	 services
	 healthcare
86220
	 limited
	 ltd
	 medical
	 clinic
	 services
86230
	 dental
	 limited
	 ltd
	 practice
	 care
86900
	 limited
	 ltd
	 care
	 healthcare
	 health
87100
	 care
	 limited
	 ltd
	 homes
	 home
87200
	 care
	 limited
	 ltd
	 services
	 healthcare
87300
	 care
	 limited
	 ltd
	 home
	 homes
87900
	 care
	 limited
	 ltd
	 services
	 home
88100
	 care
	 limited
	 ltd
	 social
	 services
88910
	 nursery
	 limited
	 ltd
	 childcare
	 day
88990
	 limited
	 ltd
	 social
	 care
	 work
90010
	 limited
	 ltd
	 theatre
	 productions
	 music
90020
	 limited
	 ltd
	 music
	 productions
	 events
90030
	 limited
	 ltd
	 productions
	 music
	 design
90040
	 limited
	 ltd
	 arts
	 art
	 the
91011
	 library
	 limited
	 community
	 the
	 ltd
91012
	 limited
	 archive
	 ltd
	 the
	 trust
91020
	 museum
	 trust
	 limited
	 the
	 heritage
91030
	 trust
	 limited
	 the
	 preservation
	 heritage
91040
	 limited
	 trust
	 wildlife
	 ltd
	 the
9111
	 trades
	 