In [8]:
!wget https://www.dropbox.com/s/6mfe1c4tjqjn8d4/preprocessed_data.pkl?dl=1
!mv preprocessed_data.pkl?dl=1 preprocessed_data.pkl
!ls -lh


Redirecting output to ‘wget-log.1’.
total 71M
-rw-r--r-- 1 root root  71M Sep 20 10:57 preprocessed_data.pkl
drwxr-xr-x 2 root root 4.0K Sep 13 17:28 sample_data
-rw-r--r-- 1 root root 2.5K Sep 20 10:27 wget-log
-rw-r--r-- 1 root root 2.7K Sep 20 10:57 wget-log.1


In [0]:
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the DataFrame that the download cell saved as preprocessed_data.pkl.
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles obtained from a trusted source.
df = pd.read_pickle("./preprocessed_data.pkl")
# Number of top-scoring unigrams / bigrams printed per industry
# by mostCorrelatedNgrams.
N=10

In [0]:
def mostCorrelatedNgrams(series, labels=None, top_n=None):
  """Print the highest chi2-scoring unigrams and bigrams for every industry.

  The texts are vectorised with TF-IDF over unigrams and bigrams. For each
  distinct label a one-vs-rest chi2 test is run against the TF-IDF features,
  and the `top_n` best-scoring unigrams and bigrams are printed in ascending
  score order (the last line of each list is the strongest).

  NOTE: the chi2 statistic is unsigned, so a high score means "strongly
  associated with the label" in either direction; a term that is unusually
  rare for an industry can also rank highly.

  Parameters
  ----------
  series : iterable of str
      One document per row (e.g. a pandas Series of cleaned text).
  labels : array-like, optional
      Class label for each document, in the same order as `series`.
      Defaults to the notebook-level ``df['industry']``.
  top_n : int, optional
      How many unigrams and bigrams to print per label. Defaults to the
      notebook-level constant ``N``.

  Raises
  ------
  ValueError
      If `labels` does not have one entry per document in `series`.
  """
  # Fall back to the notebook globals so existing one-argument calls still work.
  if labels is None:
    labels = df['industry']
  if top_n is None:
    top_n = N
  labels = pd.Series(labels)

  # min_df=20: drop terms seen in fewer than 20 documents.
  # max_df=0.5: drop terms seen in more than half of the documents.
  # ngram_range=(1, 2): extract unigrams and bigrams.
  # max_features=3000: keep only the 3000 most frequent remaining terms.
  tfidf = TfidfVectorizer(min_df = 20, max_df=0.5, ngram_range=(1, 2), max_features=3000)
  # Keep the matrix sparse: chi2 accepts sparse input, so densifying it
  # only costs memory.
  features = tfidf.fit_transform(series)

  if len(labels) != features.shape[0]:
    raise ValueError(
        "labels has {} entries but series has {} documents".format(
            len(labels), features.shape[0]))

  # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
  # accessor and fall back to the old one on older installations.
  get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
  # The vocabulary is the same for every label, so build it (and the
  # unigram/bigram masks) once, outside the loop.
  feature_names = np.asarray(get_names())
  token_counts = np.array([len(name.split(' ')) for name in feature_names])
  is_unigram = token_counts == 1
  is_bigram = token_counts == 2

  def _top(names):
    # names[-0:] would return everything, so handle non-positive top_n explicitly.
    return names[-top_n:] if top_n > 0 else names[:0]

  for industry in sorted(labels.unique()):
    # One-vs-rest: chi2 returns (scores, p-values); only the scores are used.
    scores = chi2(features, labels == industry)[0]
    order = np.argsort(scores)  # ascending, strongest association last
    ranked_names = feature_names[order]
    unigrams = ranked_names[is_unigram[order]]
    bigrams = ranked_names[is_bigram[order]]
    print("# '{}':".format(industry))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(_top(unigrams))))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(_top(bigrams))))

In [11]:
# Rank n-grams per industry using only the `description_cleaned` column.
mostCorrelatedNgrams(df['description_cleaned'])

# 'Accounting':
  . Most correlated unigrams:
       . busi
       . financi
       . firm
       . charter
       . fiscal
       . payrol
       . audit
       . bookkeep
       . tax
       . account
  . Most correlated bigrams:
       . medium size
       . profession servic
       . size busi
       . busi plan
       . advisori servic
       . financi plan
       . individu busi
       . busi owner
       . busi consult
       . small busi
# 'Apparel & Fashion':
  . Most correlated unigrams:
       . style
       . women
       . accessori
       . shoe
       . wear
       . shirt
       . garment
       . cloth
       . fashion
       . apparel
  . Most correlated bigrams:
       . custom base
       . real estat
       . los angel
       . year ago
       . make differ
       . hong kong
       . york citi
       . qualiti product
       . new york
       . high end
# 'Architecture & Planning':
  . Most correlated unigrams:
       . obra
       . ontwerp
       . plan
       .

In [12]:
# Rank n-grams per industry using only the `html_cleaned` column.
mostCorrelatedNgrams(df['html_cleaned'])

# 'Accounting':
  . Most correlated unigrams:
       . calcul
       . firm
       . charter
       . busi
       . ir
       . financi
       . audit
       . payrol
       . account
       . tax
  . Most correlated bigrams:
       . non profit
       . help busi
       . profession servic
       . free consult
       . financi servic
       . servic client
       . busi solut
       . servic busi
       . busi owner
       . small busi
# 'Apparel & Fashion':
  . Most correlated unigrams:
       . shop
       . wear
       . men
       . tee
       . bag
       . cloth
       . apparel
       . dress
       . fashion
       . shirt
  . Most correlated bigrams:
       . right reserv
       . javascript enabl
       . ltd right
       . mail list
       . creat account
       . use cooki
       . custom servic
       . add cart
       . shop cart
       . quick view
# 'Architecture & Planning':
  . Most correlated unigrams:
       . ontwerp
       . obra
       . project
       . reside

In [13]:
# Rank n-grams per industry on both text columns joined with a space.
# NOTE(review): a missing value in either column would make the joined row NaN;
# this assumes both columns are fully populated.
mostCorrelatedNgrams(df['description_cleaned'] + ' ' + df['html_cleaned'])

# 'Accounting':
  . Most correlated unigrams:
       . advisori
       . firm
       . busi
       . charter
       . ir
       . financi
       . audit
       . payrol
       . account
       . tax
  . Most correlated bigrams:
       . person servic
       . financi servic
       . busi solut
       . servic client
       . rang servic
       . small medium
       . profession servic
       . servic busi
       . busi owner
       . small busi
# 'Apparel & Fashion':
  . Most correlated unigrams:
       . shoe
       . women
       . bag
       . tee
       . wear
       . dress
       . cloth
       . shirt
       . apparel
       . fashion
  . Most correlated bigrams:
       . social media
       . web design
       . qualiti product
       . year ago
       . javascript enabl
       . custom servic
       . real estat
       . new york
       . add cart
       . shop cart
# 'Architecture & Planning':
  . Most correlated unigrams:
       . ontwerp
       . obra
       . residenti
   