Sometimes your input data can be nested with more difficult structure than a simple table or a matrix.

In such cases it is sometime useful to shift mental orientation to analyze and extract information froms rows rather then non-defined columns.

In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import json
import numpy as np
import pprint
from nltk import download, word_tokenize

download('punkt')

[nltk_data] Downloading package punkt to /home/ramya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
with open("data/companies/companies.json") as dataf:
    data = [json.loads(line) for line in dataf]

An example of deeply nested data with various data types:

Exercise:

1. Name variable types
2. What do you do with lists, geo location?
3. What do you do with counts?

In [4]:
pprint.pprint(data[6])

{'description': "Here at 30 Day Labs we test mobile technologies so you don't "
                'have to! We constantly test and implement the latest '
                'strategies and measure the performance so you can make sure '
                'what you implement in your Apps will get you the results you '
                'want.',
 'domain': '30daylabs.com',
 'extension': {'address': 'Sydney Area, Australia',
               'geo_location': {'country': 'Australia',
                                'formatted_address': 'Sydney, Australia',
                                'location': [-33.8907897, 151.1896257],
                                'raw': 'Sydney Area, Australia',
                                'timezone': 'Australia/Sydney'},
               'geo_location_triple': {'administrative_area': 'New South Wales',
                                       'country': 'Australia'},
               'industries': [{'count': 1, 'industry': 'Computer Software'}],
               'job_positions

With such data you can be sure that you'll need a sparse matrix.

Remember `DictVectorizer` class? It accepts a dictionary and returns a sparse matrix.

So the only thing we need is a function such that 

```f(Json) => Dict```

First we need a function `deep_select` to retrieve nested values

In [5]:
### write your deep_select function here

def deep_select(initial_path, default=None):
    def helper(row, path=initial_path):
        if len(path) == 1:
            return row.get(path[0]) or default
        elif path[0] in row:
            return helper(row[path[0]], path[1:])
        else:
            return default
    return helper
    
jsondata = {'a': 
    {'b': 
         {'c': 1}
    }
}

# tests
assert deep_select(['a','b','c'])(jsondata) == 1
assert deep_select(['a','b'])(jsondata) == {'c': 1}
assert deep_select(['x'])(jsondata) == None

Click here to see the deep_select solution
<div class="spoiler">

def deep_select(initial_path, default=None):
    def helper(row, path=initial_path):
        if len(path) == 1:
            return row.get(path[0]) or default
        elif path[0] in row:
            return helper(row[path[0]], path[1:])
        else:
            return default
    return helper

</a>

In [6]:
def create_technologies_features(row):
    features = {}
    for tech in deep_select(['technologies'],[])(row):
        features[tech.lower()] = 1
    return features

create_technologies_features(data[1])

{'apple mobile web clips icon': 1,
 'canonical content tag': 1,
 'cascading style sheets': 1,
 'conditional comments': 1,
 'contact form 7': 1,
 'flexslider': 1,
 'font awesome': 1,
 'friends network': 1,
 'google analytics': 1,
 'google font api': 1,
 'google universal analytics': 1,
 'html 5 specific tags': 1,
 'html5 doctype': 1,
 'html5shiv': 1,
 'javascript': 1,
 'jquery': 1,
 'jquery form': 1,
 'jquery prettyphoto': 1,
 'jquery waypoints': 1,
 'json-ld': 1,
 'live writer support': 1,
 'meta description': 1,
 'nginx': 1,
 'nginx 1.8': 1,
 'nivo slider': 1,
 'open graph protocol': 1,
 'php': 1,
 'pingback support': 1,
 'really simple discovery': 1,
 'rss': 1,
 'sitelinks search box': 1,
 'slider revolution': 1,
 'themepunch': 1,
 'utf-8': 1,
 'viewport meta': 1,
 'visual composer': 1,
 'windows 8 pinning': 1,
 'wordpress': 1,
 'wordpress 4.1': 1,
 'wordpress plugins': 1,
 'yoast google analytics for wordpress': 1,
 'yoast plugins': 1}

So far so good what about text?

In [7]:
def create_description_features(row):
    features = {}
    for word in word_tokenize(row['description']):
        features["description=" + word.lower()] = 1
    return features

create_description_features(data[0])

{'description=,': 1,
 'description=.': 1,
 'description=10': 1,
 'description=and': 1,
 'description=any': 1,
 'description=apart': 1,
 'description=as': 1,
 'description=aucklands': 1,
 'description=best': 1,
 'description=blasting': 1,
 'description=building': 1,
 'description=chemical': 1,
 'description=cleaning': 1,
 'description=cleans': 1,
 'description=commitment': 1,
 'description=companies': 1,
 'description=company': 1,
 'description=complex': 1,
 'description=customer': 1,
 'description=decks': 1,
 'description=driveways': 1,
 'description=equipment': 1,
 'description=experience': 1,
 'description=exterior': 1,
 'description=for': 1,
 'description=from': 1,
 'description=have': 1,
 'description=hitting': 1,
 'description=home': 1,
 'description=homes': 1,
 'description=house': 1,
 'description=houses': 1,
 'description=in': 1,
 'description=industry': 1,
 'description=market': 1,
 'description=of': 1,
 'description=or': 1,
 'description=our': 1,
 'description=over': 1,
 'des

Let's create a more generic way to transform text

In [8]:
class TransformText():
    def __init__(self, field):
        self.field = field
    
    def __call__(self, row):
        features = {}
        for word in word_tokenize(deep_select(self.field,"")(row)):
            word = word.lower()
            features[word] = 1
        return features
    
text_transformer = TransformText(['description'])
text_transformer(data[6])

{'!': 1,
 '.': 1,
 '30': 1,
 'and': 1,
 'apps': 1,
 'at': 1,
 'can': 1,
 'constantly': 1,
 'day': 1,
 'do': 1,
 'get': 1,
 'have': 1,
 'here': 1,
 'implement': 1,
 'in': 1,
 'labs': 1,
 'latest': 1,
 'make': 1,
 'measure': 1,
 'mobile': 1,
 "n't": 1,
 'performance': 1,
 'results': 1,
 'so': 1,
 'strategies': 1,
 'sure': 1,
 'technologies': 1,
 'test': 1,
 'the': 1,
 'to': 1,
 'want': 1,
 'we': 1,
 'what': 1,
 'will': 1,
 'you': 1,
 'your': 1}

In [9]:
class TransformText():
    def __init__(self, field, tokenizer = word_tokenize):
        self.field = field
    
    def __call__(self, row):
        features = {}
        for word in word_tokenize(deep_select(self.field,"")(row)):
            word = word.lower()
            features[word] = 1
        return features
    
text_transformer = TransformText(['extension', "address"], tokenizer = lambda x: [x])
text_transformer(data[6])

{',': 1, 'area': 1, 'australia': 1, 'sydney': 1}

Exercise:
-------------
    
1. Write function or classes that transform other features? You'll need a function to retrieve nested values. 
2. There are some fields which you can treat as a categorical feature or a text features. What is best and why?
3. Write a function / class that will accept a list of transforming functions and creates a concatenation of the features
4. Wrap previous function in a scikit-learn transformer class so we can use it in a pipeline

In [10]:
deep_select(["extension"])(data[0])

{'address': 'Auckland, New Zealand',
 'geo_location': {'country': 'New Zealand',
  'formatted_address': '1010 Auckland, New Zealand',
  'location': [-36.8534665, 174.7655514],
  'raw': 'Auckland, New Zealand',
  'timezone': 'Pacific/Auckland'},
 'geo_location_triple': {'administrative_area': 'Auckland',
  'city': 'Auckland',
  'country': 'New Zealand'},
 'industries': [{'count': 1, 'industry': 'Real Estate'}],
 'job_positions': [{'count': 1,
   'job_position': 'Owner/Director at 0800 PRO WASH'}],
 'name': '0800 PRO WASH',
 'persons_in_database': 1,
 'skills': [{'count': 1, 'skill': 'Customer Service'},
  {'count': 1, 'skill': 'Leadership'},
  {'count': 1, 'skill': 'Training'},
  {'count': 1, 'skill': 'Business Planning'},
  {'count': 1, 'skill': 'Sales'},
  {'count': 1, 'skill': 'Team Building'},
  {'count': 1, 'skill': 'Management'},
  {'count': 1, 'skill': 'Account Management'},
  {'count': 1, 'skill': 'Marketing'},
  {'count': 1, 'skill': 'Business Development'},
  {'count': 1, 'ski

In [11]:
def create_extensions_features(row):
    features = {}
    for skill in deep_select(['extension', 'skills'],[])(row):
        features[skill.get("skill")] = skill.get("count")
    return features

create_extensions_features(data[2])

{'3D': 1,
 'Account Management': 3,
 'Accounts Payable': 1,
 'Accounts Receivable': 1,
 'Active Directory': 1,
 'Adobe Creative Suite': 1,
 'Advertising': 2,
 'After Effects': 1,
 'Architecture': 1,
 'Avid': 1,
 'Avid Media Composer': 1,
 'B2B': 1,
 'Banking': 2,
 'Blackberry': 1,
 'Branch Management': 1,
 'Brand Awareness': 1,
 'Brand Development': 1,
 'Broadcast': 1,
 'Broadcast Journalism': 1,
 'Broadcast Television': 1,
 'Budgets': 1,
 'Business Development': 2,
 'Business Strategy': 1,
 'CRM': 1,
 'Camera': 1,
 'Cloud Computing': 1,
 'Collections': 1,
 'Commercials': 1,
 'Communication': 1,
 'Construction Loans': 3,
 'Consulting': 1,
 'Consumer Lending': 2,
 'Content Strategy': 1,
 'Contract Negotiation': 3,
 'Corporate Branding': 1,
 'Creative Direction': 2,
 'Creative Strategy': 1,
 'Credit': 7,
 'Credit Analysis': 3,
 'Cross-functional Team Leadership': 1,
 'Customer Acquisition': 1,
 'Customer Retention': 2,
 'Customer Satisfaction': 1,
 'Customer Service': 10,
 'Data Center':

In [12]:
def create_skills(row):
    features = {}
    for skill in deep_select(['extension','skills'],[])(row):
        features[skill.get("skill")] = skill.get("count")
    return features

In [24]:
def create_industries(row):
    features = {}
    for industry in deep_select(['extension','industries'],[])(row):
        features[industry.get("industry")] = industry.get("count")
    return features

In [25]:
def combine_features(fs):

    def helper(row):
        all_features = {}
        for name, f in fs:
            for k,v in f(row).items():
                all_features[name + "_" + k] = v
        return all_features

    return helper

company_data = {
    'description': 'Fortune 500 hundred company company',
    'technologies': ['sql server', 'c#']
}


features_generator = combine_features([('description', TransformText(['description'])), 
                                       ('technologies', create_technologies_features),
                                      ('skills', create_skills),
                                      ("industries", create_industries)]) 

features_generator(data[0])

{'description_,': 1,
 'description_.': 1,
 'description_10': 1,
 'description_and': 1,
 'description_any': 1,
 'description_apart': 1,
 'description_as': 1,
 'description_aucklands': 1,
 'description_best': 1,
 'description_blasting': 1,
 'description_building': 1,
 'description_chemical': 1,
 'description_cleaning': 1,
 'description_cleans': 1,
 'description_commitment': 1,
 'description_companies': 1,
 'description_company': 1,
 'description_complex': 1,
 'description_customer': 1,
 'description_decks': 1,
 'description_driveways': 1,
 'description_equipment': 1,
 'description_experience': 1,
 'description_exterior': 1,
 'description_for': 1,
 'description_from': 1,
 'description_have': 1,
 'description_hitting': 1,
 'description_home': 1,
 'description_homes': 1,
 'description_house': 1,
 'description_houses': 1,
 'description_in': 1,
 'description_industry': 1,
 'description_market': 1,
 'description_of': 1,
 'description_or': 1,
 'description_our': 1,
 'description_over': 1,
 'des

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

class JsonTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, fs):
        self.fs = fs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        out = []
        for x in X:
            out.append(combine_features(self.fs)(x))
        return out
    
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('technologies', create_technologies_features),
                      #("industries", create_industries),
                    ('skills', create_skills)]),
    DictVectorizer()
)

pipeline.fit_transform(data)

<10000x178381 sparse matrix of type '<class 'numpy.float64'>'
	with 1467091 stored elements in Compressed Sparse Row format>

In [15]:
X = pipeline.fit_transform(data)
X.shape

(10000, 178381)

Number of features is really high. We need to reduce this. We can remove too sparse values.
To check the sparsity of the data we can use a method

X.getnnz (number of non-zero values)

In [16]:
X.getnnz(0)

array([854, 104,  90, ...,   1,   8,  66])

Exercise
===============

1. Write a transformation class called SparsityFilter that accepts a minimum frequency. Watch out for fit function - this class has some state that you must save

```
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        ???
        return self

    def transform(self, X):
        return ???
```

In [27]:
# write sparsity function here
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]

Double click to see the solution 

<div class="spoiler">

class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]
</div>

In [28]:
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText('description')), 
                     ('technologies', create_technologies_features),
                      ("industries", create_industries),
                     ('skills', create_skills)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25)
)

X = pipeline.fit_transform(data)
X.shape

(10000, 4235)

You should see a shape that is much smaller than the original

Exercise
================

1. Build a model - try to predict the industry.
2. Evaluate its results using cross validation - what would be the best measure for this problem?

In [31]:
# write solution here
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_predict

pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('technologies', create_technologies_features),
                      #("industries", create_industries),
                     ('skills', create_skills)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    XGBClassifier()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))




Accuracy = 0.3871


Click to see the solution

<div class="spoiler">

from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_predict

pipeline = make_pipeline(
    JsonTransformer([('description', TransformText('description')), 
                     ('technologies', create_technologies_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    XGBClassifier()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

</div>