Sometimes your input data can be nested with more difficult structure than a simple table or a matrix.

In such cases it is sometime useful to shift mental orientation to analyze and extract information froms rows rather then non-defined columns.

In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import json
import numpy as np
import pprint
from nltk import download, word_tokenize

download('punkt')

[nltk_data] Downloading package punkt to /home/greg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
with open("data/companies/companies.json") as dataf:
    data = [json.loads(line) for line in dataf]

An example of deeply nested data with various data types:

Exercise:

1. Name variable types
2. What do you do with lists, geo location?
3. What do you do with counts?

In [4]:
pprint.pprint(data[6])

{'description': "Here at 30 Day Labs we test mobile technologies so you don't "
                'have to! We constantly test and implement the latest '
                'strategies and measure the performance so you can make sure '
                'what you implement in your Apps will get you the results you '
                'want.',
 'domain': '30daylabs.com',
 'extension': {'address': 'Sydney Area, Australia',
               'geo_location': {'country': 'Australia',
                                'formatted_address': 'Sydney, Australia',
                                'location': [-33.8907897, 151.1896257],
                                'raw': 'Sydney Area, Australia',
                                'timezone': 'Australia/Sydney'},
               'geo_location_triple': {'administrative_area': 'New South Wales',
                                       'country': 'Australia'},
               'industries': [{'count': 1, 'industry': 'Computer Software'}],
               'job_positions

With such data you can be sure that you'll need a sparse matrix.

Remember `DictVectorizer` class? It accepts a dictionary and returns a sparse matrix.

So the only thing we need is a function such that 

```f(Json) => Dict```

First we need a function `deep_select` to retrieve nested values

In [5]:
### write your deep_select function here

def deep_select(initial_path, default=None):
    def helper(row, path=initial_path):
        if len(path) == 1:
            if path[0] in row:
                return row[path[0]]
            else:
                return default
        else:
            if path[0] in row:
                return helper(row[path[0]],path[1:])
            else:
                return default
    return helper
    
jsondata = {'a': 
    {'b': 
         {'c': 1}
    }
}

# tests
assert deep_select(['a','b','c'])(jsondata) == 1
assert deep_select(['a','b'])(jsondata) == {'c': 1}
assert deep_select(['x'])(jsondata) == None

Click here to see the deep_select solution
<div class="spoiler">

def deep_select(initial_path, default=None):
    def helper(row, path=initial_path):
        if len(path) == 1:
            return row.get(path[0]) or default
        elif path[0] in row:
            return helper(row[path[0]], path[1:])
        else:
            return default
    return helper

</a>

In [6]:
def create_technologies_features(row):
    features = {}
    for tech in deep_select(['technologies'],[])(row):
        features[tech] = 1
    return features

create_technologies_features(data[1])

{'Apple Mobile Web Clips Icon': 1,
 'Canonical Content Tag': 1,
 'Cascading Style Sheets': 1,
 'Conditional Comments': 1,
 'Contact Form 7': 1,
 'FlexSlider': 1,
 'Font Awesome': 1,
 'Friends Network': 1,
 'Google Analytics': 1,
 'Google Font API': 1,
 'Google Universal Analytics': 1,
 'HTML 5 Specific Tags': 1,
 'HTML5 DocType': 1,
 'JSON-LD': 1,
 'Javascript': 1,
 'Live Writer Support': 1,
 'Meta Description': 1,
 'Nivo Slider': 1,
 'Open Graph Protocol': 1,
 'PHP': 1,
 'Pingback Support': 1,
 'RSS': 1,
 'Really Simple Discovery': 1,
 'Sitelinks Search Box': 1,
 'Slider Revolution': 1,
 'ThemePunch': 1,
 'UTF-8': 1,
 'Viewport Meta': 1,
 'Visual Composer': 1,
 'Windows 8 Pinning': 1,
 'WordPress': 1,
 'Wordpress 4.1': 1,
 'Wordpress Plugins': 1,
 'Yoast Google Analytics for WordPress': 1,
 'Yoast Plugins': 1,
 'html5shiv': 1,
 'jQuery': 1,
 'jQuery Form': 1,
 'jQuery Waypoints': 1,
 'jQuery prettyPhoto': 1,
 'nginx': 1,
 'nginx 1.8': 1}

So far so good what about text?

In [7]:
def create_description_features(row):
    features = {}
    for word in word_tokenize(row['description']):
        features["description=" + word.lower()] = 1
    return features

create_description_features(data[0])

{'description=,': 1,
 'description=.': 1,
 'description=10': 1,
 'description=and': 1,
 'description=any': 1,
 'description=apart': 1,
 'description=as': 1,
 'description=aucklands': 1,
 'description=best': 1,
 'description=blasting': 1,
 'description=building': 1,
 'description=chemical': 1,
 'description=cleaning': 1,
 'description=cleans': 1,
 'description=commitment': 1,
 'description=companies': 1,
 'description=company': 1,
 'description=complex': 1,
 'description=customer': 1,
 'description=decks': 1,
 'description=driveways': 1,
 'description=equipment': 1,
 'description=experience': 1,
 'description=exterior': 1,
 'description=for': 1,
 'description=from': 1,
 'description=have': 1,
 'description=hitting': 1,
 'description=home': 1,
 'description=homes': 1,
 'description=house': 1,
 'description=houses': 1,
 'description=in': 1,
 'description=industry': 1,
 'description=market': 1,
 'description=of': 1,
 'description=or': 1,
 'description=our': 1,
 'description=over': 1,
 'des

Let's create a more generic way to transform text

In [8]:
class TransformText():
    def __init__(self, field, tokenizer = word_tokenize):
        self.field = field
        self.tokenizer = tokenizer
    
    def __call__(self, row):
        features = {}
        for word in self.tokenizer(deep_select(self.field,"")(row)):
            word = word.lower()
            features[word] = 1
        return features
    
text_transformer = TransformText(['description'], tokenizer = lambda x:[x] )
text_transformer(data[6])

{"here at 30 day labs we test mobile technologies so you don't have to! we constantly test and implement the latest strategies and measure the performance so you can make sure what you implement in your apps will get you the results you want.": 1}

In [9]:
deep_select(['extension','geo_location','location'])(data[6])


[-33.8907897, 151.1896257]

Exercise:
-------------
    
1. Write function or classes that transform other features? You'll need a function to retrieve nested values. 
2. There are some fields which you can treat as a categorical feature or a text features. What is best and why?
3. Write a function / class that will accept a list of transforming functions and creates a concatenation of the features
4. Wrap previous function in a scikit-learn transformer class so we can use it in a pipeline

In [10]:
def combine_features(fs):

    def helper(row):
        all_features = {}
        for name, f in fs:
            for k,v in f(row).items():
                all_features[name + "_" + k] = v
        return dict(all_features)

    return helper

company_data = {
    'description': 'Fortune 500 hundred company',
    'technologies': ['sql server', 'c#']
}


features_generator = combine_features([('description', TransformText(['description'])), 
                                       ('technologies', create_technologies_features)]) 

features_generator(company_data)

{'description_500': 1,
 'description_company': 1,
 'description_fortune': 1,
 'description_hundred': 1,
 'technologies_c#': 1,
 'technologies_sql server': 1}

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

class JsonTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, fs):
        self.fs = fs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        out = []
        for x in X:
            out.append(combine_features(self.fs)(x))
        return out
    
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('technologies', create_technologies_features)]),
    DictVectorizer()
)

pipeline.fit_transform([company_data, company_data])

<2x6 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [12]:
X = pipeline.fit_transform(data)
X.shape

(10000, 85141)

Number of features is really high. We need to reduce this. We can remove too sparse values.
To check the sparsity of the data we can use a method

X.getnnz (number of non-zero values)

In [13]:
X.getnnz(0)

array([854, 104,  90, ..., 514,   1,  42])

Exercise
===============

1. Write a transformation class called SparsityFilter that accepts a minimum frequency. Watch out for fit function - this class has some state that you must save

```
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        ???
        return self

    def transform(self, X):
        return ???
```

In [14]:
# write sparsity function here
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.nnz = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:,self.nnz >= self.min_nnz]


Double click to see the solution 

<div class="spoiler">

class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]
</div>

In [15]:
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText('description')), 
                     ('technologies', create_technologies_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=250)
)

X = pipeline.fit_transform(data)
X.shape

(10000, 142)

You should see a shape that is much smaller than the original

Exercise
================

1. Build a model - try to predict the industry.
2. Evaluate its results using cross validation - what would be the best measure for this problem?

In [48]:
# write solution here
from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict

def makelist(x):
    return [x]


pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('specialities',TransformText(['specialities'], makelist)),
                     ('technologies', create_technologies_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=5),
    RandomForestClassifier(n_jobs=-1,n_estimators=100)
    #MultinomialNB()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y,n_jobs=-1)



Click to see the solution

<div class="spoiler">

from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_predict

pipeline = make_pipeline(
    JsonTransformer([('description', TransformText('description')), 
                     ('technologies', create_technologies_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    XGBClassifier()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

</div>

In [49]:
from sklearn.metrics import classification_report, accuracy_score


print(accuracy_score(y,predictions))

print(classification_report(y, 
                            predictions))

0.3186
                                        precision    recall  f1-score   support

                                             0.63      0.92      0.75       186
                            Accounting       0.62      0.48      0.54       106
                     Airlines/Aviation       0.25      0.05      0.09        37
        Alternative Dispute Resolution       0.00      0.00      0.00         8
                  Alternative Medicine       0.00      0.00      0.00         7
                             Animation       0.00      0.00      0.00        11
                     Apparel & Fashion       0.36      0.25      0.30        95
               Architecture & Planning       0.46      0.38      0.42       132
                          Architektura       0.00      0.00      0.00         2
                       Arts and Crafts       0.00      0.00      0.00        34
                            Automotive       0.51      0.65      0.57       147
                  Aviation & Aer

  'precision', 'predicted', average, warn_for)
