In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import re
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)

import warnings
warnings.filterwarnings("ignore")



Read csv file into a dataframe

In [9]:
df = pd.read_csv("data/ebaytitles.csv")
df = df.sample(frac=0.1) # delete this line if you are brave and have many GBs of RAM
df.head()
#df.count()

Unnamed: 0,title,category_name
257551,Hotpoint RFA52S RFA52T RFAA52 Fridge Freezer T...,"Home, Furniture & DIY"
907037,wonderland NEW DVD,"DVDs, Films & TV"
100600,New Ladies Womens Designer Style Leather Tote ...,"Clothes, Shoes & Accessories"
77339,1x NGK Copper Core Spark Plug BKR6EZ (4619),Vehicle Parts & Accessories
511926,'Floral Butterflies' Fashion Case for GALAXY S...,Mobile Phones & Communication


Print out unique values of a column

In [6]:
df.category_name.unique()

array(['Vehicle Parts & Accessories', 'Home, Furniture & DIY', 'Music',
       'Clothes, Shoes & Accessories', 'Computers/Tablets & Networking',
       'Pottery, Porcelain & Glass', 'Cameras & Photography',
       'DVDs, Films & TV', 'Baby', 'Collectibles', 'Crafts',
       'Business, Office & Industrial', 'Sporting Goods',
       'Mobile Phones & Communication', 'Jewellery & Watches',
       'Toys & Games', 'Sound & Vision', 'Garden & Patio',
       'Sports Memorabilia', 'Video Games & Consoles',
       'Musical Instruments & Gear', 'Health & Beauty', 'Antiques',
       'Coins & Paper Money', 'Art', 'Everything Else',
       'Books, Comics & Magazines', 'Pet Supplies', 'Stamps',
       'Dolls & Bears', 'Consumer Electronics',
       'Cell Phones & Accessories', 'Wholesale & Job Lots',
       'Entertainment Memorabilia', 'Travel', 'Holidays & Travel',
       'Events Tickets'], dtype=object)

Split the data into train and test observations - there is a column

In [11]:
X = df.title.values
y = df.category_name.values

X_tr, X_te, y_tr, y_te = train_test_split(X, 
                                          y,
                                          test_size=0.1,
                                          random_state=0)


Exercise 
------------------

1. Count how many titles are in each category (```pandas.DataFrame.groupby```). Print out most common at the top

In [21]:
##########################
# put your solution here #
##########################
df.groupby("category_name")["title"].count().sort_values(ascending = False)

category_name
Vehicle Parts & Accessories       23158
Clothes, Shoes & Accessories      16857
Home, Furniture & DIY             12811
Computers/Tablets & Networking     6771
Jewellery & Watches                6122
Sporting Goods                     4686
Mobile Phones & Communication      3952
Crafts                             3403
Health & Beauty                    3346
Toys & Games                       2946
Business, Office & Industrial      2761
Collectibles                       2517
Sound & Vision                     1910
Music                              1346
Garden & Patio                     1067
Cameras & Photography               861
Pet Supplies                        694
Baby                                675
DVDs, Films & TV                    594
Art                                 563
Musical Instruments & Gear          446
Video Games & Consoles              435
Books, Comics & Magazines           433
Dolls & Bears                       292
Sports Memorabilia        

<a>Double click to show the solution</a>
<div class='spoiler'>

frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
print(frequencies)

</div>

Bag of words
--------------------

Different types of vectorizers:

<ul>
<li>```sklearn.feature_extraction.text.CountVectorizer``` - Counts the number of times a word appears in the text</li>
<li>```sklearn.feature_extraction.text.TfidfVectorizer``` - Weighs the words according to the importance of the word in the context of whole collection. Is the word ```the``` important if it appears in all documents?</li>
<li>```sklearn.feature_extraction.text.HashingVectorizer``` - Useful when you don't know the vocabulary upfront. Feature number is calculated as ```hash(token) % vocabulary_size```.</li>
</ul>

Exercise
-------------------
1. Use ```CountVectorizer``` / ```TfidfVectorizer``` to fit the collection of documents
2. How many unique tokens are there in text? Print some examples (ie first few hundred).
3. What methods you can use to reduce this number? 
   - Check out and experiment with the arguments: ```ngram_range```, ```min_df```. How the vocabulary size changes with each change?
   - What would you replace / delete from the text?
4. Write a custom function `clean_text` that accepts a text as input and transforms it (remove/hash numbers, delete short/long words etc.)
5. (Extra points) When would you use ```HashingVectorizer```?

In [50]:
##########################
# put your solution here #
##########################
count_vec = CountVectorizer(analyzer='word', min_df = 1, ngram_range = (2, 3))
count_vec.fit(X).vocabulary_

{'hotpoint rfa52s': 488171,
 'rfa52s rfa52t': 766232,
 'rfa52t rfaa52': 766234,
 'rfaa52 fridge': 766236,
 'fridge freezer': 415867,
 'freezer top': 415239,
 'top middle': 924224,
 'middle bottom': 606319,
 'bottom drawer': 199394,
 'drawer front': 337902,
 'front flap': 417432,
 'hotpoint rfa52s rfa52t': 488172,
 'rfa52s rfa52t rfaa52': 766233,
 'rfa52t rfaa52 fridge': 766235,
 'rfaa52 fridge freezer': 766237,
 'fridge freezer top': 415918,
 'freezer top middle': 415240,
 'top middle bottom': 924225,
 'middle bottom drawer': 606320,
 'bottom drawer front': 199396,
 'drawer front flap': 337905,
 'wonderland new': 1006347,
 'new dvd': 640222,
 'wonderland new dvd': 1006348,
 'new ladies': 641700,
 'ladies womens': 542191,
 'womens designer': 1004285,
 'designer style': 321054,
 'style leather': 881394,
 'leather tote': 552752,
 'tote shopper': 926572,
 'shopper bag': 821271,
 'bag celebrity': 152054,
 'celebrity handbag': 246609,
 'new ladies womens': 641840,
 'ladies womens designer': 

In [51]:
count_vec = CountVectorizer()
len(count_vec.fit(X).vocabulary_)

76349

In [57]:
count_vec = CountVectorizer(min_df = 50, ngram_range = (1, 1))
len(count_vec.fit(X).vocabulary_)

3012

In [52]:
tfidf = TfidfVectorizer()
tfidf.fit(X).vocabulary_

{'hotpoint': 40779,
 'rfa52s': 59074,
 'rfa52t': 59075,
 'rfaa52': 59076,
 'fridge': 36292,
 'freezer': 36247,
 'top': 68943,
 'middle': 49359,
 'bottom': 22131,
 'drawer': 30608,
 'front': 36347,
 'flap': 35475,
 'wonderland': 74053,
 'new': 51728,
 'dvd': 31164,
 'ladies': 45340,
 'womens': 74045,
 'designer': 29443,
 'style': 65872,
 'leather': 45800,
 'tote': 69030,
 'shopper': 62659,
 'bag': 19780,
 'celebrity': 24944,
 'handbag': 39383,
 '1x': 4960,
 'ngk': 51788,
 'copper': 26960,
 'core': 26990,
 'spark': 64295,
 'plug': 55542,
 'bkr6ez': 21427,
 '4619': 9745,
 'floral': 35642,
 'butterflies': 23238,
 'fashion': 34585,
 'case': 24506,
 'for': 35919,
 'galaxy': 37059,
 's4': 60581,
 'i9500': 41371,
 'slip': 63506,
 'pouch': 55933,
 'cover': 27215,
 'crummy': 27662,
 'cookie': 26908,
 'trick': 69467,
 'deluxe': 29311,
 'edition': 32021,
 'sensational': 62028,
 'street': 65715,
 'magic': 47786,
 'tricks': 69470,
 'london': 46640,
 'wall': 72902,
 'clock': 26074,
 'boxed': 22204,
 

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.vocabulary_)[:10])
    print(len(vect.vocabulary_))

vanilla
['new', 'apple', 'iphone', '6th', 'generation', 'inch', 'blue', 'gel', 'silicone', 'case']
71734
preprocessing
['new', 'apple', 'iphone', 'th', 'generation', 'inch', 'blue', 'gel', 'silicone', 'case']
39680
preprocessing + min_df=10
['new', 'apple', 'iphone', 'th', 'generation', 'inch', 'blue', 'gel', 'silicone', 'case']
7519


<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.vocabulary_)[:10])
    print(len(vect.vocabulary_))

</div>


Stemming
------------------

Linguistic normalization in which variant forms are reduced to a common form

    connection
    connections
    connective     --->   connect
    connected
    connecting
    
Usage:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    print(stemmer.stemWords("We are the world".split()))

**How to make preprocessing faster?**

There is map a function.
Below there is a comparison of three methods from the most naive to the fastest. 

Using a loop

In [59]:
%%time
preprocessed_naive = X_tr[:]
for n in range(preprocessed_naive.shape[0]):
    preprocessed_naive[n] = clean_text(preprocessed_naive[n])

CPU times: user 624 ms, sys: 0 ns, total: 624 ms
Wall time: 624 ms


Using ```map``` function 

In [60]:
%%time 
preprocessed_map = map(clean_text, X_tr)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.2 µs


Using ```pool.map``` function 

In [61]:
%%time
from multiprocessing import Pool

pool = Pool(8)
preprocessed_poolmap = pool.map(clean_text, X_tr)
pool.close()

CPU times: user 84 ms, sys: 128 ms, total: 212 ms
Wall time: 359 ms


Putting it into a pipeline
----------------------

Now that we know how to transform text data, let's put it into a pipeline.

1. Create a pipeline with `CountVectorizer`, `StandardScaler` and `SGDClassifier` as your final algorithm
    a) use alternative format for pipeline definition when you name the steps - refer to the documentation how to do this
2. Using ```sklearn.metrics.classification_report``` create a report about your classifier

In [62]:
##########################
# put your solution here #
##########################

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', GridSearchCV(SGDClassifier(), param_grid={"alpha":[0.1,0.01,0.001]}))])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

def add_padding(t):
    return t.rjust(30)

print(classification_report(list(map(add_padding, y_tr)), 
                            list(map(add_padding, preds))))


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  1.3min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  1.3min finished


                                precision    recall  f1-score   support

                           Art       0.63      0.66      0.64       504
                          Baby       0.82      0.60      0.69       587
                         Music       0.90      0.92      0.91      1223
                        Crafts       0.83      0.81      0.82      3062
                        Stamps       0.81      0.79      0.80        97
                        Travel       0.00      0.00      0.00         2
                      Antiques       0.51      0.12      0.20       156
                  Collectibles       0.76      0.66      0.71      2274
                  Pet Supplies       0.83      0.73      0.78       615
                  Toys & Games       0.84      0.76      0.80      2623
                 Dolls & Bears       0.86      0.84      0.85       259
                Garden & Patio       0.79      0.67      0.73       969
                Sound & Vision       0.75      0.71      0.73  

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', GridSearchCV(SGDClassifier(), param_grid={"alpha":[0.1,0.01,0.001]}))])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

def add_padding(t):
    return t.rjust(30)

print(classification_report(list(map(add_padding, y_tr)), 
                            list(map(add_padding, preds))))


</div>


Grid search
--------------------------

Scikit-learn has `GridSearchCV` and `RandomizedSearchCV`. Both have the same functionality and can be used to find good parameters for the models. What is great about both these classes that they are both transformers - they return an estimator so you can chain them and put in your pipeline.

**GridSearchCV** - you specify the exact values of the parameters you want to test
**RandomizedSearchCV** - you specify ranges of parameters

Exercise
----------------------

1. Use `GridSearchCV` or `RandomizedSearchCV` to find the best parameters for the models. Check at least 2 parameters.

In [None]:
##########################
# put your solution here #
##########################s

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

Grid search

Fitting 3 folds for each of 16 candidates, totalling 48 fits


<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

</div>


Useful materials

1. http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html