In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [3]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)

import warnings
warnings.filterwarnings("ignore")

Read csv file into a dataframe

In [4]:
df = pd.read_csv("data/ebaytitles.csv")
df = df.sample(frac=0.1) # delete this line if you are brave and have many GBs of RAM
df.head()

Unnamed: 0,title,category_name
170255,Maison Martin Margiela fabric high necklace wi...,Jewellery & Watches
440097,PERSONALISED Wedding Engagement Anniversary Pa...,Crafts
412988,Tax Disc Holder Tube suitable for Triumph Adve...,Vehicle Parts & Accessories
739040,Aromatherapy Hand & Nail Cream Essential Oil 6...,Health & Beauty
211177,New Genuine BATTERY EB425161LU 1500mAh FOR SAM...,Mobile Phones & Communication


Print out unique values of a column

In [5]:
df.category_name.unique()

array(['Jewellery & Watches', 'Crafts', 'Vehicle Parts & Accessories',
       'Health & Beauty', 'Mobile Phones & Communication',
       'Clothes, Shoes & Accessories', 'Home, Furniture & DIY',
       'Sporting Goods', 'Pet Supplies', 'Computers/Tablets & Networking',
       'Art', 'Toys & Games', 'Business, Office & Industrial',
       'Books, Comics & Magazines', 'Music', 'DVDs, Films & TV',
       'Cameras & Photography', 'Sports Memorabilia', 'Sound & Vision',
       'Collectibles', 'Musical Instruments & Gear', 'Baby',
       'Consumer Electronics', 'Garden & Patio', 'Dolls & Bears',
       'Antiques', 'Stamps', 'Cell Phones & Accessories',
       'Everything Else', 'Coins & Paper Money', 'Video Games & Consoles',
       'Pottery, Porcelain & Glass', 'Wholesale & Job Lots',
       'Entertainment Memorabilia', 'Travel', 'Holidays & Travel'], dtype=object)

Split the data into train and test observations - there is a column

In [6]:
X = df.title.values
y = df.category_name.values

X_tr, X_te, y_tr, y_te = train_test_split(X, 
                                          y,
                                          test_size=0.1,
                                          random_state=0)

Exercise 
------------------

1. Count how many titles are in each category (```pandas.DataFrame.groupby```). Print out most common at the top

In [17]:
##########################
# put your solution here #
##########################

grouped = df.groupby(by="category_name")['title'].count()
grouped = grouped.sort_values(ascending=False)
grouped

category_name
Vehicle Parts & Accessories       23412
Clothes, Shoes & Accessories      16819
Home, Furniture & DIY             12812
Computers/Tablets & Networking     6755
Jewellery & Watches                6216
Sporting Goods                     4688
Mobile Phones & Communication      3858
Crafts                             3371
Health & Beauty                    3292
Toys & Games                       2946
Business, Office & Industrial      2839
Collectibles                       2338
Sound & Vision                     1902
Music                              1324
Garden & Patio                     1031
Cameras & Photography               904
Baby                                700
Pet Supplies                        633
Art                                 598
DVDs, Films & TV                    586
Video Games & Consoles              486
Books, Comics & Magazines           463
Musical Instruments & Gear          425
Dolls & Bears                       281
Sports Memorabilia        

<a>Double click to show the solution</a>
<div class='spoiler'>

frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
print(frequencies)

</div>

Bag of words
--------------------

Different types of vectorizers:

<ul>
<li>```sklearn.feature_extraction.text.CountVectorizer``` - Counts the number of times a word appears in the text</li>
<li>```sklearn.feature_extraction.text.TfidfVectorizer``` - Weighs the words according to the importance of the word in the context of whole collection. Is the word ```the``` important if it appears in all documents?</li>
<li>```sklearn.feature_extraction.text.HashingVectorizer``` - Useful when you don't know the vocabulary upfront. Feature number is calculated as ```hash(token) % vocabulary_size```.</li>
</ul>

Exercise
-------------------
1. Use ```CountVectorizer``` / ```TfidfVectorizer``` to fit the collection of documents
2. How many unique tokens are there in text? Print some examples (ie first few hundred).
3. What methods you can use to reduce this number? 
   - Check out and experiment with the arguments: ```ngram_range```, ```min_df```. How the vocabulary size changes with each change?
   - What would you replace / delete from the text?
4. Write a custom function `clean_text` that accepts a text as input and transforms it (remove/hash numbers, delete short/long words etc.)
5. (Extra points) When would you use ```HashingVectorizer```?

In [24]:
##########################
# put your solution here #
##########################

cv = CountVectorizer()
cv.fit(X_tr)
print(list(cv.vocabulary_)[:50])
print(len(cv.vocabulary_))

tfidf = TfidfVectorizer()
tfidf.fit(X_tr)
print(list(tfidf.vocabulary_)[:50])
print(len(tfidf.vocabulary_))

['coco', 'l9338', 'ladies', 'black', 'flat', 'knee', 'high', 'pull', 'on', 'boots', 'sale', 'cisco', 'male', 'db', '60', 'to', '15', 'meter', '10', 'cable', '72', '0789', '01', 'cab', 'x21mt', 'hp', '650', 'usb', 'board', '01016yy00', '600', 'inspector', 'parker', 'betrapped', 'bundle', 'pc', 'cd', 'brand', 'new', 'sealed', 'oem', 'box', 'hotpoint', 'wmpg762guk', 'washing', 'machine', 'drum', 'paddle', 'hole', 'rug']
71706
['coco', 'l9338', 'ladies', 'black', 'flat', 'knee', 'high', 'pull', 'on', 'boots', 'sale', 'cisco', 'male', 'db', '60', 'to', '15', 'meter', '10', 'cable', '72', '0789', '01', 'cab', 'x21mt', 'hp', '650', 'usb', 'board', '01016yy00', '600', 'inspector', 'parker', 'betrapped', 'bundle', 'pc', 'cd', 'brand', 'new', 'sealed', 'oem', 'box', 'hotpoint', 'wmpg762guk', 'washing', 'machine', 'drum', 'paddle', 'hole', 'rug']
71706


In [29]:
cv = CountVectorizer(min_df=2)
cv.fit(X_tr)
print(list(cv.vocabulary_)[:50])
print(len(cv.vocabulary_))

['coco', 'l9338', 'ladies', 'black', 'flat', 'knee', 'high', 'pull', 'on', 'boots', 'sale', 'cisco', 'male', 'db', '60', 'to', '15', 'meter', '10', 'cable', '72', '01', 'cab', 'hp', '650', 'usb', 'board', '600', 'inspector', 'parker', 'bundle', 'pc', 'cd', 'brand', 'new', 'sealed', 'oem', 'box', 'hotpoint', 'wmpg762guk', 'washing', 'machine', 'drum', 'paddle', 'hole', 'rug', 'shaggy', 'thick', 'luxury', 'pile']
29244


In [30]:
cv = CountVectorizer(ngram_range=(1,2))
cv.fit(X_tr)
print(list(cv.vocabulary_)[:50])
print(len(cv.vocabulary_))

['coco', 'l9338', 'ladies', 'black', 'flat', 'knee', 'high', 'pull', 'on', 'boots', 'sale', 'coco l9338', 'l9338 ladies', 'ladies black', 'black flat', 'flat knee', 'knee high', 'high pull', 'pull on', 'on boots', 'boots sale', 'cisco', 'male', 'db', '60', 'to', '15', 'meter', '10', 'cable', '72', '0789', '01', 'cab', 'x21mt', 'cisco male', 'male db', 'db 60', '60 to', 'to male', 'db 15', '15 meter', 'meter 10', '10 cable', 'cable 72', '72 0789', '0789 01', '01 cab', 'cab x21mt', 'hp']
477233


In [37]:
import re

def clean_text(txt):
    txt_ = txt
    txt_ = re.sub(r'[A-Za-z]+[0-9]+',r'',txt_)
    txt_ = re.sub(r'[0-9]+',r'',txt_)
    return txt_

sample="Greg01 00034234 023F FF"
clean_text(sample)

cv = CountVectorizer(preprocessor=clean_text)
cv.fit(X_tr)
print(list(cv.vocabulary_)[:50])
print(len(cv.vocabulary_))

['Coco', 'Ladies', 'Black', 'Flat', 'Knee', 'High', 'Pull', 'On', 'Boots', 'Sale', 'Cisco', 'Male', 'DB', 'to', 'Meter', 'Cable', 'CAB', 'MT', 'HP', 'USB', 'Board', 'Inspector', 'Parker', 'Betrapped', 'Bundle', 'PC', 'CD', 'BRAND', 'NEW', 'SEALED', 'OEM', 'BOX', 'Hotpoint', 'GUK', 'Washing', 'Machine', 'Drum', 'Paddle', 'Hole', 'RUG', 'SHAGGY', 'THICK', 'LUXURY', 'PILE', 'SILVER', 'GREY', 'CM', 'New', 'NGK', 'Ignition']
56198


<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.vocabulary_)[:10])
    print(len(vect.vocabulary_))

</div>


Stemming
------------------

Linguistic normalization in which variant forms are reduced to a common form

    connection
    connections
    connective     --->   connect
    connected
    connecting
    
Usage:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    print(stemmer.stemWords("We are the world".split()))

In [87]:
import snowballstemmer

stemmer = snowballstemmer.stemmer('english')
print(" ".join(stemmer.stemWords("We are the worlds".split())))

We are the world


**How to make preprocessing faster?**

There is map a function.
Below there is a comparison of three methods from the most naive to the fastest. 

Using a loop

In [38]:
%%time
preprocessed_naive = X_tr[:]
for n in range(preprocessed_naive.shape[0]):
    preprocessed_naive[n] = clean_text(preprocessed_naive[n])

CPU times: user 908 ms, sys: 8 ms, total: 916 ms
Wall time: 918 ms


Using ```map``` function 

In [39]:
%%time 
preprocessed_map = map(clean_text, X_tr)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16 µs


Using ```pool.map``` function 

In [49]:
%%time
from multiprocessing import Pool

pool = Pool(5)
preprocessed_poolmap = pool.map(clean_text, X_tr)
pool.close()

CPU times: user 68 ms, sys: 124 ms, total: 192 ms
Wall time: 375 ms


Putting it into a pipeline
----------------------

Now that we know how to transform text data, let's put it into a pipeline.

1. Create a pipeline with `CountVectorizer`, `StandardScaler` and `SGDClassifier` as your final algorithm
    a) use alternative format for pipeline definition when you name the steps - refer to the documentation how to do this
2. Using ```sklearn.metrics.classification_report``` create a report about your classifier

In [88]:
##########################
# put your solution here #
##########################

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
import snowballstemmer

stemmer = snowballstemmer.stemmer('english')

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    t = " ".join(stemmer.stemWords(t.split()))
    return t

pipeline = Pipeline([
                ('count', CountVectorizer(min_df=5,preprocessor=clean_text)),
                ('scale', StandardScaler(with_mean=False)),
                ('classify', SGDClassifier())
            ])
"""
preds = cross_val_predict(pipeline, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)
                          """

'\npreds = cross_val_predict(pipeline, \n                          X_tr, \n                          y_tr, \n                          cv=8, n_jobs=-1, verbose=True)\n                          '

In [60]:
from sklearn.metrics import classification_report

def add_padding(t):
    return t.rjust(30)

print(classification_report(list(map(add_padding, y_tr)), 
                            list(map(add_padding, preds))))

                                precision    recall  f1-score   support

                           Art       0.58      0.51      0.54       519
                          Baby       0.69      0.56      0.62       640
                         Music       0.89      0.85      0.87      1205
                        Crafts       0.81      0.74      0.78      3040
                        Stamps       0.58      0.70      0.63        80
                        Travel       0.00      0.50      0.00         2
                      Antiques       0.28      0.29      0.28       182
                  Collectibles       0.70      0.60      0.65      2109
                  Pet Supplies       0.76      0.62      0.68       573
                  Toys & Games       0.79      0.68      0.73      2654
                 Dolls & Bears       0.81      0.75      0.78       253
                Garden & Patio       0.75      0.61      0.68       931
                Sound & Vision       0.73      0.61      0.67  

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', GridSearchCV(SGDClassifier(), param_grid={"alpha":[0.1,0.01,0.001]}))])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

def add_padding(t):
    return t.rjust(30)

print(classification_report(list(map(add_padding, y_tr)), 
                            list(map(add_padding, preds))))


</div>


Grid search
--------------------------

Scikit-learn has `GridSearchCV` and `RandomizedSearchCV`. Both have the same functionality and can be used to find good parameters for the models. What is great about both these classes that they are both transformers - they return an estimator so you can chain them and put in your pipeline.

**GridSearchCV** - you specify the exact values of the parameters you want to test
**RandomizedSearchCV** - you specify ranges of parameters

Exercise
----------------------

1. Use `GridSearchCV` or `RandomizedSearchCV` to find the best parameters for the models. Check at least 2 parameters.

In [89]:
##########################
# put your solution here #
##########################

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()
"""
params = {'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'count__analyzer': ["word","char"],
          'count__binary': [True, False]}
"""
params = {'count__ngram_range': [(1,7)],
          'count__analyzer': ["char"],
          'count__binary': [True]}

grid_clf = GridSearchCV(pipeline, params, n_jobs=-1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)


Grid search

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished


0.864 {'count__analyzer': 'char', 'count__binary': True, 'count__ngram_range': (1, 7)}


In [None]:
for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'count__analyzer': ["word","char"],
          'classify__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(pipeline, params, n_jobs=-1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

</div>


Useful materials

1. http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html