In [None]:
import hashlib
import json
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from spacy.matcher import Matcher

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the dataset one line at a time: each non-empty line is parsed as a JSON
# object and only its 'body' field is kept.
docs_ = []
# JSON text is UTF-8 by specification, so state the encoding explicitly rather
# than relying on the platform default.
with open('datasets/sample_data.json', encoding='utf-8') as fp:
    for line in fp:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        docs_.append(entry['body'])

print('Loaded {} documents'.format(len(docs_)))

In [None]:
# Load the 'en_core_web_md' spaCy model with the 'parser' pipeline component disabled.
nlp = spacy.load('en_core_web_md', disable=['parser'])

In [None]:
# Run every raw document body through the spaCy pipeline and keep the results in a list.
docs = list(nlp.pipe(docs_))

## Q1 - Feature Extraction with spacy I

You are working with text and it is important to know **how much data the text can provide you with**. In these exercises you will work with techniques that extract information from text data, and then use that information to improve your machine learning classifier.
To do this, we are going to start working with our sample dataset from Reddit where you'll be asked to extract some bits of knowledge.

In this first problem you are given a list of four names and your task is to find which one is not present in the given set of documents. Take advantage of spacy's `Matcher`.

In [None]:
# Candidate names for Q1; per the task description, one of them is absent from the documents.
names = ['Tom', 'John', 'Teresa', 'Christian']

In [None]:
# Q1 scaffold (left commented out): one Matcher pattern per candidate name,
# then print every matched span. This cell raises until the placeholder at
# the bottom is replaced with an implementation.
#matcher = Matcher(...)
#
#for name in names:
#    pattern = [...]
#    matcher.add(...)
#
#for doc in docs:
#    matches = ...
#    for ...:
#        span = ...
#        print(span)

# YOUR CODE HERE
raise NotImplementedError()

So, which of the names is not in the corpus of documents?

In [None]:
# Assign the name that is missing from the corpus to `answer`; this cell
# raises until the placeholder below is replaced.
#answer = ...

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Expected SHA-256 hex digest for the Q1 answer.
eh = '81f3bf42a93cf18dece9321ac5c93313126eb5ca92164d74643e4cbf60ecde9c'
# Hash the string form of `answer` and compare it with the expected digest.
assert hashlib.sha256(str(answer).encode()).hexdigest() == eh

## Q2 - Feature extraction with spacy II

The second problem is fairly similar, but in this case you'll have to count the number of occurrences that spacy interprets as being URLs. Looking at the following figure should help you choose the pattern to use now.

![](media/token_attributes.png)

In [None]:
# Q2 scaffold (left commented out): a single Matcher pattern, with the number
# of matches per document accumulated into `count`. This cell raises until
# the placeholder at the bottom is replaced with an implementation.
#matcher = Matcher(...)
#
#pattern = [...]
#matcher.add(...)
#
#count = ...
#for doc in docs:
#    matches = ...
#    count += ...
        
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Expected SHA-256 hex digest for the Q2 count.
eh = '2abaca4911e68fa9bfbf3482ee797fd5b9045b841fdff7253557c5fe15de6477'
# Hash the string form of `count` and compare it with the expected digest.
assert hashlib.sha256(str(count).encode()).hexdigest() == eh

## Q3 - Extracting Part of Speech features

Now you are going to check how reliable the vanilla part of speech tagging from spacy is. To do that, first, match all the adjectives and then look at the five most common.

To help you, here's the list of PoS available in spacy:

![](media/pos_helper.png)

In [None]:
# Q3 scaffold (left commented out): match adjectives and collect the
# lower-cased text of each match in `adjs`. This cell raises until the
# placeholder at the bottom is replaced with an implementation.
#matcher = ...
#pattern = [...]
#matcher.add(...)
#
#adjs = list()
#
#for doc in docs:
#    matches = ...
#    for ...:
#        span = ...
#        adjs.append(str(span).lower())

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Show the five most frequent entries of `adjs` together with their counts.
# (`Counter` is imported with the other imports in the notebook's first cell.)
Counter(adjs).most_common(5)

So, does it look like the most common adjectives are indeed adjectives?

- [ ] Yes
- [ ] No

In [None]:
# Assign either the string "Yes" or the string "No" to `answer`; this cell
# raises until the placeholder below is replaced.
#answer = "Yes" or "No"

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Expected SHA-256 hex digest for the Q3 answer.
eh = '1ea442a134b2a184bd5d40104401f2a37fbc09ccf3f4bc9da161c6099be3691d'
# Hash the string form of `answer` and compare it with the expected digest.
assert hashlib.sha256(str(answer).encode()).hexdigest() == eh

## Q4 - Extracting "complex" patterns

How many people are going somewhere productive in these documents? (Count the instances with the following structure: a verb with lemma go, followed by an adposition and one of the places listed.)

Note: this is a very simple heuristic, and a setup like this is only reasonable for a problem in which you are getting familiar with spacy.

In [None]:
# Places that count as "productive" destinations for Q4.
places = ['School', 'College', 'Library']

# Q4 scaffold (left commented out): one three-token pattern per place, with
# the number of matches per document accumulated into `count`. This cell
# raises until the placeholder at the bottom is replaced with an implementation.
#matcher = ...
#
#for place in places:
#    pattern = [..., ..., ...]
#    matcher.add(...)
#
#count = ...
#    
#for doc in docs:
#    matches = ...
#    count += ...

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Expected SHA-256 hex digest for the Q4 count.
eh = '4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a'
# Hash the string form of `count` and compare it with the expected digest.
assert hashlib.sha256(str(count).encode()).hexdigest() == eh

## Q5 - Adding Extra Features

You are given the task of building a better spam classifier. Some factors that may help are the number of adjectives in each sms, the number of verbs used, and the length of the messages.

Add extra fields to your dataframe with the counts of adjectives (ADJ) and verbs (VERB) that Spacy recognized in each sms, as well as the length of the message (number of characters).

How many adjectives and verbs do we have in total?

In [None]:
# Load the SMS spam CSV, discard the three "Unnamed" columns, give the two
# remaining columns readable names and keep only the first 3000 rows.
# The steps are chained without `inplace=True` so that re-running the cell
# always starts from the file on disk, and the final `.copy()` makes `df` an
# independent frame, so columns added later are not written to a slice of the
# full dataset.
df = (
    pd.read_csv('datasets/spam.csv', encoding='latin1')
    .drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    .rename(columns={"v1": "label", "v2": "message"})
    .iloc[:3000]
    .copy()
)

In [None]:
# Q5 scaffold (left commented out): collect per-message adjective counts,
# verb counts and message lengths, then store them as new columns of `df`.
# This cell raises until the placeholder at the bottom is replaced.
# Hint: you can iterate over the tokens in Spacy doc to know each PoS tag for each token
# for doc in nlp.pipe(df['message']):
#    for token in doc:
#        print(token.pos_)

#n_adj = []
#n_verbs = []
#len_message = []
#
#for doc in nlp.pipe(df['message']):
#    n_adj.append(...)
#    n_verbs.append(...)
#    len_message.append(...)
#    
#df['n_adj'] = ...
#df['n_verbs'] = ...
#df['len_message'] = ...

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Sum the per-message counts into corpus-wide totals.
total_adjs = sum(n_adj)
total_verbs = sum(n_verbs)
# Accept totals within 20 of the reference values. The tolerance must be
# passed as `atol`: the third positional argument of np.allclose is `rtol`,
# so a positional 20 would mean a 2000% relative tolerance and the check
# would pass for almost any count.
assert np.allclose(total_adjs, 3863, rtol=0, atol=20)
assert np.allclose(total_verbs, 10696, rtol=0, atol=20)

## Q6 - Feature Unions

Now that we have these features, can we build a classifier using them?

In [None]:
# Hold out 20% of the rows of `df` for testing; random_state=42 keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of a data frame so that text-specific steps can run on it.

    The column is looked up with a bare key (``X[key]``); for a pandas
    DataFrame that yields a one-dimensional Series.
    """
    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of a data frame so that numeric steps can run on it.

    The column is looked up with a one-element list (``X[[key]]``); for a
    pandas DataFrame that yields a single-column, two-dimensional frame.
    """
    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]

In [None]:
def get_accuracy(feats, train_data, test_data):
    """
    Fit a random forest on top of ``feats`` and report its accuracy on the test set.

    Parameters
    ----------
    feats : transformer used as the 'features' step of the pipeline.
    train_data : data the pipeline is fitted on; its ``label`` attribute is the target.
    test_data : data the pipeline is scored on; its ``label`` attribute holds the true labels.

    Returns
    -------
    The mean of ``predictions == test_data.label``, i.e. the fraction of test
    rows predicted correctly. The value is also printed with four decimals.
    """
    steps = [
        ('features', feats),
        # Fixed seed so repeated runs build the same forest.
        ('classifier', RandomForestClassifier(random_state = 42)),
    ]
    model = Pipeline(steps)
    model.fit(train_data, train_data.label)

    hits = model.predict(test_data) == test_data.label
    accuracy = np.mean(hits)

    print("Accuracy: {:.4f}".format(accuracy))
    return accuracy

**a)** Use FeatureUnion to join text features extracted from a standard TfidfVectorizer with the numeric feature for the counts of adjectives in the messages.

In [None]:
# Q6a scaffold (left commented out): a text pipeline and an adjective-count
# pipeline joined into `feats`. This cell raises until the placeholder at
# the bottom is replaced with an implementation.
#text = Pipeline(...)
#adj =  Pipeline(...)
#feats = FeatureUnion(...)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Fit and score the pipeline built from the current `feats`.
accuracy = get_accuracy(feats, train_data, test_data)
# The third positional argument of np.allclose is rtol, so this accepts
# scores within about 1% (relative) of 0.9683.
assert np.allclose(accuracy, 0.9683, 0.01)

**b)** Now add the number of verbs and see if the accuracy improves. You should notice that more features don't always mean better accuracy, as you may have seen previously during the LDSSA.

In [None]:
# Q6b scaffold (left commented out): text, adjective-count and verb-count
# pipelines joined into `feats`. This cell raises until the placeholder at
# the bottom is replaced with an implementation.
#text = Pipeline(...)
#adj =  Pipeline(...)
#verbs = Pipeline(...)
#feats = FeatureUnion(...)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Fit and score the pipeline built from the current `feats`.
accuracy = get_accuracy(feats, train_data, test_data)
# The third positional argument of np.allclose is rtol, so this accepts
# scores within about 1% (relative) of 0.9583.
assert np.allclose(accuracy, 0.9583, 0.01)

**c)** Finally make use of the length of the messages as well and see whether it improves the model accuracy or not.

In [None]:
# Q6c scaffold (left commented out): text, adjective-count, verb-count and
# message-length pipelines joined into `feats`. This cell raises until the
# placeholder at the bottom is replaced with an implementation.
#text = Pipeline(...)
#adj =  Pipeline(...)
#verbs = Pipeline(...)
#len_message = Pipeline(...)
#feats = FeatureUnion(...)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Fit and score the pipeline built from the current `feats`.
accuracy = get_accuracy(feats, train_data, test_data)
# The third positional argument of np.allclose is rtol, so this accepts
# scores within about 1% (relative) of 0.9717.
assert np.allclose(accuracy, 0.9717, 0.01)