# Lab -- Pipelines

In this assignment, you will modify an sklearn pipeline that classifies whether a url is a phish.
You will:
* modify a url processor to extract the path as its own column
* enhance a domain feature transformer to extract additional domain features
* create a path feature transformer to extract features based on the url path
* add an additional pipeline to the provided ColumnTransformer that integrates your Path transformer

Resources:

* Much of the code in this started from https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html#classification-pipeline
* Also see https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#examples-using-sklearn-pipeline-pipeline

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

import cloudpickle

import datetime

In [2]:
#!wget 'https://research.aalto.fi/files/16859732/urlset.csv.zip'
#!unzip urlset.csv.zip
df = pd.read_csv('urlset.csv', encoding_errors='ignore', on_bad_lines='skip')

# select just the two columns we care about, and then drop na's before taking a train-test split
df = df[['domain','label']]
df = df.dropna()

X, y = df[['domain']], df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

  df = pd.read_csv('urlset.csv', encoding_errors='ignore', on_bad_lines='skip')


In [3]:
# Create a function that ex
def url_processor(data):
    ''' 
    Expects a pandas dataframe with column 'domain', where the domain actually is a full url. (The 
    source dataset calls it `domain`.) 
    
    Returns a new dataframe with features extracted from the url. 
    '''
    
    # extract the domain as being everything before the first `/`
    domain = data['domain'].str.split('/').str[0]
    
    # Add another column called "path" for everything after the first "/", but before any query string (before any `?`, if any)
    #
    # * Hint: There might be multiple `/` in the path, so you can't just modify the code above to take `.str[1]`.
    #   To deal with this, see `n` argument here: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html
    # * Replace all null-values in with an empty string. 
    #   See https://pandas.pydata.org/docs/reference/api/pandas.Series.fillna.html
    
    path = data['domain'].str.split('/',1).str[1].fillna('')
    path = path.str.split('?').str[0]
    
    # DataFrame constructor accepts a dict of column-names mapped to pandas series.
    return pd.DataFrame(
        {'domain': domain, 'path': path}
    )

url_processor_transformer = FunctionTransformer(url_processor)

In [4]:
def domain_features(domains):
    '''Expects a list of domains, and returns a list of dicts (one row per domain) with new features'''
    return [
        {
            "length": len(domain),
            
            # add a second feature here counting the number of `.`'s in the domain
            "count_of_dots": domain.count('.'),
            
            # add a third feature here for the domain's TLD (DictVectorizer will handle one-hot encoding for us)
            "tld": domain.rsplit('.',1)[-1]
        } for domain in domains
    ]

domain_features_transformer = FunctionTransformer(domain_features)

In [5]:
# Create another transformer for the `path` feature. 
# Follow the pattern of the `domain_features` cell above

def path_features(paths):
    # create two features --
    # * `path_length`, as the length of the `path
    # * `count_of_slash, as the count of the number of `/` characters in the path
    return [
        {
            "path_length": len(path),

            # add a second feature here counting the number of `.`'s in the domain
            "count_of_slash": path.count('/')
        } for path in paths
    ]

path_features_transformer = FunctionTransformer(path_features)

In [6]:
pipeline = Pipeline(
    [
        ( "prep", url_processor_transformer),
        # Use ColumnTransformer to combine the subject and body features
        (
            "union",
            ColumnTransformer(
                [
                    # Pipeline for processing features from the 'domain' column
                    (
                        "domain_features",
                        Pipeline(
                            [
                                (
                                    "features",
                                    domain_features_transformer,
                                ),  # returns a list of dicts
                                (
                                    "vect",
                                    DictVectorizer(),
                                ),  # list of dicts -> feature matrix
                            ]
                        ),
                        'domain',
                    ),
                    # Add another pipeline here for extracting features from the 'path' column.
                    # Follow the example of the previous tuple in this list.
                    
                    (
                        "path_features",
                        Pipeline(
                            [
                                (
                                    "features",
                                    path_features_transformer,
                                ),  # returns a list of dicts
                                (
                                    "vect",
                                    DictVectorizer(),
                                ),  # list of dicts -> feature matrix
                            ]
                        ),
                        'path',
                    ),
                ],
            ),
        ),
        # Use a SVC classifier on the combined features
        ("svc", LinearSVC(dual=False)),
    ],
    verbose=True,
)

In [7]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))

[Pipeline] .............. (step 1 of 3) Processing prep, total=   1.4s
[Pipeline] ............. (step 2 of 3) Processing union, total=   2.9s
[Pipeline] ............... (step 3 of 3) Processing svc, total=   2.6s
Classification report:

              precision    recall  f1-score   support

         0.0       0.75      0.86      0.80     11951
         1.0       0.83      0.71      0.77     12028

    accuracy                           0.79     23979
   macro avg       0.79      0.79      0.78     23979
weighted avg       0.79      0.79      0.78     23979



In [8]:
# Pickle using CloudPickle, which will also pick up our transformer functions and whatnot
# Append the current timestamp to the filename so I stop forgetting which one is the
# most recent.

timestamp = int(datetime.datetime.now().timestamp())

with open(f'phish-model-{timestamp}.cloudpickle', 'wb') as f:
    cloudpickle.dump(pipeline, f)