In [None]:
%matplotlib inline

import os
import json 
import requests

import pandas as pd
import numpy as np

In [None]:
## Download Data
# Directory the notebook keeps its data files in, and the spreadsheet that
# fetch_file() downloads by default.
BASE = os.path.abspath(os.path.join('.', 'data'))
URL  = "http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"

def fetch_file(url=URL, name='online_retail.xlsx', base=BASE):
    """
    Fetch the data from the url and save it to the name in the base directory.

    Parameters
    ----------
    url : str
        Address to download from.
    name : str
        File name to store the download under.
    base : str
        Directory to store the file in; created if it does not exist yet.

    Raises
    ------
    Exception
        If a file already exists at the target path (nothing is downloaded).
    requests.HTTPError
        If the server answers with an error status (via raise_for_status()).
    """
    # Construct the path to save the data to
    path = os.path.join(base, name)
    if os.path.exists(path):
        raise Exception("Data is already donwloaded!")

    # Fetch the URL with requests
    response = requests.get(url)
    response.raise_for_status()

    # Only create the directory once the download has succeeded, so a failed
    # request leaves no empty folder behind.
    if not os.path.isdir(base):
        os.makedirs(base)

    # response.content is raw bytes, so the file must be opened in binary
    # mode: text mode raises TypeError on Python 3 and mangles the bytes on
    # platforms that translate line endings.
    with open(path, 'wb') as f:
        f.write(response.content)

# Uncomment below if you want to fetch the data 
# fetch_file()

In [None]:
# Location of the spreadsheet inside the BASE data directory; the file name
# matches the default `name` that fetch_file() saves the download under.
DATA = os.path.join(BASE, 'online_retail.xlsx')

## Load the spreadsheet into a data frame for exploration and modelling
df = pd.read_excel(DATA)

In [None]:
# Clean the free-text product descriptions before they are vectorized:
# missing values become a single space and every entry is coerced to text.
try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str      # Python 3: `str` is already unicode text

df['Description'] = df['Description'].fillna(' ').astype(text_type)

In [None]:
# Preview the first rows only instead of echoing the whole frame.
df.head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin 

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls a single named column out of a dataframe.
    """

    def __init__(self, column):
        # Name of the dataframe column handed on to the next step.
        self.column = column

    def fit(self, X, y=None):
        """
        Stateless step: there is nothing to learn, so the instance is returned as is.
        """
        return self

    def transform(self, df):
        """
        Return the values of the configured column of the given dataframe.
        Raises an Exception when the dataframe has no such column.
        """
        wanted = self.column
        if wanted in df.columns:
            return df[wanted].values
        raise Exception("Could not extract the '{}' column!".format(wanted))

In [None]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    """
    Encode the distinct values of an array as integer codes.

    The codes are positions in ``classes_``, the sorted unique values seen
    during ``fit``. Every method takes the data as ``X`` (``y`` is accepted
    and ignored) -- presumably so the encoder can be used as a Pipeline step.
    """

    def fit(self, X, y=None):
        """Learn the sorted set of distinct values in X; returns self."""
        self.classes_ = np.unique(X)
        return self

    def fit_transform(self, X, y=None):
        """Fit on X, then return X encoded as integer codes."""
        self.fit(X, y)
        return self.transform(X)

    def transform(self, X):
        """
        Map every value in X to its index in ``classes_``.

        Raises ValueError if X contains a value that was not seen during fit.
        """
        diff = np.setdiff1d(np.unique(X), self.classes_)
        if diff.size:
            raise ValueError("y contains new labels: %s" % str(diff))
        # classes_ is sorted (np.unique), so a binary search gives the index.
        return np.searchsorted(self.classes_, X)

    def inverse_transform(self, X):
        """
        Map integer codes back to the original values.

        Raises ValueError if X contains a code outside 0..len(classes_)-1.
        """
        X = np.asarray(X)
        diff = np.setdiff1d(X, np.arange(len(self.classes_)))
        # Test the size explicitly: the truth value of an array is ambiguous
        # for several elements and not allowed for an empty one.
        if diff.size:
            raise ValueError("y contains new labels: %s" % str(diff))
        return self.classes_[X]
    

class TwoD(BaseEstimator, TransformerMixin):
    """
    Pipeline step that wraps every element of its input in a one-item row,
    turning a flat sequence of values into a column.
    """

    def fit(self, X, y=None):
        """Stateless step: nothing to learn, returns self."""
        return self

    def transform(self, X):
        """Return an array holding one single-item row per element of X."""
        rows = []
        for value in X:
            rows.append([value])
        return np.asarray(rows)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.neighbors import NearestNeighbors


# Set up a feature extraction pipeline.
# Each branch of the FeatureUnion selects one dataframe column and turns it
# into a numeric block; the weighted blocks are stacked side by side and fed
# to the nearest-neighbors index.
# NOTE: `Imputer` only exists in older scikit-learn releases; newer ones
# provide sklearn.impute.SimpleImputer instead.
model = Pipeline([
    ('columns', FeatureUnion(
        transformer_list = [

            # Free-text product description -> TF-IDF term weights
            ('description', Pipeline([
                ('select', ColumnSelector('Description')),
                ('tfidf', TfidfVectorizer()),
            ])),

            # Country name -> integer code, shaped as a single column
            ('country', Pipeline([
                ('select', ColumnSelector('Country')),
                ('vect', LabelEncoder()),
                ('2d', TwoD()),
            ])),

            # Unit price as a single column, missing values -> column mean
            ('price', Pipeline([
                ('select', ColumnSelector('UnitPrice')),
                ('2d', TwoD()),
                ('impute', Imputer(missing_values=np.nan, strategy='mean')),
            ])),

            # Customer id as a single column, missing values -> column mean
            ('customer', Pipeline([
                ('select', ColumnSelector('CustomerID')),
                ('2d', TwoD()),
                ('impute', Imputer(missing_values=np.nan, strategy='mean')),
            ])),

        ],

        # Multiplier applied to each branch's output before stacking
        transformer_weights = {
            'description': 0.4,
            'country': 0.1,
            'price': 0.1,
            'customer': 0.4,
        },
    )),

    # The neighbor count is configured through `n_neighbors`; `k` is not a
    # NearestNeighbors constructor parameter.
    ('neighbors', NearestNeighbors(n_neighbors=15)),

])

model = model.fit(df)

In [None]:
import pickle 

# Persist the fitted pipeline in the BASE data directory.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably needs ColumnSelector, LabelEncoder and TwoD to be defined under
# the same module name as here -- confirm before loading it elsewhere.
model_path = os.path.join(BASE, 'neighbors.pickle')
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

In [None]:
# The last pipeline step ('neighbors') is the fitted NearestNeighbors
# estimator. Called without arguments, kneighbors_graph() presumably returns
# the neighbor graph of the training rows themselves -- TODO confirm for the
# installed scikit-learn version.
model.steps[-1][-1].kneighbors_graph()