In [177]:
# Web Scraping
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

# FAQ_ID labelling
import pandas as pd 
import numpy as np
from sklearn.cluster import OPTICS
from sklearn.preprocessing import MaxAbsScaler

# Help-centre landing page for every N26 market that gets crawled.
# Each URL ends in a five-character "<language>-<country>" market code.
starting_urls = [
    'https://support.n26.com/' + market_code
    for market_code in (
        'de-de', 'de-at', 'en-at', 'en-de', 'en-it', 'it-it', 'en-eu',
        'en-fr', 'fr-fr', 'en-es', 'es-es', 'en-us', 'en-gb',
    )
]

# Problem statement

The task is to collect FAQ data from a neobank named N26 and to match equivalent questions across languages. It is, in effect, a web scraping and clustering problem, and that shaped how it was approached.

# Scraping the FAQ data from the helpdesk of N26 across all markets (40%)


### Crawl (40%)


In [178]:
# Fallback data: if the live crawl is not available, work from this saved extract.
extracted_n26 = pd.read_csv('../data/extracted_n26.csv', index_col=0)

# The same extract de-duplicated on the answer text (`content`).
extracted_n26_no_duplicates = extracted_n26.drop_duplicates(subset=['content'])

# Plain list-of-rows view of the extract.
extracted_data = extracted_n26.values.tolist()

In [179]:
# Preview the first rows of the provided extract.
extracted_n26.head()


Unnamed: 0,url,market,title,content
0,https://support.n26.com/de-at/app-und-produkte...,de-at,Was sind Spaces?,"Setze dir Ziele, erstelle, personalisiere und ..."
1,https://support.n26.com/en-es/account-and-pers...,en-es,How to download my personal data?,The General Data Protection Regulation (GDPR) ...
2,https://support.n26.com/de-at/zahlungen-ueberw...,de-at,Wie lange dauert eine Überweisung mit Transfer...,Wenn du eine Überweisung mit TransferWise mach...
3,https://support.n26.com/en-de/app-and-features...,en-de,How to manage my visibility as an N26 user?,To grant you access to all our in-app features...
4,https://support.n26.com/en-de/payments-transfe...,en-de,How often can I withdraw cash for free?,Cash withdrawals using your Mastercard depend ...


In [180]:
# Row counts of the provided extract, before and after de-duplicating on answer text.
print(f'df length: {len(extracted_n26)}')
print(f'df length without duplicates: {len(extracted_n26_no_duplicates)}')

df length: 1412
df length without duplicates: 654


In [181]:
%%time

# Crawl every market's help centre and record one row per FAQ answer page.
# Result: `df` with columns url / market / title / content, and `errors`, a list of
# one-element lists describing every answer page that could not be extracted.

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
REQUEST_TIMEOUT = 30  # seconds; stops a single stalled request from hanging the whole crawl
errors = []
records = []

# A session sends `headers` as real HTTP headers on every request and reuses connections.
# (Passing `headers` as the second positional argument of requests.get would send it as
# URL query parameters instead of headers.)
session = requests.Session()
session.headers.update(headers)

for url in tqdm(starting_urls):

    # The market code is the last five characters of the landing URL, e.g. 'de-de'.
    market = url[-5:]
    market_prefix = '/' + market + '/'

    # get faq links from page (it's only the first 6 that we need)
    page = session.get(url, timeout=REQUEST_TIMEOUT)
    faq_soup = BeautifulSoup(page.content, 'html.parser')
    # href=True skips anchors without an href attribute instead of raising KeyError.
    faq_links = [anchor['href'] for anchor in faq_soup.find_all('a', href=True)
                 if anchor['href'].startswith(market_prefix)][:6]

    for link in faq_links:
        # link[6:] drops the leading '/<market>' so the remainder can be appended to `url`.
        q_page = session.get(url + link[6:], timeout=REQUEST_TIMEOUT)
        q_soup = BeautifulSoup(q_page.content, 'html.parser')

        # Take link and title from the SAME anchor so they can never get out of step.
        questions = [(anchor['href'], anchor.string)
                     for anchor in q_soup.find_all('a', href=True)
                     if anchor['href'].startswith(market_prefix)
                     and anchor.string is not None
                     and anchor.string.endswith('?')]

        # get answers from question links and collect one record per answer page
        for q_link, title in questions:
            a_url = url + q_link[6:]
            try:
                a_page = session.get(a_url, timeout=REQUEST_TIMEOUT)
                a_source = BeautifulSoup(a_page.content, 'html.parser')
                # The answer is the text that follows the first '?' (the end of the
                # question title) inside the element with id 'main'.
                a = a_source.find(id='main').get_text().split('?')[1:][0]
            except (AttributeError, IndexError, requests.RequestException) as error:
                # AttributeError: no element with id 'main'; IndexError: no '?' in its text;
                # RequestException: network failure or timeout. Record it and carry on.
                errors.append([f'{error} for url: {a_url}'])
                continue
            records.append({'url': a_url, 'market': market, 'title': title, 'content': a})

# Build the frame once instead of growing it cell by cell.
df = pd.DataFrame(records, columns=['url', 'market', 'title', 'content'])
  



100%|██████████| 13/13 [52:03<00:00, 240.24s/it] 

Wall time: 52min 3s





In [182]:
# De-duplicate the scraped rows on the answer text (`content`).
df_no_duplicates = df.drop_duplicates(subset=['content'])

In [183]:
# Preview the first rows of the freshly scraped data.
df.head()

Unnamed: 0,url,market,title,content
0,https://support.n26.com/de-de/konto-und-person...,de-de,Kann ich in meinem Land ein N26 Konto eröffnen?,Wir bieten unsere Konten in folgenden Ländern ...
1,https://support.n26.com/de-de/konto-und-person...,de-de,"Kann ich ein Konto eröffnen, wenn ich außerhal...","Wenn du in Polen, Schweden, Dänemark, Norwegen..."
2,https://support.n26.com/de-de/konto-und-person...,de-de,Wie eröffne ich mein N26 Konto?,Du kannst ein N26 Konto in der App (auf deinem...
3,https://support.n26.com/de-de/konto-und-person...,de-de,Warum funktioniert meine Video-Verifizierung n...,Einige Tipps für eine erfolgreiche Video-Verif...
4,https://support.n26.com/de-de/konto-und-person...,de-de,Wie kann ich mich ausweisen?,"Als Bank müssen wir wissen, wer du bist. Desha..."


In [184]:
# Row counts of the scraped data, before and after de-duplicating on answer text.
print(f'df length: {len(df)}')
print(f'df length without duplicates: {len(df_no_duplicates)}')

df length: 1124
df length without duplicates: 580


In [185]:
# Summarise the answer pages that could not be extracted during the crawl.
print('Number of errors:',len(errors))
# Show one sample entry. The guard avoids an IndexError when the crawl recorded
# fewer than two errors (the second entry is shown whenever it exists).
if errors:
    print('Error example:',errors[min(1, len(errors) - 1)])

Number of errors: 5
Error example: ["'NoneType' object has no attribute 'get_text' for url: https://support.n26.com/en-eu/security/passwords-and-codes/why-didnt-i-receive-my-pairing-code-via-sms"]


# Finding closest matches (60%)

Finding questions that match from a group of questions is effectively a clustering problem. As it is a clustering problem, we'll need some way of turning the text into something that can be 'clustered'; in machine learning, clustering means finding the distances in vector space between vectors, and giving the same label to vectors within a given distance. 

The workflow will look like this:



1. <b> Get universal encoder from tensorflow </b> - The FAQs are in different languages. The Universal Sentence Encoder will help us to understand the context around words, even in different languages. 
2. <b>_Embed the question answers_</b> - In order to cluster our answers, we'll use the encoder to represent our answers in vector form. There will be 512 numbers in each vector, so each answer becomes a point in a 512-dimensional space.
3. <b>_Scale the embeddings_ </b>- Using sentences for text embedding often results in the embeddings becoming diluted: it's harder to tell the meaning of sentences apart from each other than it is for words, and this will reflect in the embedding. Scaling the embeddings will preserve the relationship between the embeddings mathematically, whilst making the differences more stark.
4. <b>_Cluster the embeddings_ </b>- scikit-learn has many clustering algorithms. OPTICS and DBScan are optimised for high dimensional data, and OPTICS more so according to the [docs](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html).
5. <b>_Attach the cluster labels to the FAQs_</b> - We'll put all this work into a dataframe to be exported.

We'll also look at how good the clustering is when there are no repeated questions, as this should force better cross-language matching.

### Get the encoder from Tensorflow Hub

Step 1. In order to match the question answers to each other, the universal sentence encoder provided by tensorflow was used to create vector representations of each sentence on the data set (i.e. to create sentence embeddings). We can get the universal sentence encoder from the tensorflow hub.

In [186]:
%%time

# for the purposes of the interview we strongly encourage you to use the universal sentence encoder
# The following code will setup everything you need and setup the encoder for you

# For full disclaimer this code has been taken from
# Semantic Similarity with TF-Hub Universal Encoder at tf hub

import tensorflow as tf

import tensorflow_hub as hub

# TF Hub handle of the sentence encoder used to embed the FAQ answers.
# NOTE(review): this handle looks like the English-trained Universal Sentence Encoder;
# TF Hub also publishes a multilingual variant — confirm which one suits cross-language matching.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print (f"module {module_url} loaded")
def embed(input):
    """Return the loaded encoder's output for `input`.

    Parameters:
        input: the text(s) to encode, passed straight through to `model`.
               Later cells pass a pandas Series of answer strings.

    Returns:
        Whatever `model(input)` returns; later cells treat it as one
        512-value embedding per input text.
    """
    return model(input)









module https://tfhub.dev/google/universal-sentence-encoder/4 loaded
Wall time: 28.6 s


The answers in the scraped dataframe will provide the basis for the embedding. Let's embed a few answers and have a look at what we get.

In [328]:
%%time

# Embed three randomly chosen answers and inspect the raw vectors.
# A fixed seed keeps the sampled rows, and therefore this cell's output, reproducible.
messages = df.sample(n=3, random_state=42).content.to_list()
message_tensors = embed(messages)

# Pair each answer with its embedding directly instead of re-indexing the list each pass.
for message, tensor in zip(messages, message_tensors):
    print(f'Start of message: {message.split(".")[0]} \n')
    print(f'Embedding size: {len(tensor)}')
    print(f'First 3 embeddings: {tensor[:3]}')
    print('Standard deviation:', np.std(tensor))
    print('\n'*2)

Start of message: 3D Secure est une technologie de sécurisation des paiements en ligne visant à limiter l’utilisation frauduleuse des cartes bancaires sur internet 

Embedding size: 512
First 3 embeddings: [-0.06545603 -0.06586096  0.008463  ]
Standard deviation: 0.044067346



Start of message: If you hold an US account with N26 Inc 

Embedding size: 512
First 3 embeddings: [-0.05529587 -0.03282806  0.05686469]
Standard deviation: 0.044184644



Start of message: If you suspect possible fraudulent activity on your account, please contact us as soon as possible 

Embedding size: 512
First 3 embeddings: [-0.06628229  0.00125737  0.0411117 ]
Standard deviation: 0.044156652



Wall time: 45 ms


As expected, the distances from answer to answer are really, really small. The standard deviation is comparable to the embedding values, implying the answers within the vector space aren't very well defined. This will drastically affect the quality of the clusters extracted, but we should still be able to get some reasonable results (maybe).

The following function will implement steps 2 to 5.

In [305]:
%%time

def match_qas(qas_across_markets_df, min_cluster_size=4, random_state=42):
    """
    Assign an FAQ_id to every question/answer row, so that rows falling into the
    same cluster of answer embeddings share an id across languages.

    Parameters:
       qas_across_markets_df (pd.DataFrame): one row per FAQ page, with at least
           the columns `market` (a '<language>-<country>' code such as 'en-de'),
           `title` and `content`.
       min_cluster_size (int): forwarded to OPTICS as `min_cluster_size`.
           Defaults to 4.
       random_state (int): seed for the row shuffle, so repeated runs see the
           rows in the same order. Defaults to 42.

    Returns:
       matched_data (pd.DataFrame): columns FAQ_id, locale, market, title, content,
           in shuffled row order and carrying the input's index labels.
           `locale` is the part of the input market code before the '-' and
           `market` the part after it. FAQ_id is the OPTICS label plus one, so
           rows that OPTICS marks as noise (label -1) get FAQ_id 0.
    """

    # The data has structure (questions with the same language will be grouped together, due to the structure of the site).
    # This structure may interfere with our clusters, so we shuffle the data to fix that.
    shuffled_df = qas_across_markets_df.sample(frac=1, random_state=random_state)

    # Step 2: embed the answers.
    embeddings = embed(shuffled_df.content.to_list())

    # Step 3: scale the embeddings.
    # MaxAbsScaler is used for no particular reason other than it's slightly simpler than the alternatives,
    # and plays nicer with OPTICS.
    scaler = MaxAbsScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    # Step 4: cluster the scaled embeddings.
    # OPTICS is used to cluster the data for reasons mentioned earlier.
    clusters = OPTICS(n_jobs=-1, min_cluster_size=min_cluster_size, metric='cosine').fit(scaled_embeddings)

    # Step 5: attach the cluster labels to the FAQs.
    # Every column is passed positionally (as an array) and the shuffled index is set
    # explicitly, so the labels line up with their rows without relying on pandas'
    # implicit index alignment.
    locale_market = shuffled_df.market.str.split(pat='-', expand=True)

    matched_data = pd.DataFrame(
        {
            'FAQ_id': clusters.labels_ + 1,
            'locale': locale_market[0].to_numpy(),
            'market': locale_market[1].to_numpy(),
            'title': shuffled_df.title.to_numpy(),
            'content': shuffled_df.content.to_numpy(),
        },
        index=shuffled_df.index,
    )
    return matched_data
    

# Cluster both data sources twice: once as-is, and once with repeated answers dropped.
# Dropping duplicates should reduce the amount of clusters and better match questions
# across languages, as some redundancy is removed.
test_results, final_results, test_results_2, final_results_2 = [
    match_qas(frame)
    for frame in (extracted_n26, df, extracted_n26_no_duplicates, df_no_duplicates)
]

  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Wall time: 7min 30s


In [306]:
# Count the distinct FAQ ids produced for each data source, with and without duplicate answers.
for label, results in (
    ('No of faq ids for given data:', test_results),
    ('No of faq ids for given data without duplicates:', test_results_2),
):
    print(label, len(results.FAQ_id.unique()))

print('\n')

for label, results in (
    ('The amount of FAQ IDs found for scraped data:', final_results),
    ('The amount of FAQ IDs found for scraped data without duplicates:', final_results_2),
):
    print(label, len(results.FAQ_id.unique()))

No of faq ids for given data: 117
No of faq ids for given data without duplicates: 13


The amount of FAQ IDs found for scraped data: 101
The amount of FAQ IDs found for scraped data without duplicates: 7


In [324]:
# clusters 1 to 4 showed some question matching
# Display every row assigned FAQ_id 3 as an example of one such cluster.
final_results_2.loc[final_results_2.FAQ_id == 3]

Unnamed: 0,FAQ_id,locale,market,title,content
223,3,en,at,How to order my card with express delivery?,Express delivery is a good option if you need ...
817,3,fr,fr,Quand est-ce que je recevrai ma carte ?,Voici les délais de livraison normaux une fois...
222,3,en,at,When will my card arrive?,The estimated delivery timeline for card deliv...
818,3,fr,fr,Comment commander ma carte en livraison express ?,Disponibilité limitée.La livraison express est...
819,3,fr,fr,Comment commander une nouvelle carte ?,ℹ️ La livraison de votre carte peut prendre ju...


In this notebook we attempted to match questions across languages by clustering them with a scikit-learn implementation of OPTICS, a density-based clustering algorithm tuned to high-dimensional data such as ours. We met VERY limited success, due to the curse of dimensionality and the narrowness of the topic at hand.

There should have been a cluster for every question from the 6 FAQ links scraped, but instead only 7 were found.

Some matching between French and English was made, probably because they're the two closest languages to each other. The clustering seemed mainly to focus on the language/words used and not the context of the words, resulting in the massive bucket of left-over content in the last bucket.

Some tuning of OPTICS clustering would improve this a lot, but after a few different parameters were tried and tested the results didn't get any better or more defined than they are now. Perhaps another approach could produce a more informative clustering.

## Saving the data frames

We'll save the scraped data that was used for machine learning, and the two results of machine learning, for comparison.

In [325]:
# Output CSV file names used by the save cell.
extracted_data_fname = 'extracted_data_n26.csv'
final_results_fname = 'n26_with_faq_.csv'
test_results_fname = 'result_with_given_data.csv'

In [326]:
# Write the scraped data and the two de-duplicated clustering results to CSV.
for frame, fname in (
    (df, extracted_data_fname),
    (final_results_2, final_results_fname),
    (test_results_2, test_results_fname),
):
    frame.to_csv(fname)

In [327]:
# Dump file in a csv called n26_with_faq_