# **WORD SIMILARITY**

In [125]:
import numpy as np
import pandas as pd
from itertools import combinations

The problem is to match the user's free-form input against a pre-determined list of banks. For example, user input 'bawag bank' should be matched to 'BAWAG Group AG'.

In [126]:
# List of banks to compare
banks =   ['Sberbank Europe AG',
          'BAWAG Group AG',
          'Raiffeisenbankengruppe OÖ Verbund eGen',
          'Raiffeisen Bank International AG',
          'Volksbanken Verbund',
          'Erste Group Bank AG',
          'KBC Groep',
          'Investeringsmaatschappij Argenta',
          'Belfius Bank',
          'AXA Bank Belgium',
          'The Bank of New York Mellon SA/NV',
          'First Investment Bank AD',
          'RCB Bank Ltd',
          'Bank of Cyprus Holdings Public Limited Company',
          'Hellenic Bank Public Company Limited',
          'DekaBank Deutsche Girozentrale',
          'Erwerbsgesellschaft der S-Finanzgruppe mbH & Co. KG',
          'UBS Europe SE',
          'DEUTSCHE APOTHEKER- UND ÄRZTEBANK EG',
          'Volkswagen Bank Gesellschaft mit beschränkter Haftung',
          'Münchener Hypothekenbank eG',
          'DZ BANK AG Deutsche Zentral-Genossenschaftsbank, Frankfurt am Main',
          'HASPA Finanzholding',
          'State Street Europe Holdings Germany S.a.r.l. & Co. KG',
          'J.P. Morgan AG',
          'DEUTSCHE BANK AKTIENGESELLSCHAFT',
          'COMMERZBANK Aktiengesellschaft',
          'Landesbank Baden-Württemberg',
          'Landesbank Hessen-Thüringen Girozentrale',
          'Norddeutsche Landesbank - Girozentrale -',
          'Deutsche Pfandbriefbank AG',
          'Aareal Bank AG',
          'Hamburg Commercial Bank AG',
          'Bayerische Landesbank',
          'Jyske Bank A/S',
          'Sydbank A/S',
          'Nykredit Realkredit A/S',
          'Danske Bank A/S',
          'Luminor Holding AS',
          'Abanca Corporacion Bancaria S.A.',
          'Banco Santander S.A.',
          'Ibercaja Banco S.A.',
          'Kutxabank S.A',
          'Unicaja Banco S.A.',
          'CaixaBank S.A.',
          'Banco de Crédito Social Cooperativo',
          'Banco Bilbao Vizcaya Argentaria S.A.',
          'Banco de Sabadell S.A.',
          'Bankinter S.A.',
          'Kuntarahoitus Oyj',
          'Nordea Bank Abp',
          'OP Osuuskunta',
          'SFIL',
          'RCI Banque',
          'Confédération Nationale du Crédit Mutuel',
          'La Banque Postale',
          'Bpifrance',
          "C.R.H. - Caisse de refinancement de l'habitat",
          'HSBC Continental Europe',
          'Groupe BPCE',
          'Groupe Crédit Agricole',
          'Société générale',
          'BNP Paribas',
          'ALPHA SERVICES AND HOLDINGS S.A.',
          'National Bank of Greece S.A.',
          'Eurobank Ergasias Services and Holdings S.A.',
          'Piraeus Financial Holdings',
          'OTP-csoport',
          'Magyar Bankholding',
          'Barclays Bank Ireland plc',
          'Citibank Holdings Ireland Limited',
          'AIB Group plc',
          'Bank of Ireland Group plc',
          'Ulster Bank Ireland Designated Activity Company',
          'Bank of America Europe Designated Activity Company',
          'Íslandsbanki hf.',
          'Landsbankinn hf.',
          'Arion banki hf',
          'Intesa Sanpaolo S.p.A.',
          'Gruppo Bancario Finecobank  ',
          'UniCredit S.p.A.',
          'Gruppo Bancario Mediolanum  ',
          'Credito Emiliano Holding S.p.A.',
          'Banco BPM SpA',
          'Banca Popolare di Sondrio, Società Cooperativa per Azioni',
          'Banca Monte dei Paschi di Siena S.p.A.',
          'CASSA CENTRALE BANCA',
          'ICCREA BANCA S.P.A.',
          'Mediobanca - Banca di Credito Finanziario S.p.A.',
          'Akcine bendrove Šiauliu bankas',
          'Precision Capital S.A.',
          'RBC Investor Services Bank S.A.',
          'J.P. Morgan Bank Luxembourg S.A.',
          'Banque Internationale à Luxembourg',
          'Banque et Caisse d´Epargne de l´Etat, Luxembourg',
          'Akciju sabiedriba "Citadele banka"',
          'MDB Group Limited',
          'Bank of Valletta Plc',
          'HSBC Bank Malta p.l.c.',
          'BNG Bank N.V.',
          'ING Groep N.V.',
          'LP Group B.V.',
          'de Volksbank N.V.',
          'ABN AMRO Bank N.V.',
          'Coöperatieve Rabobank U.A.',
          'Nederlandse Waterschapsbank N.V.',
          'Bank Polska Kasa Opieki S.A.',
          'Powszechna Kasa Oszczednosci Bank Polski S.A.',
          'LSF Nani Investments S.à r.l.',
          'Banco Comercial Português SA',
          'Caixa Geral de Depósitos SA',
          'Banca Transilvania',
          'Länförsäkringar Bank AB (publ)',
          'Kommuninvest - group',
          'Skandinaviska Enskilda Banken - group',
          'SBAB Bank AB - group',
          'Swedbank - group',
          'Svenska Handelsbanken - group',
          'Biser Topco S.à r.l.',
          'Nova Ljubljanska Banka d.d. Ljubljana']

###  User free-form Inputs

In [127]:
# Examples of search strings
s1 = 'Bawag bank' # other options: 'Bawag bank', 'Erste', 'Raiffaisen bank'


### Searching without any text normalization or input normalization

In [128]:
# A naive search method which you need to improve
from difflib import SequenceMatcher
res = []
for token in banks:
  res.append([s1, token, SequenceMatcher(None, s1, token).ratio()])

df2 = pd.DataFrame(res, columns=['Bank 1', 'Bank 2', 'Score'])
# The outcome is not great, for this search query 'BAWAG Group AG' should have highest similarity
df2.sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
8,Bawag bank,Belfius Bank,0.454545
12,Bawag bank,RCB Bank Ltd,0.454545
33,Bawag bank,Bayerische Landesbank,0.451613
42,Bawag bank,Kutxabank S.A,0.434783
99,Bawag bank,BNG Bank N.V.,0.434783


### Similarity Score is very low

In [129]:
 #The desired combination has a low score
idx = df2['Bank 2'].isin(['BAWAG Group AG'])

df2[idx].sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
1,Bawag bank,BAWAG Group AG,0.166667


### Text Preprocessing and Normalization

In [130]:
import re

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

def preprocess_text_list(text_list):
    preprocessed_list = []
    for text in text_list:
        text = preprocess_text(text)
        if 'bank' not in text:
            text += ' bank'
        preprocessed_list.append(text)
    return preprocessed_list



### Bank Names after Preprocessing

In [131]:
preprocessed_banks = preprocess_text_list(banks)
preprocessed_banks

['sberbank europe ag',
 'bawag group ag bank',
 'raiffeisenbankengruppe oö verbund egen',
 'raiffeisen bank international ag',
 'volksbanken verbund',
 'erste group bank ag',
 'kbc groep bank',
 'investeringsmaatschappij argenta bank',
 'belfius bank',
 'axa bank belgium',
 'the bank of new york mellon sanv',
 'first investment bank ad',
 'rcb bank ltd',
 'bank of cyprus holdings public limited company',
 'hellenic bank public company limited',
 'dekabank deutsche girozentrale',
 'erwerbsgesellschaft der sfinanzgruppe mbh  co kg bank',
 'ubs europe se bank',
 'deutsche apotheker und ärztebank eg',
 'volkswagen bank gesellschaft mit beschränkter haftung',
 'münchener hypothekenbank eg',
 'dz bank ag deutsche zentralgenossenschaftsbank frankfurt am main',
 'haspa finanzholding bank',
 'state street europe holdings germany sarl  co kg bank',
 'jp morgan ag bank',
 'deutsche bank aktiengesellschaft',
 'commerzbank aktiengesellschaft',
 'landesbank badenwürttemberg',
 'landesbank hessenthür

Word "bank" is appended in those names where it is not present to provide uniform weight to every name

### Similarity after Text Normalization

In [132]:
s1 = 'Bawag bank'
ip1 = preprocess_text(s1)
res = []
for idx, token in enumerate(preprocessed_banks):
  og_name = banks[idx]
  res.append([s1, og_name, SequenceMatcher(None, ip1, token).ratio()])

df2 = pd.DataFrame(res, columns=['Bank 1', 'Bank 2', 'Score'])
df2.sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
1,Bawag bank,BAWAG Group AG,0.689655
99,Bawag bank,BNG Bank N.V.,0.666667
24,Bawag bank,J.P. Morgan AG,0.592593
56,Bawag bank,Bpifrance,0.583333
6,Bawag bank,KBC Groep,0.583333


In [133]:
s2 = 'Erste'
ip2 = preprocess_text(s2)
res = []
for idx, token in enumerate(preprocessed_banks):
  og_name = banks[idx]
  res.append([s2, og_name, SequenceMatcher(None, ip2, token).ratio()])

df2 = pd.DataFrame(res, columns=['Bank 1', 'Bank 2', 'Score'])
df2.sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
5,Erste,Erste Group Bank AG,0.416667
48,Erste,Bankinter S.A.,0.352941
17,Erste,UBS Europe SE,0.347826
33,Erste,Bayerische Landesbank,0.307692
55,Erste,La Banque Postale,0.296296


In [134]:
s3 = 'Raiffaisen bank'
ip3 = preprocess_text(s3)
res = []
for idx, token in enumerate(preprocessed_banks):
  og_name = banks[idx]
  res.append([s3, og_name, SequenceMatcher(None, ip3, token).ratio()])

df2 = pd.DataFrame(res, columns=['Bank 1', 'Bank 2', 'Score'])
df2.sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
3,Raiffaisen bank,Raiffeisen Bank International AG,0.595745
77,Raiffaisen bank,Arion banki hf,0.551724
37,Raiffaisen bank,Danske Bank A/S,0.551724
56,Raiffaisen bank,Bpifrance,0.551724
53,Raiffaisen bank,RCI Banque,0.533333


#### Similarity is Significantly Improved after Text Normalization and Preprocessing