In [3]:
!pip install spacy



In [4]:
!python -m spacy download en_core_web_md  # Or en_core_web_lg for a larger model

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy
import pandas as pd

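Similarity scores are based on the model's understanding of the text's semantic meaning, so the effectiveness can vary based on the model used (en_core_web_md, en_core_web_lg, etc.) and the specific data. In particular, words for which the model has no vector (typically rare proper names such as 'bawag') contribute nothing to the comparison, so two names that differ only in such words (e.g. 'Bawag bank' and 'Belfius Bank') can receive a perfect score.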

In [6]:
# Name of the spaCy pipeline to load. Kept in one place so that switching to a
# different model (e.g. "en_core_web_lg", which must be downloaded first) is a
# single edit.
MODEL_NAME = "en_core_web_md"

# Load the medium English model
nlp = spacy.load(MODEL_NAME)

A popular NLP library for Python is spaCy, which provides powerful tools for text processing and similarity comparison.

1. Normalize the text by converting it to lowercase, removing punctuation and possibly using lemmatization to reduce words to their base or root form.
2. Convert bank names and queries into vectors. spaCy can convert text into numerical vectors that represent their semantic meaning.
3. Use spaCy's built-in similarity comparison to find the most similar bank names to the user's query.


The problem is to match the user's free-form input against a predetermined list of banks. For example, the user input 'bawag bank' should be matched to 'BAWAG Group AG'.

In [7]:
# Reference list of bank names that a search query is compared against.
# Names are kept in their official spelling (mixed case, diacritics, legal-form
# suffixes); lowercasing happens later, at comparison time. Entries carry no
# leading or trailing whitespace, so they tokenize and display cleanly.
banks = [
    'Sberbank Europe AG',
    'BAWAG Group AG',
    'Raiffeisenbankengruppe OÖ Verbund eGen',
    'Raiffeisen Bank International AG',
    'Volksbanken Verbund',
    'Erste Group Bank AG',
    'KBC Groep',
    'Investeringsmaatschappij Argenta',
    'Belfius Bank',
    'AXA Bank Belgium',
    'The Bank of New York Mellon SA/NV',
    'First Investment Bank AD',
    'RCB Bank Ltd',
    'Bank of Cyprus Holdings Public Limited Company',
    'Hellenic Bank Public Company Limited',
    'DekaBank Deutsche Girozentrale',
    'Erwerbsgesellschaft der S-Finanzgruppe mbH & Co. KG',
    'UBS Europe SE',
    'DEUTSCHE APOTHEKER- UND ÄRZTEBANK EG',
    'Volkswagen Bank Gesellschaft mit beschränkter Haftung',
    'Münchener Hypothekenbank eG',
    'DZ BANK AG Deutsche Zentral-Genossenschaftsbank, Frankfurt am Main',
    'HASPA Finanzholding',
    'State Street Europe Holdings Germany S.a.r.l. & Co. KG',
    'J.P. Morgan AG',
    'DEUTSCHE BANK AKTIENGESELLSCHAFT',
    'COMMERZBANK Aktiengesellschaft',
    'Landesbank Baden-Württemberg',
    'Landesbank Hessen-Thüringen Girozentrale',
    'Norddeutsche Landesbank - Girozentrale -',
    'Deutsche Pfandbriefbank AG',
    'Aareal Bank AG',
    'Hamburg Commercial Bank AG',
    'Bayerische Landesbank',
    'Jyske Bank A/S',
    'Sydbank A/S',
    'Nykredit Realkredit A/S',
    'Danske Bank A/S',
    'Luminor Holding AS',
    'Abanca Corporacion Bancaria S.A.',
    'Banco Santander S.A.',
    'Ibercaja Banco S.A.',
    'Kutxabank S.A',
    'Unicaja Banco S.A.',
    'CaixaBank S.A.',
    'Banco de Crédito Social Cooperativo',
    'Banco Bilbao Vizcaya Argentaria S.A.',
    'Banco de Sabadell S.A.',
    'Bankinter S.A.',
    'Kuntarahoitus Oyj',
    'Nordea Bank Abp',
    'OP Osuuskunta',
    'SFIL',
    'RCI Banque',
    'Confédération Nationale du Crédit Mutuel',
    'La Banque Postale',
    'Bpifrance',
    "C.R.H. - Caisse de refinancement de l'habitat",
    'HSBC Continental Europe',
    'Groupe BPCE',
    'Groupe Crédit Agricole',
    'Société générale',
    'BNP Paribas',
    'ALPHA SERVICES AND HOLDINGS S.A.',
    'National Bank of Greece S.A.',
    'Eurobank Ergasias Services and Holdings S.A.',
    'Piraeus Financial Holdings',
    'OTP-csoport',
    'Magyar Bankholding',
    'Barclays Bank Ireland plc',
    'Citibank Holdings Ireland Limited',
    'AIB Group plc',
    'Bank of Ireland Group plc',
    'Ulster Bank Ireland Designated Activity Company',
    'Bank of America Europe Designated Activity Company',
    'Íslandsbanki hf.',
    'Landsbankinn hf.',
    'Arion banki hf',
    'Intesa Sanpaolo S.p.A.',
    'Gruppo Bancario Finecobank',
    'UniCredit S.p.A.',
    'Gruppo Bancario Mediolanum',
    'Credito Emiliano Holding S.p.A.',
    'Banco BPM SpA',
    'Banca Popolare di Sondrio, Società Cooperativa per Azioni',
    'Banca Monte dei Paschi di Siena S.p.A.',
    'CASSA CENTRALE BANCA',
    'ICCREA BANCA S.P.A.',
    'Mediobanca - Banca di Credito Finanziario S.p.A.',
    'Akcine bendrove Šiauliu bankas',
    'Precision Capital S.A.',
    'RBC Investor Services Bank S.A.',
    'J.P. Morgan Bank Luxembourg S.A.',
    'Banque Internationale à Luxembourg',
    'Banque et Caisse d´Epargne de l´Etat, Luxembourg',
    'Akciju sabiedriba "Citadele banka"',
    'MDB Group Limited',
    'Bank of Valletta Plc',
    'HSBC Bank Malta p.l.c.',
    'BNG Bank N.V.',
    'ING Groep N.V.',
    'LP Group B.V.',
    'de Volksbank N.V.',
    'ABN AMRO Bank N.V.',
    'Coöperatieve Rabobank U.A.',
    'Nederlandse Waterschapsbank N.V.',
    'Bank Polska Kasa Opieki S.A.',
    'Powszechna Kasa Oszczednosci Bank Polski S.A.',
    'LSF Nani Investments S.à r.l.',
    'Banco Comercial Português SA',
    'Caixa Geral de Depósitos SA',
    'Banca Transilvania',
    'Länförsäkringar Bank AB (publ)',
    'Kommuninvest - group',
    'Skandinaviska Enskilda Banken - group',
    'SBAB Bank AB - group',
    'Swedbank - group',
    'Svenska Handelsbanken - group',
    'Biser Topco S.à r.l.',
    'Nova Ljubljanska Banka d.d. Ljubljana',
]

In [8]:
# Free-form search query, written the way a user might type it
s1 = "Bawag bank"

# Convert the query to lowercase and parse it into a spaCy Doc

In [9]:
# Lowercase the search query and run it through the spaCy pipeline. Only the
# query is parsed here; despite the variable name, calling nlp() yields a parsed
# document for the whole query rather than a single token.
query_token = nlp(s1.lower())

# Convert each bank name to lowercase, parse it into a spaCy Doc, and calculate its similarity to the query

In [10]:
# Score one bank name against an already-parsed search query
def calculate_similarity(query, name):
    """Return the similarity between a parsed query and a bank name.

    The bank name is lowercased and parsed with the notebook-level ``nlp``
    pipeline, then compared to ``query`` via ``query.similarity(...)``.

    Args:
        query: Parsed search query; must provide a ``similarity`` method
            (the notebook passes the result of ``nlp(s1.lower())``).
        name: Bank name as a plain string, in any casing.

    Returns:
        Whatever ``query.similarity`` returns for the parsed bank name
        (a float score in the notebook's recorded run).

    NOTE(review): the recorded run shows a warning raised from the
    ``similarity`` call for at least one bank name -- presumably spaCy's
    empty-vector warning for names made up entirely of words without a
    vector; confirm.
    """
    bank_doc = nlp(name.lower())
    score = query.similarity(bank_doc)
    return score

# Calculate similarity scores and include the search query in the results

In [11]:
# One (bank name, similarity score, search query) tuple per bank, in list order
results = [
    (bank_name, calculate_similarity(query_token, bank_name), s1)
    for bank_name in banks
]

  return query.similarity(token_of_bank)


# Convert results to a DataFrame, sort by similarity score, and include the search query

In [12]:
# Tabulate the scored banks, then rank them from most to least similar
df_results = pd.DataFrame(
    data=results,
    columns=['Bank Name', 'Similarity Score', 'Search Query'],
)
df_sorted = df_results.sort_values('Similarity Score', ascending=False)

# Display the top matches

In [13]:
# Show the six best-scoring banks. Leaving the frame as the cell's last
# expression lets Jupyter render it as a formatted table instead of plain text.
df_sorted.head(6)

                           Bank Name  Similarity Score Search Query
8                       Belfius Bank          1.000000   Bawag bank
50                   Nordea Bank Abp          1.000000   Bawag bank
25  DEUTSCHE BANK AKTIENGESELLSCHAFT          0.904751   Bawag bank
12                      RCB Bank Ltd          0.857416   Bawag bank
9                   AXA Bank Belgium          0.814901   Bawag bank
99                     BNG Bank N.V.          0.785333   Bawag bank
