# Importing the required packages


In [1]:
import requests
from selenium import webdriver
import bs4
import re
from tqdm import tqdm
import gc
import pandas as pd

In [2]:
# web driver for selenium
driver = webdriver.Chrome(executable_path=r'C:\Webdriver\chromedriver.exe')

# get the web page and store it as a soup element
driver.get('https://www.hdfcbank.com/personal/pay/cards/credit-cards')
soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
driver.close()

Required info:

[Card Name, Card fee, Reward points/percentage per 100 spent, Lounge access, Milestone benefit, Card fee reversal condition if any]

In [3]:
# Here we are extracting the card names and the links to the feature pages
card_names = []
links = []
for i in soup.findAll("div",{"class":"cardparent"}):
    
    # From the parent class we need to extract the title/card name 
    s = i.findAll('h2',{'class':'cardTitle'})[0]
    card_names.append(s.get_text('^^',strip=True).split('^^')[0])
    
    # Now we will find the links to the know more section and store them
    for link in i.findAll("a",{'href': re.compile(r'/personal'),'target':'_self'}):
        a = link.get('href')
        if a!=None:
            links.append(a)
            break

# Printing out some of the card names and their linnk
for name, link in zip(card_names[:5], links[:5]):
    print(name,":",link)

MoneyBack+ Credit Card : /personal/pay/cards/credit-cards/moneyback-plus
Millennia Credit Card : /personal/pay/cards/millennia-cards/millennia-cc-new
IndianOil HDFC Bank Credit Card : /personal/pay/cards/credit-cards/indianoil-hdfc-bank-credit-card
Regalia Gold Credit Card : /personal/pay/cards/credit-cards/regalia-gold-credit-card
IRCTC HDFC Bank Credit Card : /personal/pay/cards/credit-cards/irctc-credit-card


In [5]:
# From the above obtained links, creating and storing the complete links to the web page
c_links = [f"https://www.hdfcbank.com{link}" for link in links]
print(*c_links[0:5],sep='\n')

https://www.hdfcbank.com/personal/pay/cards/credit-cards/moneyback-plus
https://www.hdfcbank.com/personal/pay/cards/millennia-cards/millennia-cc-new
https://www.hdfcbank.com/personal/pay/cards/credit-cards/indianoil-hdfc-bank-credit-card
https://www.hdfcbank.com/personal/pay/cards/credit-cards/regalia-gold-credit-card
https://www.hdfcbank.com/personal/pay/cards/credit-cards/irctc-credit-card


In [6]:
# Creating a function that retrieves the content of a webpage given the link
def get_page(link):
    """
    This function return the bs4 format for a given link.
    """
    driver = webdriver.Chrome(executable_path=r'C:\Webdriver\chromedriver.exe')
    driver.get(link)
    soup_res = bs4.BeautifulSoup(driver.page_source, 'lxml')
    driver.close()
    
    return soup_res

## Extracting all the other information for each of the credit cards 

In [None]:
# empty lists to store the extracted information
all_features = []
all_points = []
lounge_access = []
milestone_benefit = []
rewards = []
opt_s = []

# Iterating over all the links
for link in c_links:
    
    # get the webpage for the key-features
    key_features = get_page(link)
    
    features = []      # to store the points available in the know more page of each credit card
    fpoints = []       # To store the sub-points corresponding to each point

    # iterate throung each point which are present in the content-body
    for i in key_features.findAll("div",{"class":"content-body"}):
        
        # get the names for the points/features
        s = i.findAll('h4',{'class':'row-name'})  # This return a list of elements
        
        # get the corresponding list of sub-points or paragraph
        points = i.findAll(['li','p'])
        
        tmp_points = []     # This is to store all the subpoints
        for point in points:
            tmp_points.append(point.get_text(strip=True))
        fpoints.append(tmp_points)
        del tmp_points; gc.collect()

        try:
            # extract the features and store it
            features.append(s[0].get_text('^^',strip=True))

        except Exception as e:
            pass
    
    
    # Remove special characters from the feature names
    features = [re.sub(r'[^a-zA-Z0-9 ]+',"",f).lower() for f in features]
    
    # create a dictionary for the different features and the feature points
    data = {f:x for f,x in zip(features, fpoints)}
    
    # Filter the features required to us
    filtered_featues = list(filter(lambda x: re.search('feature\w*|mile\w*|reward\w*|lounge\w*|benefit\w+',x), features))
    
    # Lounge access:
    lounge_key = list(filter(lambda x:True if re.search(r'lounge',x) else False,filtered_featues))
    if len(lounge_key)!=0:
        lounge_access.append(data[lounge_key[0]])
    else:
        lounge_access.append(None)
    
    # Milestone benefit:
    mile_key = list(filter(lambda x:True if re.search(r'mile\w*',x) else False, filtered_featues))
    if len(mile_key)!=0:
        milestone_benefit.append(data[mile_key[0]])
    else:
        milestone_benefit.append(None)
    
    # rewards
    rew_key = list(filter(lambda x:True if re.search(r'reward\w*|key\w*|feature\w*',x) else False,filtered_featues))
    if len(rew_key)==1:
        rewards.append(data[rew_key[0]])
    elif len(rew_key)>1:
        tmp = []
        for rk in rew_key:
            tmp.extend(data[rk])
        rewards.append(tmp)
    else:
        rewards.append(None)
    
    # get the webpage for fees and charges
    fee_ = get_page(link+'/fees-and-charges')
    s = []
    for i in fee_.findAll('div',{'class':'inner-content'}):
        s_ = i.findAll(['li','p'])
        s.extend([y.get_text('^^',strip=True) for y in s_])
    
    # renewal offers also contain details of card fee reversal, so we are going to append that information to fee
    ren_off = list(filter(lambda x:True if re.search(r'renew\w*',x) else False,filtered_featues))
    if len(ren_off)!=0:
        s.extend(ren_off[0])
    opt_s.append(s)

In [66]:
# converting the obtained results to a dictionary and then storing it in a pandas dataframe
xyz = {"card_name":card_names,
    "lounge_access":lounge_access,
        "milestone_benefit":milestone_benefit,
        "rewards":rewards,
       "fees":opt_s}
df = pd.DataFrame.from_dict(xyz)
df = df.drop_duplicates(subset = ['card_name'])
# df.to_csv('all_card_details.csv',index=False)
df.head()

Unnamed: 0,card_name,lounge_access,milestone_benefit,rewards,fees
0,MoneyBack+ Credit Card,,,"[10X CashPoints on Amazon, BigBasket , Flipkar...",[Joining/Renewal Membership Fee – ₹500/- + App...
1,Millennia Credit Card,[8 Complimentary Domestic Lounge Access per ca...,,"[5% Cashback on Amazon, BookMyShow, Cult.fit, ...","[Joining/Renewal Membership Fee – ₹1,000/- + A..."
2,IndianOil HDFC Bank Credit Card,,,"[Fuel Points, , , Earn up to 50 Litres of Free...",[Joining/Renewal Membership Fee – Rs. 500/- pl...
3,Regalia Gold Credit Card,[A. Complimentary Airport Lounge Access within...,"[Get ₹ 1,500 worth vouchers from Marriott, Myn...",[Complimentary Club Vistara Silver Tier and MM...,[Joining/ Renewal Membership Fee of Rs. 2500 +...
4,IRCTC HDFC Bank Credit Card,[8 complimentary access to select IRCTC Execut...,[Gift voucher worth INR 500 on quarterly spend...,"[HDFC Bank Reward Points, 5 Reward Point for e...",[Joining/Renewal Membership Fee – Rs. 500/- + ...


## Extracting information on Card Fees and Card Fee reversal condition

In [63]:
card_fees = []
card_rev_cond = []
for s in df.fees.values:
    s = set(s)
    l = []
    for i in s:
        l.extend(i.split('^^'))
    
    # filter statements based on certain keywords
    reversal_cond = list(filter(lambda x:True if re.search(r'waiv\w*|membership\s*\w*\s*free|spend\w*',x.lower()) else False, l))
    reversal_cond = list(map(lambda x:re.sub(r'(\xa0)*|(\u200b)*',"",x).strip(), reversal_cond))
    
    if len(reversal_cond)!=0:
        card_rev_cond.append(' '.join(reversal_cond))
    else:
        card_rev_cond.append(None)
    
    # Cascade conditions to filter out what we need exactly
    
    # 1. Find statements having digits
    l = list(filter(lambda x:True if re.search(r'\d+',x.lower()) else False, l))
    
    # Find statements that do not contain the keyword waiv\w+ meaning waiver/waived etc
    l = list(filter(lambda x:False if re.search(r'waiv\w+',x.lower()) else True, l))
    
    # Finally find the staments containing certain patterns
    pattern = r'nominal|join\w*|minimum|renewal membership|per annum|monthly membership fee|annual membership fee'
    fees = list(filter(lambda x:True if re.search(pattern,x.lower()) else False, l))
    
    # If we have multiple statements we will join them together
    if len(fees)!=0:
        card_fees.append(' '.join(fees))
    else:
        card_fees.append(None)
    


In [67]:
# # Add the extracted details to the dataframe
df['card_fees'] = card_fees
df['card_rev_cond'] = card_rev_cond
df.head()

Unnamed: 0,card_name,lounge_access,milestone_benefit,rewards,fees
0,MoneyBack+ Credit Card,,,"[10X CashPoints on Amazon, BigBasket , Flipkar...",[Joining/Renewal Membership Fee – ₹500/- + App...
1,Millennia Credit Card,[8 Complimentary Domestic Lounge Access per ca...,,"[5% Cashback on Amazon, BookMyShow, Cult.fit, ...","[Joining/Renewal Membership Fee – ₹1,000/- + A..."
2,IndianOil HDFC Bank Credit Card,,,"[Fuel Points, , , Earn up to 50 Litres of Free...",[Joining/Renewal Membership Fee – Rs. 500/- pl...
3,Regalia Gold Credit Card,[A. Complimentary Airport Lounge Access within...,"[Get ₹ 1,500 worth vouchers from Marriott, Myn...",[Complimentary Club Vistara Silver Tier and MM...,[Joining/ Renewal Membership Fee of Rs. 2500 +...
4,IRCTC HDFC Bank Credit Card,[8 complimentary access to select IRCTC Execut...,[Gift voucher worth INR 500 on quarterly spend...,"[HDFC Bank Reward Points, 5 Reward Point for e...",[Joining/Renewal Membership Fee – Rs. 500/- + ...


Required info:
[Card Name, Card fee, Reward points/percentage per 100 spent, Lounge access, Milestone benefit, Card fee reversal condition if any]

## Further extraction/filtering of information on rewards 

In [62]:
fil_rew_val = []
for rew in  df.rewards.values:
#     print('raw : ',rew)
    if rew != None:
        
        # Filter out the statements having digits in them
        l = list(filter(lambda x:True if re.search(r'\d+',x.lower()) else False, rew))
        l = set(l)
        
        # Filter our statements having specific keywords
        l = list(filter(lambda x:False if re.search(r'(rewardpoint|rp) =',x.lower()) else True, rew))
        l = list(filter(lambda x:True if re.search(r'rewards|point|reward point\w*|cashpoints',x.lower()) else False, l))
        
        # removing extra unnecessary characters
        l = list(map(lambda x:re.sub(r'(\xa0)*|(\u200b)*',"",x).strip(), l))
        
        # Store the filtered list
        fil_rew_val.append(l)
        print('='*100)
    else:
        fil_rew_val.append(None)

df.rewards = fil_rew_val
df.head()

raw :  {'For redemption against statement balance, Cardholder must have minimum CashPoints equivalent to ₹500', 'Zero lost Card liability :In the unfortunate event of losing your HDFC Bank Moneyback+ Credit Card, report it immediately to our 24-hour call centre. On reporting the loss immediately, you have zero liability on any fraudulent transactions made on your Credit Card.', 'The CashPoints earned on HDFC Bank MoneyBack+ Credit Card can be redeemed against the statement balance at the rate of 1 CashPoint = ₹0.25, and can be done via Net Banking login, Phone Banking, or physical redemption form', 'With effect from 1st January 2023, Reward points redemption for flights & hotels bookings are capped per calendar month at 50,000.', 'With effect from 1st February 2023, Reward points redemption for CashBack redemption are capped per calendar month to 50,000 rewards points.', 'Upto 20% discount on partner restaurants via Swiggy Dineout', 'With effect from 1st January 2023, Rent payments and

Unnamed: 0,card_name,lounge_access,milestone_benefit,rewards,fees,card_fees,card_rev_cond
0,MoneyBack+ Credit Card,,,"[10X CashPoints on Amazon, BigBasket , Flipkar...",[Joining/Renewal Membership Fee – ₹500/- + App...,Joining/Renewal Membership Fee – ₹500/- + Appl...,"Spend ₹50,000 or more in a year, before your C..."
1,Millennia Credit Card,[8 Complimentary Domestic Lounge Access per ca...,,"[With effect from 1st January 2023, Rent payme...","[Joining/Renewal Membership Fee – ₹1,000/- + A...","Joining/Renewal Membership Fee – ₹1,000/- + Ap...","Spend ₹1,00,000 or more in a year, before your..."
2,IndianOil HDFC Bank Credit Card,,,"[Fuel Points, Earn 5% of your spends as Fuel P...",[Joining/Renewal Membership Fee – Rs. 500/- pl...,Joining/Renewal Membership Fee – Rs. 500/- plu...,
3,Regalia Gold Credit Card,[A. Complimentary Airport Lounge Access within...,"[Get ₹ 1,500 worth vouchers from Marriott, Myn...",[5X Reward Points on spends at Marks & Spencer...,[Joining/ Renewal Membership Fee of Rs. 2500 +...,Joining/ Renewal Membership Fee of Rs. 2500 + ...,
4,IRCTC HDFC Bank Credit Card,[8 complimentary access to select IRCTC Execut...,[Gift voucher worth INR 500 on quarterly spend...,"[HDFC Bank Reward Points, 5 Reward Point for e...",[Joining/Renewal Membership Fee – Rs. 500/- + ...,Joining/Renewal Membership Fee – Rs. 500/- + A...,"Spend Rs. 1,50,000 or more in a year, before y..."
5,Tata Neu Plus HDFC Bank Credit Card,[Domestic Lounge Access4 Complimentary Domesti...,,[1. Rental and Government related transactions...,[Joining/Renewal Membership Fee – ₹499/- + App...,Joining/Renewal Membership Fee – ₹499/- + Appl...,"Spend ₹1,00,000 or more in a year, before your..."
6,Tata Neu Infinity HDFC Bank Credit Card,[A. Domestic Lounge Access – VISA / RuPay Card...,,[1. Rental and Government related transactions...,"[Joining/Renewal Membership Fee – ₹1,499/- + A...","Joining/Renewal Membership Fee – ₹1,499/- + Ap...","Spend ₹3,00,000 or more in a year, before your..."
7,HDFC Bank UPI RuPay Credit Card,,,"[3% Cashpoints on Groceries, SuperMarket & Din...",[Joining/Renewal Membership Fee – ₹250/- + App...,Joining/Renewal Membership Fee – ₹250/- + Appl...,"Spend ₹25,000 or more in an annual year, befor..."
8,INFINIA Metal Edition,[Enjoy unlimited complimentary lounge access a...,,"[5 Reward Points for every ₹ 150 spent, Reward...","[INFINIA Metal Edition, Joining/Renewal Member...","Joining/Renewal Membership Fee – Rs. 12,500 + ...",Spend 8 lakhs or more in the preceding 12 mont...
9,Diners Club Black Credit Card,[Unlimited Airport lounge access to 1000+ Loun...,[HDFC Bank Diners Club Black Credit Card holde...,"[Reward Point, 5 Reward Points for every Rs 15...","[Joining/Renewal Membership Fee – Rs. 10,000/-...","Joining/Renewal Membership Fee – Rs. 10,000/- ...",Spend Rs. 5 Lakhs in 12 Months and get Renewal...


In [None]:
# Drop any duplicate rows if any
df = df.drop_duplicates(subset=['card_name'])
df.to_csv('all_card_details.csv',index=False)
df.head()

In [62]:
df = df.drop_duplicates(subset=['card_name'])
df.head()

Unnamed: 0,card_name,lounge_access,milestone_benefit,rewards,fees
0,MoneyBack+ Credit Card,,,"[10X CashPoints on Amazon, BigBasket , Flipkar...",[Joining/Renewal Membership Fee – ₹500/- + App...
1,Millennia Credit Card,[8 Complimentary Domestic Lounge Access per ca...,,"[5% Cashback on Amazon, BookMyShow, Cult.fit, ...","[Joining/Renewal Membership Fee – ₹1,000/- + A..."
2,IndianOil HDFC Bank Credit Card,,,"[Fuel Points, , , Earn up to 50 Litres of Free...",[Joining/Renewal Membership Fee – Rs. 500/- pl...
3,Regalia Gold Credit Card,[A. Complimentary Airport Lounge Access within...,"[Get ₹ 1,500 worth vouchers from Marriott, Myn...",[Complimentary Club Vistara Silver Tier and MM...,[Joining/ Renewal Membership Fee of Rs. 2500 +...
4,IRCTC HDFC Bank Credit Card,[8 complimentary access to select IRCTC Execut...,[Gift voucher worth INR 500 on quarterly spend...,"[HDFC Bank Reward Points, 5 Reward Point for e...",[Joining/Renewal Membership Fee – Rs. 500/- + ...
