### Press Shift + Enter to execute each cell

### Your CSV must contain a column named `website`

In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [14]:
def add_https(url):
    """Return *url* with an ``https://`` scheme prepended when it has none.

    Surrounding whitespace is removed first, and the scheme check is
    case-insensitive, so values such as ``'HTTP://example.com'`` or
    ``' example.com '`` are handled instead of producing a malformed URL
    like ``'https://HTTP://example.com'``.

    Args:
        url: A website address as a string, with or without a scheme.

    Returns:
        The stripped URL, unchanged if it already starts with ``http://`` or
        ``https://`` (in any letter case), otherwise prefixed with
        ``https://``.
    """
    url = url.strip()
    # str.startswith accepts a tuple, so both schemes are checked in one call.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'https://' + url

In [5]:
def payment_methods_available(possible_values):
    """Return the payment providers whose marker appears in any given string.

    Each string is lower-cased and searched for a fixed set of substrings.
    Every provider is reported at most once, and the result always follows
    the order of the marker table below, so repeated runs give the same
    output.

    NOTE(review): matching is plain substring search, so the 'simpl' marker
    also matches words such as 'simple' or 'simply' -- confirm whether a
    stricter pattern is wanted.

    Args:
        possible_values: Iterable of strings scraped from a page (URLs,
            script paths, HTML comments). Non-string entries are ignored.

    Returns:
        A list of provider display names, without duplicates.
    """
    # (substring to look for, label to report)
    markers = (
        ('gokwik', 'GoKwik'),
        ('simpl', 'Simpl'),
        ('zecpe', 'Zecpe'),
        ('snapmint', 'Snapmint'),
        ('magic-rzp', 'Razorpay Magic'),
    )
    # Lower-case once up front instead of once per marker; skip anything that
    # is not a string so a stray None/NaN cannot raise AttributeError.
    haystacks = [
        value.lower() for value in possible_values if isinstance(value, str)
    ]
    return [
        label
        for needle, label in markers
        if any(needle in text for text in haystacks)
    ]

# Apply the functions to create new columns
#df['extracted_values'] = df.apply(extract_values, axis=1)


#df['payment_method_availabe_possible'] = df['extracted_values'].apply(payment_methods_available)

### The main function

In [18]:
# Absolute http(s) URLs found anywhere in the page source.
_URL_PATTERN = re.compile(r'(https?://[a-zA-Z0-9.-]+\.[a-z]+)')
# Same, restricted to hosts containing 'cdn'.
_CDN_URL_PATTERN = re.compile(r'(https?://[a-zA-Z0-9.-]*cdn[a-zA-Z0-9.-]*\.[a-z]+)')
# src attribute of <script> tags that point at a .js file.
_SCRIPT_SRC_PATTERN = re.compile(
    r'<script[^>]*src=[\'"]([^\'"]*\.js)[\'"][^>]*>', re.IGNORECASE
)
# Body of an HTML comment.
_COMMENT_PATTERN = re.compile(r'<!--(.*?)-->', re.DOTALL | re.IGNORECASE)


def _collect_page_strings(url):
    """Fetch *url* and return the strings to scan for payment-provider markers.

    The returned list combines URLs found in the page source, URLs whose host
    contains 'cdn', ``.js`` script sources, and HTML comments that mention
    'begin' but not 'snippet'. A non-200 response yields ``['NA']``.
    Exceptions raised by the request or the parser propagate to the caller.
    """
    # 10-second timeout so one unresponsive site cannot stall the whole run.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return ['NA']

    page_source = response.text
    hosts = set(_URL_PATTERN.findall(page_source))
    cdn_hosts = set(_CDN_URL_PATTERN.findall(page_source))

    # Script sources are matched against BeautifulSoup's re-serialised markup
    # rather than the raw response text.
    soup = BeautifulSoup(page_source, 'html.parser')
    scripts = set(_SCRIPT_SRC_PATTERN.findall(str(soup)))

    begin_comments = [
        comment
        for comment in _COMMENT_PATTERN.findall(page_source)
        if 'begin' in comment.lower() and 'snippet' not in comment.lower()
    ]
    return [*hosts, *cdn_hosts, *scripts, *begin_comments]


def getting_payment_info(csv_file):
    """Detect likely payment providers for every website listed in a CSV file.

    Rows whose 'website' value is null are dropped, a scheme is added to the
    remaining values via ``add_https``, and each site is fetched once. A
    running counter is printed before each request, and any exception is
    printed and treated as "nothing detected" so one bad site does not abort
    the run.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of a CSV that
            has a 'website' column.

    Returns:
        The filtered input DataFrame, keeping its original index, with the
        normalised 'website' column and a new
        'payment_method_available_possible' column holding a list of provider
        names per row.
    """
    urls = pd.read_csv(csv_file)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the frame returned by read_csv.
    urls = urls[urls['website'].notnull()].copy()
    urls['website'] = urls['website'].astype('str').apply(add_https)

    detected = []
    for i, url in enumerate(urls['website']):
        print(i)
        try:
            page_strings = _collect_page_strings(url)
        except Exception as e:
            # Best-effort scrape: report the failure and carry on.
            print(f"Exception occurred for URL {url}: {e}")
            page_strings = ['Exception']
        detected.append(payment_methods_available(page_strings))

    # Assign once, explicitly on urls' own index. The filtered frame can have
    # gaps in its index (dropped null rows), so a Series with a fresh 0..n-1
    # index would be aligned to the wrong rows.
    urls['payment_method_available_possible'] = pd.Series(
        detected, index=urls.index, dtype=object
    )
    return urls
        




### Below is just an example of how your CSV should look (the only important thing is that the column name is 'website')

In [19]:
# Preview the example input file (relative path, so it is looked up in the current working directory).
pd.read_csv('testing_pipeline_merchants.csv')

Unnamed: 0,Name,website
0,Suger candy,sugercandy.com
1,Radha Mohan Enterprises,sircorbett.com
2,PinkWoolf,pinkwoolf.com
3,Shriya Singhi Label,shriyasinghi.com
4,ISU,isufashion.com
...,...,...
3510,Bombay Greens,Www.bombaygreens.com
3511,uncletony,uncletony.com
3512,The Indian Garage,https://tigc.in/
3513,Lipka Home,https://lipkahome.com/


### Put your file name here and the function will do the rest. The printed counter shows the position of the website currently being processed

In [22]:
# Replace 'yourfilename.csv' with the name of your CSV file; it must have a 'website' column.
df = getting_payment_info('yourfilename.csv')

0
1
Exception occurred for URL https://sircorbett.com: HTTPSConnectionPool(host='sircorbett.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'sircorbett.com' doesn't match either of '*.myshopify.com', 'myshopify.com'")))
2
3
4
5
6
7
8
9
10
11
12
13
14


### Print your DataFrame; you can save it as well

In [23]:
# Display the resulting DataFrame: the input rows plus the
# 'payment_method_available_possible' column.
df

Unnamed: 0,Name,website,payment_method_available_possible
0,Suger candy,https://sugercandy.com,[Zecpe]
1,Radha Mohan Enterprises,https://sircorbett.com,[]
2,PinkWoolf,https://pinkwoolf.com,[Simpl]
3,Shriya Singhi Label,https://shriyasinghi.com,[Simpl]
4,ISU,https://isufashion.com,[]
5,CSC by Jai Ingredients,https://teamcsc.in,[]
6,Santhitham,https://santhitham.shop,[Simpl]
7,BLUE BREW,https://bluebrew.in,[]
8,Dromen and Co,https://dromenco.com,[]
9,OFFO Store,https://offostore.com,[]


In [24]:
# Save the results to a CSV file.
# NOTE(review): to_csv writes the index as an extra unnamed column by default,
# which comes back as 'Unnamed: 0' when the file is read again; pass
# index=False if that column is not wanted.
df.to_csv('file_name.csv')