## GreenCatcher Crawler - Fill Excel at Runtime - 15 First Level Links Exploration

In [None]:
import sys, os, re
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import datetime
from tqdm import tqdm
# CHANGE THIS TO THE FOLDER THAT CONTAINS YOUR wayback_machine_GreenCatcher.py FILE (FOLDER PATH ONLY, DO NOT INCLUDE THE FILE NAME)
sys.path.append(r'.\measuring-founding-strategy-main\crawler') 
from wayback_machine_GreenCatcher import wayback_machine_GreenCatcher

In [None]:
from langdetect import detect

def detect_language(corpus):
    """Guess the language of the scraped text.

    Args:
        corpus: Either a single string or an iterable of strings. An iterable
            is joined with single spaces before detection; a plain string is
            passed to ``detect`` unchanged rather than being split into
            space-separated characters.

    Returns:
        The language code returned by ``detect``, or ``"Unknown"`` when the
        text cannot be built or detection raises any ordinary exception.
    """
    try:
        text = corpus if isinstance(corpus, str) else ' '.join(corpus)
        return detect(text)
    except Exception:
        # Best-effort: any detection problem maps to "Unknown". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate so a
        # long crawl can still be stopped by the user.
        return "Unknown"

In [None]:
# The author's premise is that the Wayback Machine was first set up in 2001, so no
# snapshot should date before 2000. That year is therefore the lower bound used when
# listing the snapshots of a website.

# seconds to wait before retrying after an extraction error
sleep_time = 10

# maximum number of attempts per firm (snapshot listing) and per year (crawl) before
# giving up and moving on, so that a failure which repeats on every attempt
# (e.g. a website without any snapshot) cannot hang the whole run
max_retries = 5

# first year to look for snapshots from, plus the month and day around which the
# closest snapshot of each year is searched
unprec_year, crawl_month, crawl_day = 2000, 6, 30

# CHANGE THIS WITH THE PATH WHERE YOU KEEP YOUR "FIRMS TO SCRAPE" EXCEL FILE, INSERT THE NAME OF THE EXCEL FILE AS WELL
df = pd.read_excel(r'.\Firms to Scrape.xlsx')

# batch size: the firms are processed n at a time, and after each batch an Excel
# file with the scraped text is written, so partial results survive a crash
n = 3

# columns of every output file, in the order they are written
output_columns = ['UID', 'Company', 'YearFounded', 'Website', 'ClosestSnapshot',
    'ClosestSnapshotYear', 'TimeStamp', 'Year', 'HomepageText', 'statuscode', 'HTML Language', 'Recognized Language', 'IsValidWebsite']

# Stepping by n yields exactly the non-empty batches; no empty trailing file is
# produced when the number of firms is a multiple of n.
for i, start_row in enumerate(range(0, df.shape[0], n)):

    # define the subset of firms which we will scrape in this iteration
    end_row = min(start_row + n, df.shape[0])
    operative_df = df[start_row:end_row]

    # rows are collected as dicts and turned into a dataframe once per batch
    output_rows = []

    for row in tqdm(operative_df.index):

        url, founding_year = operative_df.at[row, 'Website'], operative_df.at[row, 'YearFounded']

        # list all snapshots of the website and derive the year span to crawl
        for attempt in range(1, max_retries + 1):
            try:
                crawler = wayback_machine_GreenCatcher(url, year_folder = True)

                snapshots = crawler.list_snapshots(unprec_year, 1, 1, 2024, 12, 31)
                snapshot_timestamps = snapshots['timestamp'].values
                earliest_snapshot_timestamp = min(snapshot_timestamps)
                earliest_snapshot_year = int(str(earliest_snapshot_timestamp)[0:4])
                latest_snapshot_year = int(str(max(snapshot_timestamps))[0:4])
                break
            except Exception as e:
                print('Get Params Failed: {0}'.format(e))
                time.sleep(sleep_time)
        else:
            # every attempt failed: skip this firm instead of retrying forever
            print('Skipping {0}: no snapshot parameters after {1} attempts'.format(url, max_retries))
            continue

        # YearFounded may come out of Excel as a float, or be missing altogether;
        # range() needs an int, and a missing year falls back to the earliest snapshot.
        if pd.isna(founding_year):
            first_crawl_year = earliest_snapshot_year
        else:
            first_crawl_year = max(earliest_snapshot_year, int(founding_year))

        for crawl_year in range(first_crawl_year, latest_snapshot_year + 1):

            # target timestamp (YYYYMMDDhhmmss as an integer) for this year
            sought_date = int(datetime.date(year = crawl_year, month = crawl_month, day = crawl_day).strftime("%Y%m%d") + '000000')

            for attempt in range(1, max_retries + 1):
                try:
                    # pick the snapshot whose timestamp is numerically closest to the target
                    snapshots['closeness'] = np.abs(snapshots['timestamp'] - sought_date)
                    min_index = snapshots['closeness'].idxmin()
                    closest_snapshot_timestamp = snapshots.loc[min_index, 'timestamp']
                    closest_snapshot_year = int(str(closest_snapshot_timestamp)[0:4])
                    closest_status = snapshots.loc[min_index, 'statuscode']

                    temp_result = crawler.crawl_from_date(crawl_year, crawl_month, crawl_day, levels=1, counter_threshold=15)
                    homepage_language, corpus = temp_result[0], temp_result[1]

                    output_rows.append({
                        'UID' : operative_df.at[row, 'UID'],
                        'Company' : operative_df.at[row, 'Company'],
                        'YearFounded' : operative_df.at[row, 'YearFounded'],
                        'Website' : operative_df.at[row, 'Website'],
                        'ClosestSnapshot' : earliest_snapshot_timestamp, # ClosestSnapshot in excel is earliest snapshot of wayback machine.
                        'ClosestSnapshotYear' : earliest_snapshot_year,
                        'TimeStamp' : datetime.datetime.strptime(
                        str(closest_snapshot_timestamp), "%Y%m%d%H%M%S" # TimeStamp is that of the closest snapshot to the currently scraped year.
                        ).strftime("%Y-%m-%d %H:%M:%S"),
                        'Year' : closest_snapshot_year,
                        'HomepageText' : corpus,
                        'statuscode': closest_status,
                        'HTML Language' : homepage_language,
                        'Recognized Language' : detect_language(corpus),
                        'IsValidWebsite' : crawler.is_valid_url(url)})
                    break
                except Exception as e:
                    print('Get Year Failed: {0}'.format(e))
                    time.sleep(sleep_time)
            else:
                # every attempt failed: leave this year out and carry on with the next one
                print('Skipping {0} for year {1} after {2} attempts'.format(url, crawl_year, max_retries))

    # materialize the batch; an empty batch result still gets the full set of columns
    output_df = pd.DataFrame(output_rows, columns=output_columns)
    output_df.to_excel(f'out_summary_{i}.xlsx', index=False, engine='xlsxwriter')

print('Iterations Over')