In [70]:
import pandas as pd
import numpy as np
import pprint

In [27]:
path = '/Users/britt/repos/website_bodytext_scraper/website_bodytext_scraper/dev_exports/analysis_input.csv'
df = pd.read_csv(path)[['url', 'donation_page', 'stripe_code_detected']].replace({'Yes': True, 'No': False})

In [74]:
def calculate_metrics(df):
    ''' 
    Produces metrics for scraper results. DF must contain:
        - url: No nulls
        - donation_page: Str OR None
        - stripe_code_detected: Bool OR None
    
    '''
    total = df.shape[0]
    donation_page_found = df[df['donation_page'].notna()]['donation_page'].count()
    stripe_detected = df[df['stripe_code_detected'] == True]['stripe_code_detected'].count()

    perc_donation_of_total = donation_page_found / total * 100
    perc_strip_of_total = stripe_detected / total * 100
    perc_stripe_of_donation =  stripe_detected / donation_page_found * 100

    pprint.pp({
    'Percentage of URLs containing donation pages': round(perc_donation_of_total),
    'Percentage of URLs containing Stripe code': round(perc_strip_of_total),
    'Percentage of donation pages containing Stripe code': round(perc_stripe_of_donation)
    })

In [75]:
calculate_metrics(df)

{'Percentage of URLs containing donation pages': 50,
 'Percentage of URLs containing Stripe code': 10,
 'Percentage of donation pages containing Stripe code': 19}


In [78]:
df[df['donation_page'].notna()]['donation_page'].to_list()

['https://www.abcap.net/donate.html',
 'https://abcinc.org/donate/',
 'https://www.paypal.com/paypalme/LifeCenterVenangoCo',
 'https://www.abetterinternet.org/donate/',
 'https://secure.qgiv.com/for/4adfu8',
 'https://able-inc.org/donate/',
 'https://secure.everyaction.com/7NqPPSxbgkWuJaBjNSV8PQ2',
 'https://aboutcare.org/donations/',
 'https://acapnj.org/give/',
 'https://www.paypal.com/donate/?hosted_button_id=D466PX9JA63DW',
 'https://acc-ef.org/foundation/donate/donate-now.html',
 'https://www.access-psychology.org/donate/',
 'https://www.paypal.com/donate?hosted_button_id=ZEZYGDYKVQ64G',
 'https://secure.anedot.com/michigan-center-of-accountability-for-republicans/donate',
 'https://secure.lglforms.com/form_engine/s/dSiQp1WQBE6HaviNjjVzMg',
 'https://donorbox.org/american-council-of-engineering-companies-of-arizona-pac?utm_medium=qrcode&utm_source=qrcode',
 'https://secure.anedot.com/michigan-center-of-accountability-for-republicans/donate',
 'https://aciint.org/giving/online-givi

Scraper run

In [136]:
scraper_path = '/Users/britt/repos/website_bodytext_scraper/website_bodytext_scraper/dev_exports/stripe_2024-10-03T19-23-18+00-00.csv'
df_scrape = pd.read_csv(scraper_path)

In [137]:
df_scrape.columns

Index(['url', 'success', 'stripe_detected', 'stripe_code', 'error_code'], dtype='object')

In [138]:
df_scrape.shape[0]

21

In [139]:
df_scrape[df_scrape['success'] == True]['success'].count()

17

In [140]:
df_scrape[df_scrape['stripe_detected'] == True]['stripe_detected'].count()

1

In [141]:
df_scrape[df_scrape['stripe_detected'] == True]['url']

1    https://abcinc.org/donate/
Name: url, dtype: object

In [132]:
def calculate_scrape_metrics(df_scrape):
    ''' 
    Produces metrics for scraper results. DF must contain:
        - url: No nulls
        - donation_page: Str OR None
        - stripe_code_detected: Bool OR None
    
    '''
    total = df_scrape.shape[0]
    success = df_scrape[df_scrape['success'] == True]['success'].count()
    stripe_detected = df_scrape[df_scrape['stripe_detected'] == True]['stripe_detected'].count()

    # Calculate percentages
    perc_donation_of_total = success / total * 100
    perc_stripe_of_total = stripe_detected / total * 100
    perc_stripe_of_donation =  stripe_detected / success * 100

    # Summarize errors
    error_codes = df_scrape['error_code'].unique().tolist()
    error_obj = {}
    for code in error_codes:
        urls = df_scrape[df_scrape['error_code'] == code]['url'].to_list()
        error_obj[code] = urls

    pprint.pp({
        'Percentage of donation pages found': round(perc_donation_of_total),
        'Percentage of all attempted donation pages containing Stripe code': round(perc_stripe_of_total),
        'Percentage of found donation pages containing Stripe code': round(perc_stripe_of_donation),
        'Errors': error_obj
    })

calculate_scrape_metrics(df_scrape)

{'Percentage of donation pages found': 81,
 'Percentage of all attempted donation pages containing Stripe code': 5,
 'Percentage of found donation pages containing Stripe code': 6,
 'Errors': {nan: [],
            '<twisted.python.failure.Failure scrapy.spidermiddlewares.httperror.HttpError: Ignoring non-200 response>': ['https://secure.anedot.com/michigan-center-of-accountability-for-republicans/donate',
                                                                                                                         'https://acpmp.org/get-involved/donate/'],
            '<twisted.python.failure.Failure scrapy.exceptions.IgnoreRequest: Forbidden by robots.txt>': ['https://www.paypal.com/donate/?cmd=_s-xclick&hosted_button_id=LLYGMECLBHDYE&source=url&ssrt=1727721996964'],
            '<twisted.python.failure.Failure twisted.internet.error.TCPTimedOutError: TCP connection timed out: 60: Operation timed out.>': ['https://www.abgf.org/donate']}}


Attempt #2: Adding user agent

In [133]:
att_2 = '/Users/britt/repos/website_bodytext_scraper/website_bodytext_scraper/dev_exports/stripe_2024-10-08T18-35-30+00-00.csv'
df_scrape_2 = pd.read_csv(att_2)

In [134]:
calculate_scrape_metrics(df_scrape_2)

{'Percentage of donation pages found': 81,
 'Percentage of all attempted donation pages containing Stripe code': 0,
 'Percentage of found donation pages containing Stripe code': 0,
 'Errors': {nan: [],
            '<twisted.python.failure.Failure scrapy.spidermiddlewares.httperror.HttpError: Ignoring non-200 response>': ['https://secure.anedot.com/michigan-center-of-accountability-for-republicans/donate',
                                                                                                                         'https://acpmp.org/get-involved/donate/'],
            '<twisted.python.failure.Failure scrapy.exceptions.IgnoreRequest: Forbidden by robots.txt>': ['https://www.paypal.com/donate/?cmd=_s-xclick&hosted_button_id=LLYGMECLBHDYE&source=url&ssrt=1727721996964'],
            '<twisted.python.failure.Failure twisted.internet.error.TCPTimedOutError: TCP connection timed out: 60: Operation timed out.>': ['https://www.abgf.org/donate']}}
