# Installing and Importing Python Libraries

In [48]:
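# Uncomment on a fresh environment. The PyPI distribution is "beautifulsoup4" ("bs4" is only a shim for it).
# %pip install beautifulsoup4 --quiet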

In [49]:
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Building the Web Scraping Program

In [50]:
# Profile page requested in the single-profile cells that follow.
url = "https://github.com/KevsterAmp"
# Headers dictionary passed to requests.get (a browser-like User-Agent).
useragent = dict([("User-agent", "Mozilla/5.0")])

In [51]:
# Fetch the profile page. The timeout keeps the cell from hanging forever on an
# unresponsive server (requests waits indefinitely when no timeout is given).
r = requests.get(url, headers=useragent, timeout=30)
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning.
html = BeautifulSoup(r.text, "html.parser")

In [52]:
# Text of the "vcard-fullname" span, with surrounding whitespace removed.
name_tag = html.select_one("span.p-name.vcard-fullname.d-block.overflow-hidden")
name = name_tag.text.strip()
name

'Kevin Christian Amparado'

In [53]:
# Text of the "vcard-username" span, with surrounding whitespace removed.
username_tag = html.select_one("span.p-nickname.vcard-username.d-block")
username = username_tag.text.strip()
username

'KevsterAmp'

In [54]:
# Collect every direct child of the bio container, then join their stripped
# texts with single spaces into one string.
bio = html.select("div.user-profile-bio > * ")
bio_parts = []
for child in bio:
    bio_parts.append(child.text.strip())
bio = " ".join(bio_parts)
bio

'Head of Data Engineering at @Google-Developer-Student-Clubs-PUP-Main'

In [55]:
# Read the contributions heading and keep only its numeric characters.
contrib_heading = html.select_one("div.position-relative > h2.f4.text-normal.mb-2")
contrib = contrib_heading.text
last = "".join(filter(str.isnumeric, contrib))
last

'424'

In [56]:
# The first two matching links are read as the followers and following
# counters, in that order; the count is the bold span inside each link.
tags = html.select("a.Link--secondary.no-underline.no-wrap")
count_selector = "span.text-bold.color-fg-default"
followers_link, following_link = tags[0], tags[1]
followers = followers_link.select_one(count_selector).text
following = following_link.select_one(count_selector).text
print("followers", followers)
print("following", following)

followers 16
following 32


# Importing the List of Usernames into a DataFrame

In [57]:
# Load the CSV of submissions (timestamp, full name, GitHub username per row).
responses_path = "test forms.csv"
df = pd.read_csv(responses_path)
df.head()

Unnamed: 0,Timestamp,"Full Name (GN, LN)\nex: Kevin Christian Amparado",Input Github Username:\nex: Kevsteramp
0,2023/03/13 6:13:53 PM GMT+8,Alexandre Pornea,alexandrepornea
1,2023/03/13 6:13:57 PM GMT+8,Arjay Rosel,r-json
2,2023/03/13 6:16:27 PM GMT+8,mark,markmcrg


In [58]:
# Remove the submission timestamp column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second run no longer raises KeyError
# because 'Timestamp' is already gone.
df = df.drop(columns=['Timestamp'], errors='ignore')
df.head()

Unnamed: 0,"Full Name (GN, LN)\nex: Kevin Christian Amparado",Input Github Username:\nex: Kevsteramp
0,Alexandre Pornea,alexandrepornea
1,Arjay Rosel,r-json
2,mark,markmcrg


In [59]:
# Replace the long form-question headers with short column names.
# The assignment is positional, so it expects exactly two columns at this point.
short_column_names = ['name', 'github_username']
df.columns = short_column_names
df.head()

Unnamed: 0,name,github_username
0,Alexandre Pornea,alexandrepornea
1,Arjay Rosel,r-json
2,mark,markmcrg


In [60]:
# Build each profile URL by prefixing the submitted username with the GitHub host.
df['github_link'] = df['github_username'].map(lambda handle: 'https://github.com/' + handle)
df.head()

Unnamed: 0,name,github_username,github_link
0,Alexandre Pornea,alexandrepornea,https://github.com/alexandrepornea
1,Arjay Rosel,r-json,https://github.com/r-json
2,mark,markmcrg,https://github.com/markmcrg


In [61]:
# Quick check that the first profile link responds. Sent with the same
# User-Agent header as the other requests in this notebook, with a timeout so
# the cell cannot hang, and using positional .iloc[0] so it does not depend on
# the index labels of df.
r = requests.get(df['github_link'].iloc[0], headers=useragent, timeout=30)
r

<Response [200]>

In [62]:
# Column-wise accumulator for the scraped profiles: one list per output column,
# each receiving one value per scraped URL.
# (`time` is imported in the notebook's import cell rather than here.)
output_dict = {
    'github_link': [],
    'github_name': [],
    'github_username': [],
    'bio': [],
    'contributions': [],
    'followers': [],
    'following': [],
}

def to_text(tag):
    """Return ``tag.text``, or ``None`` when ``tag`` itself is ``None``."""
    if tag is None:
        return None
    return tag.text

# Scrape every profile listed in df['github_link'] and append one value per
# field to output_dict, then report the elapsed wall-clock time in minutes.
start = time.time()
for url in df['github_link']:
    # The timeout stops a single unresponsive request from stalling the whole run.
    r = requests.get(url, headers=useragent, timeout=30)
    # Name the parser explicitly so parsing does not depend on which optional
    # parsers are installed (and no GuessedAtParserWarning is emitted).
    html = BeautifulSoup(r.text, "html.parser")

    print("scraping url:", url)

    # to_text() returns None when a selector matches nothing, so strip only when
    # there is text to strip; calling .strip() directly on its result raised
    # AttributeError for pages without these elements.
    name = to_text(html.select_one("span.p-name.vcard-fullname.d-block.overflow-hidden"))
    name = name.strip() if name is not None else None
    username = to_text(html.select_one("span.p-nickname.vcard-username.d-block"))
    username = username.strip() if username is not None else None

    # Join the stripped text of every direct child of the bio container.
    bio = html.select("div.user-profile-bio > * ")
    clean_bio = " ".join(tag.text.strip() for tag in bio)

    # Keep only the numeric characters of the contributions heading.
    contrib = to_text(html.select_one("div.position-relative > h2.f4.text-normal.mb-2"))
    clean_contrib = "".join(ch for ch in contrib if ch.isnumeric()) if contrib is not None else None

    # The first two matching links are read as the followers and following
    # counters. When fewer than two are present, both fall back to "".
    tags = html.select("a.Link--secondary.no-underline.no-wrap")
    try:
        followers = to_text(tags[0].select_one("span.text-bold.color-fg-default"))
        following = to_text(tags[1].select_one("span.text-bold.color-fg-default"))
    except IndexError:
        # Only a missing link is expected here; any other error should surface.
        followers = ""
        following = ""

    output_dict['github_link'].append(url)
    output_dict['github_name'].append(name)
    output_dict['github_username'].append(username)
    output_dict['bio'].append(clean_bio)
    output_dict['contributions'].append(clean_contrib)
    output_dict['followers'].append(followers)
    output_dict['following'].append(following)

    print(url, "appended to output_dict")

end = time.time()
print(str((end - start) / 60), "elapsed minutes")

scraping url: https://github.com/alexandrepornea
https://github.com/alexandrepornea appended to output_dict
scraping url: https://github.com/r-json
https://github.com/r-json appended to output_dict
scraping url: https://github.com/markmcrg
https://github.com/markmcrg appended to output_dict
0.040669584274291994 elapsed minutes


In [63]:
# Turn the column-wise accumulator into a DataFrame (one row per scraped URL).
df2 = pd.DataFrame.from_dict(output_dict)
df2

Unnamed: 0,github_link,github_name,github_username,bio,contributions,followers,following
0,https://github.com/alexandrepornea,,alexandrepornea,,1,,
1,https://github.com/r-json,Arjay Rosel,r-json,,4,3.0,5.0
2,https://github.com/markmcrg,Mark,markmcrg,Data Engineering Fellow Lead at GDSC - PUP Main,1355,4.0,6.0


In [64]:
# Join the scraped fields onto the form responses using the profile link only.
# The username scraped from the page can differ from the one typed into the
# form (for example in letter case), and also matching on it would leave such
# rows without any scraped data. df2's github_username is dropped so the result
# keeps the form's column and the same column layout as before. Duplicate links
# are collapsed so repeated submissions do not multiply rows in the join.
scraped_fields = df2.drop(columns='github_username').drop_duplicates(subset='github_link')
out_df = pd.merge(df, scraped_fields, on='github_link', how='left')
out_df

Unnamed: 0,name,github_username,github_link,github_name,bio,contributions,followers,following
0,Alexandre Pornea,alexandrepornea,https://github.com/alexandrepornea,,,1,,
1,Arjay Rosel,r-json,https://github.com/r-json,Arjay Rosel,,4,3.0,5.0
2,mark,markmcrg,https://github.com/markmcrg,Mark,Data Engineering Fellow Lead at GDSC - PUP Main,1355,4.0,6.0
