In [67]:
!pip install beautifulsoup4



In [68]:
from bs4 import BeautifulSoup
import requests

In [69]:
import pandas as pd

In [79]:
def get_topic_titles(soup):
  """Return the text of every topic-title <p> tag found in `soup`.

  Tags are selected by the exact class string used below; the text of each
  match is returned as-is (no whitespace stripping), in document order.
  """
  title_class="f3 lh-condensed mb-0 mt-1 Link--primary"
  return [p_tag.text for p_tag in soup.find_all("p",{"class":title_class})]

def get_topic_desc(soup):
  """Return the whitespace-stripped text of every topic-description <p> tag.

  Tags are selected by the exact class string used below and returned in
  document order.
  """
  desc_class="f5 color-text-secondary mb-0 mt-1"
  return [p_tag.text.strip() for p_tag in soup.find_all("p",{"class":desc_class})]

def get_topic_url(soup,base_url):
  """Return absolute topic URLs built from the topic links in `soup`.

  Each matching <a> tag's href is appended directly to `base_url` (plain
  string concatenation, no separator is inserted).
  """
  link_class="d-flex no-underline"
  return [base_url+anchor['href'] for anchor in soup.find_all("a",{"class":link_class})]

In [83]:
def get_topic_data(base_url,page="",timeout=30):
  """Download a listing page and return the topics found on it as a DataFrame.

  Args:
    base_url: Site root without a trailing slash, e.g. "https://github.com".
      It is also used as the prefix for the scraped topic links.
    page: Path appended after base_url + "/". The default "" requests the
      site root itself.
    timeout: Seconds passed to requests.get so a stalled connection cannot
      hang the notebook indefinitely.

  Returns:
    pandas.DataFrame with the columns "title", "description" and "url",
    filled from get_topic_titles, get_topic_desc and get_topic_url.

  Raises:
    Exception: if the server answers with a status code other than 200.
    ValueError: raised by pandas when the three scraped lists do not have
      the same length (e.g. a topic without a description).
  """
  page_url=base_url+"/"+page
  response=requests.get(page_url,timeout=timeout)
  if response.status_code != 200:
    # Report the URL that was actually requested, not just the site root.
    raise Exception(f"an error occurred while connecting to: {page_url}")
  soup = BeautifulSoup(response.text,'html.parser')
  topic_dict={
    "title":get_topic_titles(soup),
    "description":get_topic_desc(soup),
    "url":get_topic_url(soup,base_url)
  }
  return pd.DataFrame(topic_dict)

In [84]:
# Root of the site being scraped. get_repo_info also reads this module-level
# name when it builds repository URLs, so this cell must run before it is used.
base_url="https://github.com"
# Request base_url + "/topics" and show the scraped topics table as the cell output.
get_topic_data(base_url,"topics")

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [85]:
def get_repo_page(url,timeout=30):
  """Download `url` and return its HTML parsed with BeautifulSoup.

  Args:
    url: Absolute URL of the page to fetch.
    timeout: Seconds passed to requests.get so a stalled connection cannot
      hang the notebook indefinitely.

  Returns:
    BeautifulSoup object built from the response text with 'html.parser'.

  Raises:
    Exception: if the server answers with a status code other than 200.
  """
  response=requests.get(url,timeout=timeout)
  if response.status_code != 200:
    raise Exception(f"unable to access:{url}")
  topic_soup=BeautifulSoup(response.text,'html.parser')
  return topic_soup

In [86]:
def convert_stars_to_numbers(stars):
  """Convert a displayed star count such as "73.2k" or "950" to an int.

  Args:
    stars: Star count text. A trailing "k"/"K" means thousands; surrounding
      whitespace and thousands separators (",") are ignored.

  Returns:
    The count as an int: "73.2k" -> 73200, "950" -> 950.

  Raises:
    ValueError: if the text (after removing the suffix) is not a number,
      including the empty string.
  """
  text=stars.strip().replace(",","").lower()
  if text.endswith("k"):
    # round() before int(): the float product can land just below the
    # intended integer, and int() alone would truncate it.
    return int(round(float(text[:-1])*1000))
  # No suffix: the text already is the full count, so do not scale it.
  return int(round(float(text)))

In [87]:
def get_repo_info(h3_tag,stars_tag):
  """Extract one repository's details from its heading and star-count tags.

  The first <a> inside `h3_tag` supplies the username and the second supplies
  the repository name and href. The href is appended to the module-level
  `base_url`, and the stripped text of `stars_tag` is converted with
  convert_stars_to_numbers.

  Returns:
    Tuple (username, repo_name, repo_url, star).
  """
  links=h3_tag.find_all("a")
  owner=links[0].text.strip()
  repo_link=links[1]
  name=repo_link.text.strip()
  full_url=base_url+repo_link['href']
  star_count=convert_stars_to_numbers(stars_tag.text.strip())
  return owner,name,full_url,star_count



In [88]:
def get_repo_data_df(topic_soup):
  """Build a DataFrame of the repositories listed on a parsed topic page.

  Repository headings (<h3>) and star counters (<a>) are looked up by the
  exact class strings below and paired by position; each pair is passed to
  get_repo_info.

  Returns:
    pandas.DataFrame with the columns "username", "repo_name", "repo_url"
    and "star", one row per heading found.
  """
  headings=topic_soup.find_all("h3",{"class":"f3 color-text-secondary text-normal lh-condensed"})
  star_links=topic_soup.find_all("a",{"class":"social-count float-none"})
  columns=("username","repo_name","repo_url","star")
  repo_dict={column:[] for column in columns}
  for position,heading in enumerate(headings):
    # get_repo_info returns its values in the same order as `columns`.
    repo_info=get_repo_info(heading,star_links[position])
    for column,value in zip(columns,repo_info):
      repo_dict[column].append(value)
  return pd.DataFrame(repo_dict)

In [92]:
# Scrape the repository listing of the first topic on the topics page.
topics_df=get_topic_data(base_url,"topics")
# Select the link by its column label: integer keys on a label-indexed row
# (the former `.iloc[0][2]`) are deprecated in pandas.
first_topic_url=topics_df.iloc[0]["url"]
get_repo_data_df(get_repo_page(first_topic_url))

Unnamed: 0,username,repo_name,repo_url,star
0,mrdoob,three.js,https://github.com/mrdoob/three.js,73200
1,libgdx,libgdx,https://github.com/libgdx/libgdx,18700
2,BabylonJS,Babylon.js,https://github.com/BabylonJS/Babylon.js,14500
3,pmndrs,react-three-fiber,https://github.com/pmndrs/react-three-fiber,14100
4,aframevr,aframe,https://github.com/aframevr/aframe,12900
5,ssloy,tinyrenderer,https://github.com/ssloy/tinyrenderer,11000
6,lettier,3d-game-shaders-for-beginners,https://github.com/lettier/3d-game-shaders-for...,10800
7,FreeCAD,FreeCAD,https://github.com/FreeCAD/FreeCAD,9600
8,metafizzy,zdog,https://github.com/metafizzy/zdog,8600
9,CesiumGS,cesium,https://github.com/CesiumGS/cesium,7300
