# Top Repositories From GitHub

In [167]:
import pandas as pd

## Pick a website and describe your objective
- Browse through different sites and pick one to scrape. Check the "Project Ideas" section for inspiration.
- Identify the information you'd like to scrape from the site. Decide the format of the output CSV file.
- Summarize your project idea and outline your strategy in a Jupyter notebook. Use the "New" button above.

## Use the requests library to download web pages

- Inspect the website's HTML source and identify the right URLs to download.
- Download and save web pages locally using the requests library.
- Create a function to automate downloading for different topics/search queries.

In [174]:
import requests

In [175]:
# URL of the GitHub topics listing page; this is the page the notebook downloads first.
topic_url = "https://github.com/topics"

In [176]:
# Issue an HTTP GET for the topics page and keep the whole response object
# so that its status code and body can be inspected separately.
response = requests.get(topic_url)

In [177]:
response.status_code

200

In [179]:
len(response.text)

144363

In [181]:
# Decoded HTML body of the topics page, as a single string.
page_contents = response.text 

In [184]:
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark" data-a11y-animated-images="system">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-UXiu4O52iBFkqt6Kx5t+pqHYP2/LWWIw9+l5ia74TWw+xPzpH44BFfAQp7yzCe0XFGZa72Xiqyml6tox1KkUjw==" rel="stylesheet" href="https://github.githubassets.com/assets/light-5178aee0ee76.css" /><link crossorigin="anonymous" media="all" integrity="sha512-IX1PnI5wWBz8Kgb1JI0f2QFa/WuRQQHJHe0vkKinQzsxRlNb4b8NgODX5htSZVAAk

In [183]:
# Save the downloaded HTML next to the notebook.
# The encoding is passed explicitly: the page declares charset utf-8, and
# without it open() falls back to the platform default encoding, which may not
# be able to represent every character in the page.
with open("webpage.html", "w", encoding="utf-8") as f:
    f.write(page_contents)

## Use Beautiful Soup to parse and extract information

- Parse and explore the structure of downloaded web pages using Beautiful Soup.
- Use the right properties and methods to extract the required information.
- Create functions to extract from the page into lists and dictionaries.
- (Optional) Use a REST API to acquire additional information if required.

In [185]:
from bs4 import BeautifulSoup

In [189]:
# Parse the downloaded topics page using the "html.parser" backend.
doc = BeautifulSoup(page_contents, "html.parser")

Topic title `<p>` tags

In [217]:
# Class string carried by the <p> elements that hold each topic's title.
selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
topic_title_tags = doc.find_all(name="p", attrs={"class": selection_class})

In [250]:
len(topic_title_tags)

30

Topic Title List 

In [251]:
# Text of every topic-title <p> tag, in page order.
# The tags are stored in `topic_title_tags`; the name `topic_title_p_tags` that
# this cell used to iterate over is not defined by any cell above, so the cell
# raised NameError when the notebook was run from a fresh kernel.
topic_titles = [title_tag.text for title_tag in topic_title_tags]

print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


Description tags

In [270]:
# Class string shared by the <p> elements that carry each topic's description.
decription_tags = "f5 color-fg-muted mb-0 mt-1"
description_topic_tags = doc.find_all(name="p", attrs={"class": decription_tags})



Description List

In [271]:
# Whitespace-stripped description text of each topic, in page order.
description = [desc_tag.text.strip() for desc_tag in description_topic_tags]

# Preview only the first five descriptions.
print(description[:5])
 

['3D modeling is the process of virtually developing the surface and structure of a 3D object.', 'Ajax is a technique for creating interactive web applications.', 'Algorithms are self-contained sequences that carry out a variety of tasks.', 'Amp is a non-blocking concurrency library for PHP.', 'Android is an operating system built by Google designed for mobile devices.']


In [272]:
# First topic-title tag, used below to find out where a topic's link lives.
# Indexes `topic_title_tags` (the list returned by find_all); the previous
# reference to `topic_title_p_tags` pointed at a name that no cell defines.
topic_title_tags0 = topic_title_tags[0]

End Url 

In [273]:
# Read the "href" attribute of the title tag's parent element; the value is a
# site-relative path (it is prefixed with the site root in the next cell).
end_url = topic_title_tags0.parent["href"]

url

In [274]:
# Prefix the site root to the relative path to obtain an absolute URL.
# NOTE(review): this rebinds `topic_url`, which earlier held the topics listing
# URL; re-running the download cell after this one fetches a different page.
topic_url ="https://github.com" + end_url
print(topic_url)

https://github.com/topics/3d


In [275]:
# All <a> elements with this class string; their href values are combined with
# the site root below to build the topic page URLs.
topic_link_tags = doc.find_all("a", {"class": "no-underline flex-grow-0"})

Topic page URLs

In [276]:
# Absolute URL of every topic page: site root + each link tag's relative href.
base_url = "https://github.com"
topic_link_tag_list = [base_url + link_tag["href"] for link_tag in topic_link_tags]

print(topic_link_tag_list)
    
    

['https://github.com/topics/3d', 'https://github.com/topics/ajax', 'https://github.com/topics/algorithm', 'https://github.com/topics/amphp', 'https://github.com/topics/android', 'https://github.com/topics/angular', 'https://github.com/topics/ansible', 'https://github.com/topics/api', 'https://github.com/topics/arduino', 'https://github.com/topics/aspnet', 'https://github.com/topics/atom', 'https://github.com/topics/awesome', 'https://github.com/topics/aws', 'https://github.com/topics/azure', 'https://github.com/topics/babel', 'https://github.com/topics/bash', 'https://github.com/topics/bitcoin', 'https://github.com/topics/bootstrap', 'https://github.com/topics/bot', 'https://github.com/topics/c', 'https://github.com/topics/chrome', 'https://github.com/topics/chrome-extension', 'https://github.com/topics/cli', 'https://github.com/topics/clojure', 'https://github.com/topics/code-quality', 'https://github.com/topics/code-review', 'https://github.com/topics/compiler', 'https://github.com/t

In [280]:
# Column-oriented view of the scraped topics: each key maps to one list.
topic_dict = dict(
    [
        ("titles", topic_titles),
        ("description", description),
        ("link_tags", topic_link_tag_list),
    ]
)


## Create CSV file(s) with the extracted information

- Create functions for the end-to-end process of downloading, parsing, and saving CSVs.
- Execute the function with different inputs to create a dataset of CSV files.
- Verify the information in the CSV files by reading them back using Pandas.

In [281]:
# Build a DataFrame from the dict of lists; each key becomes a column.
topics_df = pd.DataFrame(topic_dict)

In [282]:
# Write the topics table to disk without the row-index column.
# `index` is documented as a bool, so pass False explicitly instead of None.
# NOTE(review): the output file name has no ".csv" extension; it is kept as-is
# in case other cells read the file back under this name.
topics_df.to_csv("topic", index=False)

## Getting repository information out of a topic page

In [283]:
# Take the first topic page URL from the list as the example page to scrape.
topic_page_url =topic_link_tag_list[0]

In [289]:
topic_page_url

'https://github.com/topics/3d'

In [290]:
# Download the example topic page.
# Note: this rebinds `response`, which previously held the topics-listing response.
response = requests.get(topic_page_url)

In [291]:
response.status_code

200

In [292]:
len(response.text)

649599

In [294]:
# Parse the topic page HTML into its own soup object, separate from `doc`.
topic_doc = BeautifulSoup(response.text, "html.parser")

H3 Headers in the document 

In [301]:
 # <h3> elements with this class string; each one wraps the owner and repository links.
 # Searches `topic_doc`, the soup parsed from the topic page. The cell used to
 # call `topic_dot.find_all`, a misspelled name that raises NameError.
 repo_tags = topic_doc.find_all("h3", {"class":"f3 color-fg-muted text-normal lh-condensed"})


`<a>` tags in the first repository heading

In [302]:
# Anchor tags inside the first repository heading.
a_tags = repo_tags[0].find_all("a")

In [303]:
a_tags

[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
             mrdoob
 </a>,
 <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data

Href text

In [308]:
a_tags[0].text.strip()

'mrdoob'

In [265]:
a_tags[1].text.strip()

'three.js'

Repository URL

In [309]:
# Site root used to turn relative hrefs into absolute URLs (repo_info reads this name too).
baseurl = "https://github.com"
# The second anchor of the heading links to the repository; its href is site-relative.
url = baseurl + a_tags[1]["href"]
print(url)

https://github.com/mrdoob/three.js


# Star Tags

In [313]:
# Star counters: every <span> with this class string on the topic page.
star_tags = topic_doc.find_all("span", {"class":"Counter js-social-count"} )

In [314]:
len(star_tags)

30

In [315]:
star_tags[1].text

'20.3k'

Function to convert a star-count string to a number

In [135]:
def parser(stars):
    """Convert a star-count label such as "20.3k" or "950" into an int.

    Args:
        stars: The label text. Surrounding whitespace is ignored. A trailing
            "k" means thousands; otherwise the text is read as a plain integer,
            with "," thousands separators allowed.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the text is not a number in one of the forms above.
    """
    text = stars.strip()
    if text.endswith("k"):
        # round() rather than int(): float products can land just below the
        # intended whole number, and truncation would then be off by one.
        return round(float(text[:-1]) * 1000)
    # Counts below one thousand carry no suffix; previously these fell through
    # and the function returned None.
    return int(text.replace(",", ""))

In [316]:
parser(star_tags[0].text)

84000

In [317]:
def repo_info(h3_tags, stars_tag):
    """Pull owner, repository name, star count and URL out of one repository entry.

    Args:
        h3_tags: Heading element whose first anchor is the owner link and whose
            second anchor is the repository link.
        stars_tag: Element whose text is the star-count label handed to parser().

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = h3_tags.find_all("a")
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    # The href is site-relative, so prefix the module-level site root.
    link = baseurl + repo_link["href"]
    star_count = parser(stars_tag.text.strip())
    return owner, name, star_count, link

    
    

In [322]:
repo_info(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 84000, 'https://github.com/mrdoob/three.js')

In [319]:
# Column-wise accumulator: one list per field returned by repo_info.
list_dictionary = {"username": [], "repo_name": [], "stars": [], "repo_url": []}

for i in range(len(repo_tags)):
    repo = repo_info(repo_tags[i], star_tags[i])
    # repo_info returns its fields in the same order as the keys above, so the
    # tuple can be paired positionally with the column lists.
    for column, value in zip(list_dictionary.values(), repo):
        column.append(value)

    

In [328]:
list_dictionary

{'username': ['mrdoob',
  'libgdx',
  'pmndrs',
  'BabylonJS',
  'aframevr',
  'ssloy',
  'lettier',
  'FreeCAD',
  'metafizzy',
  'CesiumGS',
  'timzhang642',
  'a1studmuffin',
  'isl-org',
  'blender',
  'domlysz',
  'spritejs',
  'openscad',
  'tensorspace-team',
  'jagenjo',
  'YadiraF',
  'google',
  'AaronJackson',
  'ssloy',
  'FyroxEngine',
  'mosra',
  'tengbao',
  'gfxfundamentals',
  'cleardusk',
  'jasonlong',
  'cnr-isti-vclab'],
 'repo_name': ['three.js',
  'libgdx',
  'react-three-fiber',
  'Babylon.js',
  'aframe',
  'tinyrenderer',
  '3d-game-shaders-for-beginners',
  'FreeCAD',
  'zdog',
  'cesium',
  '3D-Machine-Learning',
  'SpaceshipGenerator',
  'Open3D',
  'blender',
  'BlenderGIS',
  'spritejs',
  'openscad',
  'tensorspace',
  'webglstudio.js',
  'PRNet',
  'model-viewer',
  'vrn',
  'tinyraytracer',
  'Fyrox',
  'magnum',
  'vanta',
  'webgl-fundamentals',
  '3DDFA',
  'isometric-contributions',
  'meshlab'],
 'stars': [84000,
  20300,
  18900,
  18000,
  1440

In [327]:
# Tabulate the scraped repositories; each key of list_dictionary becomes a column.
topic_repo_df = pd.DataFrame(list_dictionary)

In [331]:
def repo_data(topic_url):
    """Scrape one topic page and append its repositories to ``list_dictionary``.

    Downloads ``topic_url``, parses the HTML, and for every repository heading
    found appends the username, repository name, star count and repository URL
    to the lists in the module-level ``list_dictionary``. The lists are not
    reset, so entries accumulate across calls.

    Args:
        topic_url: URL of the topic page to download.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(topic_url)
    if response.status_code != 200:
        raise Exception("Error")
    # Parse into a local soup and search that. Previously the parsed page was
    # bound to `topic_url`, and the lookups below ran against the undefined name
    # `topic_dot` and the module-level `topic_doc`, so the page that had just
    # been downloaded was never used.
    topic_doc = BeautifulSoup(response.text, "html.parser")
    repo_tags = topic_doc.find_all("h3", {"class":"f3 color-fg-muted text-normal lh-condensed"})
    star_tags = topic_doc.find_all("span", {"class":"Counter js-social-count"})

    # Headings and star counters are paired by position on the page.
    for i in range(len(repo_tags)):
        repo = repo_info(repo_tags[i], star_tags[i])
        list_dictionary["username"].append(repo[0])
        list_dictionary["repo_name"].append(repo[1])
        list_dictionary["stars"].append(repo[2])
        list_dictionary["repo_url"].append(repo[3])
