# Data Crawling

In [14]:
import pandas as pd
import matplotlib.pyplot as plt

## Let's learn some basic *HTML*!!

...

## Web Crawling With `BeautifulSoup4`

In [40]:
import requests
html = requests.get('https://stackoverflow.com/questions?tab=newest&pagesize=50&page=1')
html.text[:3000]        # A gigantic messy string. Extremely difficult to interact with.

'<!doctype html>\n<html lang="fa-IR" dir="rtl">\n<head>\n  <title data-react-helmet="true">دیوار: بزرگترین سایت نیازمندی های رایگان در ایران</title>\n  <meta data-react-helmet="true" name="viewport" content="width=992, initial-scale=1"/><meta data-react-helmet="true" name="description" content="دیوار مرجع اصلی نیازمندی های رایگان و خرید و فروش کالای نو و دست دوم. نیازمندی های املاک، خودرو، استخدام، لوازم خانه، خدمات و سایر بخش های مورد نظر شما"/><meta data-react-helmet="true" name="twitter:card" content="summary"/><meta data-react-helmet="true" name="twitter:site" content="@divar_official"/><meta data-react-helmet="true" name="twitter:url" content="https://divar.ir"/><meta data-react-helmet="true" name="twitter:title" content="دیوار: بزرگترین سایت نیازمندی های رایگان در ایران"/><meta data-react-helmet="true" name="twitter:description" content="دیوار مرجع اصلی نیازمندی های رایگان و خرید و فروش کالای نو و دست دوم. نیازمندی های املاک، خودرو، استخدام، لوازم خانه، خدمات و سایر بخش های مورد 

In [16]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html.text)

## Finding Elements

In [17]:
all_links = soup.find_all('a')
print(all_links[5].text)


Stack Overflow
Public questions & answers



In [18]:
all_question_boxes = soup.find_all('div', attrs={'class': 's-post-summary'})
print(f'Number of question boxes in page: {len(all_question_boxes)}')
print('-------------------------------------------------------------')
print(all_question_boxes[1])

Number of question boxes in page: 50
-------------------------------------------------------------
<div class="s-post-summary js-post-summary" data-post-id="73315667" data-post-type-id="1" id="question-summary-73315667">
<div class="s-post-summary--stats js-post-summary-stats">
<div class="s-post-summary--stats-item s-post-summary--stats-item__emphasized" title="Score of 0">
<span class="s-post-summary--stats-item-number">0</span>
<span class="s-post-summary--stats-item-unit">votes</span>
</div>
<div class="s-post-summary--stats-item" title="0 answers">
<span class="s-post-summary--stats-item-number">0</span>
<span class="s-post-summary--stats-item-unit">answers</span>
</div>
<div class="s-post-summary--stats-item" title="2 views">
<span class="s-post-summary--stats-item-number">2</span>
<span class="s-post-summary--stats-item-unit">views</span>
</div>
</div>
<div class="s-post-summary--content">
<h3 class="s-post-summary--content-title">
<a class="s-link" href="/questions/73315667/how

**Extracting Question Titles**

In [19]:
all_question_title_elements = soup.select('.s-post-summary--content-title > a')     # Notice how we accessed specific tags using CSS selectors
all_question_titles = [element.text for element in all_question_title_elements]
all_question_titles[:10]

['How to check ! $ & present in password variable in Shell script',
 'How to implement a std::function with operator= that can check if its rhs has same signature',
 'Promise to return instead console.log',
 'FileSystemWatcher - The directory name does not exist',
 'How to identify that accumulated metadata is a problem in Spark?',
 "league/flysystem can't be installled",
 'Hackerrank Euler project #2 Solution in a different way. Receiving segmentation fault for test case 2 and 3',
 'Capturing hyperlinks conditionally in regex captures too much',
 'starts_with in presto?',
 'Error installing Ruby 3.1.2 on macOS 12.5']

**Extracting Question Excerpts**

In [20]:
all_question_excerpt_elements = soup.find_all('div', 's-post-summary--content-excerpt')
all_question_excerpts = [element.text for element in all_question_excerpt_elements]
all_question_excerpts[:5]

['\r\n                I am writing shell script which will validate entered password which should not accept ! $ & sign in password. I need to throw error messages. Kindly help me here.\nHere problem occurring when I ...\r\n            ',
 "\r\n                I'm learning to implement std::function and have found several articles about this. Unfortunately, none of their implementations can report a mistake when I assign a funtor with different signature. ...\r\n            ",
 "\r\n                I'm trying to do a Discord bot. I want to return the console.log content in a variable to return that as a message.\nI mean, instead of .then(console.log, console.error); assign the console.log content ...\r\n            ",
 '\r\n                I have a FileSystemWatcher hosted as a windows service, works fine when pointed to a local folder. but if I point to a network folder in the same server\nfileWatcher.Path = "\\\\uskansclapd01\\\\Input&...\r\n            ',
 '\r\n                I saw

**Extracting Username**

In [21]:
all_question_username_elements = soup.select('.s-user-card--info > .s-user-card--link > a')
all_question_usernames = [element.text for element in all_question_username_elements]
all_question_usernames[:10]

['Dhananjaya D N',
 'SynchronicK',
 'Lsyk4',
 'Kurubaran',
 'Davisson Paulino',
 'Ahmed Zidan',
 '776 Shivamrut',
 'Lewis Marhin',
 'sagar_c_k',
 'Justin']

**Extracting Question ID**

In [22]:
id_elements = soup.select('.s-post-summary--content > .s-post-summary--content-title > a')
all_question_ids = [element['href'].split('/')[2] for element in id_elements]
all_question_ids[:10]

['73315669',
 '73315667',
 '73315665',
 '73315663',
 '73315662',
 '73315661',
 '73315660',
 '73315659',
 '73315658',
 '73315657']

**Extracting Post Stats**

In [23]:
stats_element = soup.select('.s-post-summary--stats')
questions_upvotes = []
questions_answers = []
questions_views = []
for el in stats_element:
    children = list(el.findChildren('div'))
    questions_upvotes.append(
        children[0].find('span').text
    )
    questions_answers.append(
        children[1].find('span').text
    )
    questions_views.append(
        children[2].find('span').text
    )
print(questions_upvotes[:10])
print(questions_answers[:10])
print(questions_views[:10])

['0', '0', '0', '0', '0', '0', '-1', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['3', '2', '2', '2', '2', '3', '3', '4', '3', '3']


**Extracting Question Meta Tags**

In [24]:
meta_tags_container = soup.find_all('div', 's-post-summary--meta-tags')
all_question_meta_tags = []
for question_tag_container in meta_tags_container:
    temp = []
    for child in question_tag_container.findChildren('a'):
        temp.append(child.text)
    all_question_meta_tags.append(temp)

all_question_meta_tags[:5]

[['bash', 'shell', 'script'],
 ['c++', 'stl', 'sfinae'],
 ['javascript', 'node.js', 'discord', 'discord.js', 'rcon'],
 ['c#', '.net', 'filesystemwatcher'],
 ['apache-spark']]

## Creating Our Dataframe

In [25]:
from itertools import chain

questions_info_df = pd.DataFrame({
    'id': all_question_ids,
    'title': all_question_titles,
    'excerpt': all_question_excerpts,
    'username': all_question_usernames,
    'upvotes': questions_upvotes,
    'views': questions_views,
    'answers': questions_answers,
})

question_ids_multiplied = list(chain(
    *[[id]*len(meta_tags) for id, meta_tags in zip(all_question_ids, all_question_meta_tags)]
))      # Don't freak out!!! It's easy!!

meta_tags_df = pd.DataFrame({
    'id': question_ids_multiplied,
    'meta_tag': list(chain(*all_question_meta_tags))
})

In [26]:
questions_info_df.head()

Unnamed: 0,id,title,excerpt,username,upvotes,views,answers
0,73315669,How to check ! $ & present in password variabl...,\r\n I am writing shell script ...,Dhananjaya D N,0,3,0
1,73315667,How to implement a std::function with operator...,\r\n I'm learning to implement ...,SynchronicK,0,2,0
2,73315665,Promise to return instead console.log,\r\n I'm trying to do a Discord...,Lsyk4,0,2,0
3,73315663,FileSystemWatcher - The directory name does no...,\r\n I have a FileSystemWatcher...,Kurubaran,0,2,0
4,73315662,How to identify that accumulated metadata is a...,\r\n I saw that we can trigger ...,Davisson Paulino,0,2,0


In [27]:
meta_tags_df.head(10)

Unnamed: 0,id,meta_tag
0,73315669,bash
1,73315669,shell
2,73315669,script
3,73315667,c++
4,73315667,stl
5,73315667,sfinae
6,73315665,javascript
7,73315665,node.js
8,73315665,discord
9,73315665,discord.js


## Putting It All Together

In [28]:
def crawl_stackoverflow(pagenumber):
    html = requests.get(f'https://stackoverflow.com/questions?tab=newest&pagesize=50&page={pagenumber}')
    soup = BeautifulSoup(html.text)
    # ---- Titles ---- #
    all_question_title_elements = soup.select('.s-post-summary--content-title > a')
    all_question_titles = [element.text for element in all_question_title_elements]

    # ---- Excerpts ---- #
    all_question_excerpt_elements = soup.find_all('div', 's-post-summary--content-excerpt')
    all_question_excerpts = [element.text for element in all_question_excerpt_elements]

    # ---- Usernames ---- #
    all_question_username_elements = soup.select('.s-user-card--info > .s-user-card--link > a')
    all_question_usernames = [element.text for element in all_question_username_elements]

    # ---- Question IDs ---- #
    id_elements = soup.select('.s-post-summary--content > .s-post-summary--content-title > a')
    all_question_ids = [element['href'].split('/')[2] for element in id_elements]

    # ---- Question Stats ---- #
    stats_element = soup.select('.s-post-summary--stats')
    questions_upvotes = []
    questions_answers = []
    questions_views = []
    for el in stats_element:
        children = list(el.findChildren('div'))
        questions_upvotes.append(
            children[0].find('span').text
        )
        questions_answers.append(
            children[1].find('span').text
        )
        questions_views.append(
            children[2].find('span').text
        )

    # ---- Question Meta Tags ---- #
    meta_tags_container = soup.find_all('div', 's-post-summary--meta-tags')
    all_question_meta_tags = []
    for question_tag_container in meta_tags_container:
        temp = []
        for child in question_tag_container.findChildren('a'):
            temp.append(child.text)
        all_question_meta_tags.append(temp)


    # ---- Creating Dataframes ---- #
    questions_info_df = pd.DataFrame({
        'id': all_question_ids,
        'title': all_question_titles,
        'excerpt': all_question_excerpts,
        'username': all_question_usernames,
        'upvotes': questions_upvotes,
        'views': questions_views,
        'answers': questions_answers,
    })

    question_ids_multiplied = list(chain(
        *[[id]*len(meta_tags) for id, meta_tags in zip(all_question_ids, all_question_meta_tags)]
    ))      # Don't freak out!!! It's easy!!

    meta_tags_df = pd.DataFrame({
        'id': question_ids_multiplied,
        'meta_tag': list(chain(*all_question_meta_tags))
    })

    return questions_info_df, meta_tags_df
    

In [29]:
from tqdm import tqdm

n_pages = 100
question_info_df = []
meta_tags_df = []

for p in tqdm(range(1, n_pages+1)):
    q_df, mt_df = crawl_stackoverflow(p)
    question_info_df.append(q_df.copy())        # duplicated dataframes!!!
    meta_tags_df.append(mt_df.copy())
    # print(q_df.loc[:10, 'id'])
question_info_df = pd.concat(question_info_df)
meta_tags_df = pd.concat(meta_tags_df)

100%|██████████| 100/100 [03:30<00:00,  2.11s/it]


In [30]:
print(question_info_df.shape)
question_info_df.head()

(5000, 7)


Unnamed: 0,id,title,excerpt,username,upvotes,views,answers
0,73315669,How to check ! $ & present in password variabl...,\r\n I am writing shell script ...,Dhananjaya D N,0,3,0
1,73315667,How to implement a std::function with operator...,\r\n I'm learning to implement ...,SynchronicX,0,3,0
2,73315665,Promise to return instead console.log,\r\n I'm trying to do a Discord...,Lsyk4,0,2,0
3,73315663,FileSystemWatcher - The directory name does no...,\r\n I have a FileSystemWatcher...,Kurubaran,0,2,0
4,73315662,How to identify that accumulated metadata is a...,\r\n I saw that we can trigger ...,Davisson Paulino,0,2,0


In [31]:
print(meta_tags_df.shape)
meta_tags_df.head()

(15026, 2)


Unnamed: 0,id,meta_tag
0,73315669,bash
1,73315669,shell
2,73315669,script
3,73315667,c++
4,73315667,stl


In [32]:
print(question_info_df.duplicated().sum())
print(meta_tags_df.duplicated().sum())

39
120


In [33]:
question_info_df = question_info_df.drop_duplicates()
meta_tags_df = meta_tags_df.drop_duplicates()

In [34]:
print(question_info_df.excerpt.values[0])
question_info_df.excerpt = question_info_df.excerpt.replace(r'^\r\n\s+', '', regex=True) \
                                                   .replace(r'\r\n\s+$', '', regex=True)


                I am writing shell script which will validate entered password which should not accept ! $ & sign in password. I need to throw error messages. Kindly help me here.
Here problem occurring when I ...
            


In [35]:
question_info_df.dtypes     # <-- we need to do some cleaning

id          object
title       object
excerpt     object
username    object
upvotes     object
views       object
answers     object
dtype: object

In [36]:
question_info_df[['upvotes', 'views', 'answers']] = question_info_df[['upvotes', 'views', 'answers']].astype('int')
question_info_df.dtypes

id          object
title       object
excerpt     object
username    object
upvotes      int64
views        int64
answers      int64
dtype: object

In [37]:
question_info_df = question_info_df.set_index('id')
meta_tags_df = meta_tags_df.set_index('id')

In [38]:
question_info_df.to_csv('question_info.csv')
meta_tags_df.to_csv('meta_tags.csv')

## Some Data Analysis

In [39]:
popular_tags = meta_tags_df.value_counts().sort_values()[::-1]
popular_tags[:10]

meta_tag  
python        814
javascript    560
java          259
reactjs       257
c#            232
html          209
              162
css           161
pandas        160
node.js       155
dtype: int64