In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests

# Extracting questions from w3resource to a CSV file

## Extracting title with BeautifulSoup

Some features that make BeautifulSoup a powerful solution are:
- It provides a lot of simple methods and Pythonic idioms for navigating, searching, and modifying a DOM tree. It doesn't take much code to write an application.
- Beautiful Soup sits on top of popular Python parsers like lxml and html5lib, allowing you to try out different parsing strategies or trade speed for flexibility.

Basically, BeautifulSoup can parse anything on the web you give it.

In [12]:
# Download the exercise page. The timeout stops the cell from hanging forever
# on an unresponsive server, and raise_for_status() turns an HTTP error
# response into an exception instead of letting an error page be parsed below.
page = requests.get('https://www.w3resource.com/python-exercises/string/', timeout=30)
page.raise_for_status()

In [13]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

## We can look at the body to find the tags that hold the required information

In [6]:
# Show the <body> element so the tags holding the questions can be located.
soup.find('body')

<body>
<style type="text/css">
article a {
text-decoration: none	
}
.mdl-menu {
min-width: 1024px;	
}
.mdl-menu__item {
height: 24px;
line-height: 24px;
font-size: 1em;
padding: 3px	
}
/*.mdl-menu__item {
height: 48px;
width:48px;
}*/
.header_notice a{
color: #fff
}
ul.nav.nav-list li {
font-size: 1em	
}
.mdl-layout__drawer .mdl-navigation .mdl-navigation__link {
color: #2a69a8;	
font-weight: 600;
font-size: 1.2em
}
.material-icons.home_link { font-size: 48px; }
#drawer_menu_topic_head1,#drawer_menu_topic_head2,#drawer_menu_topic_head3,#drawer_menu_topic_head4,
#drawer_menu_topic_head5,#drawer_menu_topic_head6,#drawer_menu_topic_head7 {
color: #ff6a01;
font-size: 1.5em;
font-weight: 700
}
.mdl-layout-title a {
color: #fff
}
@media screen and (max-width: 1024px) {	
	.drawer_logo {display: none}
}
@media screen and (max-width: 568px) {
#sidebar_right {
display: none
}
}	
@media screen and (max-width: 768px) {
ul.nav.nav-list {
display: none
}
}	

.w3r_donate_link {
font-weight: 800
}
.w3

In [14]:
# Select every <p> tag: the questions are stored in the text of <p> elements.

p_tags = soup.select('p')
p_tags

[<p>Python has a built-in string class named "str" with many useful features. String literals can be enclosed by either single or double, although single quotes are more commonly used.</p>,
 <p><strong>You may read our <a href="https://www.w3resource.com/python/python-string.php" target="_blank">Python string</a> tutorial before solving the following exercises.</strong></p>,
 <p class="heading">[<em>An editor is available at the bottom of the page to write and execute the scripts.</em>] </p>,
 <p><strong>1.</strong> Write a Python program to calculate the length of a string. <a href="#EDITOR">Go to the editor</a><br/>
 <a href="python-data-type-string-exercise-1.php" target="_blank">Click me to see the sample solution</a></p>,
 <p><strong>2.</strong> Write a Python program to count the number of characters (character frequency) in a string. <a href="#EDITOR">Go to the editor</a>
 <br/>
 Sample String : google.com'<br/>
 Expected Result : {'g': 2, 'o': 3, 'l': 1, 'e': 1, '.': 1, 'c': 1,

In [15]:
# Collect the question sentence from every paragraph that holds an exercise.
# Exercise paragraphs read "<n>. Write a Python program ... . Go to the editor",
# so the question is the text between the first and the second period.

final = []

for tag in p_tags:
    text = tag.getText()
    if 'Write' not in text:
        continue
    parts = text.split('.')
    # A paragraph that mentions "Write" but contains no period has no numbered
    # question to extract; skip it instead of raising IndexError on parts[1].
    if len(parts) < 2:
        continue
    # strip() drops the space left behind after the "<n>." prefix.
    final.append(parts[1].strip())

## Creating Dataframe to store the Questions

In [16]:
# Single-column table of questions with 1-based row labels.
dct = {'Questions': final}
question_numbers = np.arange(1, len(final) + 1)
df = pd.DataFrame(data=dct, index=question_numbers)

# Preview the first three rows.
df.head(3)

Unnamed: 0,Questions
1,Write a Python program to calculate the lengt...
2,Write a Python program to count the number of...
3,Write a Python program to get a string made o...


## Extract Title and save the contents to the csv file

In [17]:
# Extract the page title to use it as the name of the CSV file. soup.title is
# None when the page has no <title> element, so fall back to an empty title
# instead of raising AttributeError on `.text`.
pagetitle = soup.title.text if soup.title is not None else ''


# Normalise the title: collapse every run of whitespace (including newlines)
# into a single space.
title = ' '.join(pagetitle.split())


# Clean the title: drop the separators '-', ',' and ':' together with the
# characters that are not allowed in Windows file names, then use underscores
# in place of spaces.
unwanted_chars = '-,:' + '/\\|?*<>"'
fname = ''.join(ch for ch in title if ch not in unwanted_chars)
fname = fname.replace(' ', '_')
# An empty stem would produce the hidden file ".csv"; use a generic name then.
fname = fname or 'questions'


# Prepare the new file name.
file_name = fname + '.csv'


# Save the questions under the title-derived name (the path is relative, so
# the file lands in the current working directory).
df.to_csv(file_name)


# Creating a small application for the above extraction program


In [None]:
import pyttsx3
from tkinter import *
import os

# Main application window: fixed 450x300 size with a light-blue background.
window = Tk()

window.geometry('450x300')
window.resizable(0,0)  # disallow resizing in both directions

window.title('Extract Questions from w3resource for practice')
window.configure(bg='AliceBlue')

# Text-to-speech engine; within this cell it is only referenced by extract().
engine = pyttsx3.init()

# Title label at the top of the window.
Label(window,text='Questions to CSV',font='Arial 20 bold',bg='AliceBlue',fg='Red').pack()

# Prompt label and the entry box for the link. `msg` is bound to the entry box
# so that reset() can clear it.
msg = StringVar()
Label(window, text= 'Enter link', font='Arial 15 bold',bg='AliceBlue').place(x=40,y=60)
entry_field = Entry(window, textvariable=msg, width=60)
entry_field.place(x=40,y=100,height=25)

def _questions_from_paragraphs(p_tags):
    """Return the question sentence from every <p> tag that holds an exercise."""
    questions = []
    for tag in p_tags:
        text = tag.getText()
        # Exercise paragraphs read "<n>. Write a Python program ...", so the
        # question is the text between the first and the second period.
        parts = text.split('.')
        # Paragraphs that mention "Write" but contain no period are skipped:
        # indexing parts[1] on them would raise IndexError.
        if 'Write' in text and len(parts) > 1:
            # strip() drops the space left behind after the "<n>." prefix.
            questions.append(parts[1].strip())
    return questions


def _csv_name_from_title(pagetitle):
    """Turn a page title into a file name ending in '.csv'."""
    # Collapse every run of whitespace (including newlines) into one space.
    title = ' '.join(pagetitle.split())
    # Drop the separators '-', ',' and ':' together with the characters that
    # are not allowed in Windows file names.
    unwanted_chars = '-,:' + '/\\|?*<>"'
    fname = ''.join(ch for ch in title if ch not in unwanted_chars)
    fname = fname.replace(' ', '_')
    # An empty stem would produce the hidden file ".csv".
    return (fname or 'questions') + '.csv'


def extract():
    """Scrape the questions from the entered link and save them to a CSV file.

    Reads the URL from ``entry_field``, downloads the page, collects the
    question text from its <p> tags and writes it to a CSV file named after
    the page title (relative path, i.e. in the current working directory).

    An empty link, a failed download or a page without questions is reported
    in a dialog, because an exception raised inside a button callback would
    otherwise only show up as a traceback on the console.
    """
    # messagebox is a submodule and is not pulled in by `from tkinter import *`.
    from tkinter import messagebox

    url = entry_field.get().strip()
    if not url:
        messagebox.showwarning('Questions to CSV', 'Please enter a link first.')
        return

    try:
        # The timeout keeps the window from freezing forever on a dead host;
        # raise_for_status() rejects HTTP error pages before they are parsed.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        messagebox.showerror('Questions to CSV',
                             'Could not download the page:\n{}'.format(err))
        return

    soup = BeautifulSoup(page.content, 'html.parser')

    # The questions are stored in the text of the <p> tags.
    final = _questions_from_paragraphs(soup.select('p'))
    if not final:
        messagebox.showwarning('Questions to CSV',
                               'No questions were found on that page.')
        return

    # Single-column table of questions with 1-based row labels.
    dct = {'Questions': final}
    df = pd.DataFrame(dct, index=np.arange(1, len(final) + 1))

    # soup.title is None when the page has no <title> element.
    pagetitle = soup.title.text if soup.title is not None else ''
    file_name = _csv_name_from_title(pagetitle)

    df.to_csv(file_name)

    # NOTE(review): nothing visible in this cell queues speech on the engine,
    # so this call looks like a no-op - confirm before removing it.
    engine.stop()

    messagebox.showinfo('Questions to CSV',
                        'Saved {} question(s) to {}'.format(len(final), file_name))

def exit():
    """Destroy the main window (callback of the 'Exit' button)."""
    # NOTE(review): this name shadows the built-in exit(); it is kept because
    # the 'Exit' button is wired to it by name.
    window.destroy()

def reset():
    """Clear the link entry box (callback of the 'Reset' button)."""
    # `msg` is the StringVar bound to entry_field, so emptying it clears the box.
    msg.set('')
    
# Action buttons, laid out left to right on one row.
Button(window,text='Extract',font='Arial 15 bold', command=extract,bg='Orange').place(x=60,y=150)
Button(window,text='Reset',font='Arial 15 bold', command=reset,bg='Orange').place(x=180,y=150)
Button(window,text='Exit',font='Arial 15 bold', command=exit,bg='Orange').place(x=310,y=150)

# Usage note at the bottom of the window.
Label(window, text= 'NOTE: This App is strictly for w3resource.com', font='Arial 12 bold',bg='AliceBlue').place(x=40,y=260)

# Start the Tk event loop.
window.mainloop()