# Parser Project Presentation
<hr>

In [None]:
from Project.Parser.parser import Parser

In [None]:
# the page we are going to parse in the first part of this presentation
URL = r"https://www.tmforum.org/membership/current-members/"

# create the Parser object bound to that URL;
# the same "parser" instance is reused by the cells below
parser = Parser(url=URL)

In [None]:
# the parser extracts the domain name from the given URL
# and exposes it through the "domain_name" attribute
print(parser.domain_name)

In [None]:
# we can check the request status before parsing anything;
# as the last expression of the cell, whatever the call returns is displayed by the notebook
parser.check_request_status()

### Parsing websites
<hr>

In [None]:
# parse the website and, via "save_to_file", also save its HTML to a file
# to avoid sending too many requests to the server
# NOTE(review): the original comment said a preferred parser can be defined here,
# but no such argument is passed in this call - confirm against Parser.parse
filename = "parsing_result_1.html"

parser.parse(save_to_file=filename)

In [None]:
# the parsed data is also kept, in a clean HTML form, in the "content" attribute
# of the Parser object, so it can be used separately from the saved file
print(parser.content)

In [None]:
# we can read back the file with our parsing results
# ("filename" is the file written by the parse call above)
website_content = parser.read_from_file(filename)

print(website_content)

### Parsing content with BeautifulSoup logic
<hr>

In [None]:
# the Parser class can search the parsed content either with the BeautifulSoup
# "find" / "find_all" methods or with XPath logic

# attributes identifying the list items we are interested in
member_attrs = {"class": "span4 tmf-current-members-column"}

# "find_all=True" asks for every matching element instead of one specific element
rows = parser.search_html("li", member_attrs, find_all=True)

# let's check how many elements we've got
row_count = len(rows)
print(row_count)

In [None]:
# take a look at the first three elements
preview_rows = rows[:3]
for row in preview_rows:
    print(row)

In [None]:
# first, collect the company names: the stripped text of every row
company_names = [row.text.strip() for row in rows]

# show the first five names
for company in company_names[:5]:
    print(company)

In [None]:
# secondly, collect the URL stored in the "href" of each row's anchor tag
links = [row.a["href"] for row in rows]

# show the first five URLs
for href in links[:5]:
    print(href)

In [None]:
# build the final URLs by prefixing each link with the "domain_name" attribute
domain = parser.domain_name
links = [f"{domain}{href}" for href in links]

# show the first five joined URLs
for full_url in links[:5]:
    print(full_url)

In [None]:
# finally, crawl the first three links to retrieve the company websites
# the XPath pattern has to be defined beforehand
first_links = links[:3]
websites = parser.crawl(first_links, xpath_pattern="//p/a/@href")

for site in websites:
    print(site)

### Parsing content with XPath logic
<hr>

In [None]:
# let's repeat everything, but this time with XPath logic
# the XPath pattern has to be defined beforehand
name_pattern = "//li[@class='span4 tmf-current-members-column']/a/text()"

# find the company names and clean them in one go
names = [raw_name.strip() for raw_name in parser.search_xpath(name_pattern)]

# let's check how many names we've got
print(len(names))

In [None]:
# take a look at the first five names
for company in names[:5]:
    print(company)

In [None]:
# next, find the needed URLs: the "href" attribute of each member's anchor tag
link_pattern = "//li[@class='span4 tmf-current-members-column']/a/@href"
links = parser.search_xpath(link_pattern)

# show the first five of them
for href in links[:5]:
    print(href)

In [None]:
# prefix every (stripped) link with the domain name
domain = parser.domain_name
links = [f"{domain}{href.strip()}" for href in links]

# show the first five joined URLs
for full_url in links[:5]:
    print(full_url)

In [None]:
# finally, crawl the first three links
# this time the "sleep_timer" argument of "crawl" is set as well
first_links = links[:3]
websites = parser.crawl(first_links, xpath_pattern="//p/a/@href", sleep_timer=2)

for site in websites:
    print(site)

In [None]:
# also, there's an option to retry the last operation performed by Parser (for instance, in case of errors)
# all we need is to use the "retry" method
# the result is stored in "websites" so that the loop below prints the retried output
# (previously it was bound to "website" and the loop re-printed the stale result of the previous cell)
# NOTE(review): assumes "retry" returns the same kind of iterable as the retried "crawl" call - confirm
websites = parser.retry()

for site in websites:
    print(site)

### Using random headers
<hr>

In [None]:
# Parser can randomize its headers upon its initialization
# all the headers are stored in the "headers.json" file, which comes with the package
# create five Parser instances and print the User-Agent each of them ended up with
for _ in range(5):
    parser = Parser(url=URL, random_headers=True)
    user_agent = parser.headers["User-Agent"]
    print(user_agent)

In [None]:
# headers of an existing parser can be changed as well
# start from a new Parser instance created with random headers switched off
parser = Parser(random_headers=False, url=URL)
default_agent = parser.headers["User-Agent"]
print(default_agent)

# the "change_headers" method replaces the headers of this instance
parser.change_headers()
changed_agent = parser.headers["User-Agent"]
print(changed_agent)

# Parser also supports mobile user agents
# NOTE(review): this call is identical to the previous one - confirm whether
# "change_headers" needs an argument to request a mobile user agent
parser.change_headers()
changed_agent = parser.headers["User-Agent"]
print(changed_agent)

### One more Parser test
<hr>

In [None]:
# a fresh test from scratch: this time the user agents listed at the URL below are parsed
URL = r"https://www.useragents.me/"

# new Parser instance with random headers
parser = Parser(URL, random_headers=True)

domain = parser.domain_name
print(domain)
parser.check_request_status()

# parse the website; no "save_to_file" is given, so the HTML is not written to a file
content = parser.parse()

# the text of the matching textarea elements is extracted with XPath
xpath_pattern = r"//div/textarea[@class='form-control ua-textarea']/text()"
user_agents = parser.search_xpath(xpath_pattern)

# finally, print every extracted entry
for agent in user_agents:
    print(agent)

### Final Parser test
<hr>

In [None]:
# here's another test for my Parser class
# this time we'll be parsing the board members listed at the following URL
URL = r"https://www.vabio.org/about/board-members/"

parser = Parser(URL, random_headers=True)

parser.parse()

# retrieve the needed section using BeautifulSoup logic
elements = parser.search_html("section", {"class": "team"})

# get the list of full names
names = [tag.text for tag in elements.find_all("span", {"class": "member-name"})]

# get the list of people's job titles
titles = [tag.text for tag in elements.find_all("span", {"class": "member-title"})]

# get the list of company names
companies = [tag.text for tag in elements.find_all("span", {"class": "member-company"})]

# and get the list of personal LinkedIn profiles using XPath logic
linkedin_links = parser.search_xpath("//div[@class='member-info']/a/@href")

# combine the four lists into one record per person
# zip() pairs the lists positionally and stops at the shortest one
# NOTE(review): the lists are assumed to be aligned (one entry of each kind per member) - confirm
result = [
    {
        "name": name,
        "title": title,
        "company_name": company,
        "linkedin_url": linkedin_url,
    }
    for name, title, company, linkedin_url in zip(names, titles, companies, linkedin_links)
]

# finally, save the result to a JSON file
parser.to_json("parsing_result_2.json", result)

# the end result can be checked in the "parsing_result_2.json" file

## Thank you for your attention!