<h1 align='center'><font color ='brown'> Scrapping </font></h2>

In [1]:
import requests # for making HTTP requests to a web server
from bs4 import BeautifulSoup # for parsing HTML and XML documents
from itertools import zip_longest #  a function for grouping iterables into fixed-length chunks, filling in missing values with a specified fill value
import csv
import re # a module for working with regular expressions
import pandas as pd

### Make Get Request

In [2]:
request = requests.get('https://wuzzuf.net/search/jobs/?q=data+analyst&a=navbl')
# request
# A status code of 200 indicates that the request was successful and the server responded with the expected content.
if request.status_code == 200:
    print("Request successful!")
else:
    print(f"Request failed with status code {request.status_code}")

Request successful!


In [3]:
# This code accesses the HTML content of the web page that was returned by the server.
source = request.content
source[:200] #returns the first 200 characters of the HTML content.
             #This is a useful way to quickly inspect the structure and content of the HTML document.

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0, s'

In [4]:
# this code creates a BeautifulSoup object by passing the source variable as the first argument and specifying the parser to use as the second argument.
soup = BeautifulSoup(source, 'html.parser')
soup
"""By printing the soup variable, you can see the parsed HTML content of the web page, with the structure and tags represented as nested objects.
This can be useful for understanding the structure of the web page and identifying the relevant tags and attributes needed to extract specific information from the page."""

'By printing the soup variable, you can see the parsed HTML content of the web page, with the structure and tags represented as nested objects.\nThis can be useful for understanding the structure of the web page and identifying the relevant tags and attributes needed to extract specific information from the page.'

In [5]:
# The find_all() method returns a list of all matching elements, which is stored in the containers variable.
containers = soup.find_all('div', class_='css-1gatmva e1v1l3u10')
len(containers)

15

In [6]:
# This code accesses the first element in the list by indexing containers[0]
containers[0].find_all('h2', class_="css-m604qf")

[<h2 class="css-m604qf"><style data-emotion="css o171kl">.css-o171kl{-webkit-text-decoration:none;text-decoration:none;color:inherit;}</style><a class="css-o171kl" href="/jobs/p/aiRSu9WV14YS-Data-Analyst-ITCan-Cairo-Egypt?o=1&amp;l=sp&amp;t=sj&amp;a=data analyst|search-v3|navbl" rel="noreferrer" target="_blank">Data Analyst</a></h2>]

### Scrapping Job Title 

In [7]:
# The text attribute of this element is then accessed using `.text` to extract the text content of the <h2> element, which likely corresponds to the job title of the first job listing on the web page.
job_title=containers[0].find_all('h2', class_="css-m604qf")
job_title[0].text

'Data Analyst'

### Scrapping Company Location 

In [8]:
# extract company location of the first job listing on the web page.
company_location = containers[0].find_all('span', class_='css-5wys0k')
company_location[0].text

'New Cairo, Cairo, Egypt '

### Scrapping Company Name 

In [9]:
# extract company name of the first job listing on the web page.
company_name = containers[0].find_all('a', class_='css-17s97q8')
company_name[0].text

'ITCan -'

### Scrapping Job Skill

In [10]:
# extract job_skill of the first job listing on the web page.
job_skill = containers[0].find_all('div', class_='css-y4udm8')
job_skill[0].text

'Full TimeExperienced · 3 - 5 Yrs of Exp · Analyst/Research · BI · Analysis · analytical · Computer Science · Data Analysis · Development · Engineering · Information Technology (IT) · Power BI'

### Scrapping Job Type 

In [11]:
# extract job type of the first job listing on the web page.
job_type = containers[0].find_all('span', class_='css-1ve4b75 eoyjyou0')
job_type[0].text

'Full Time'

### Scrapping Time 

In [12]:
# extract time of the first job listing on the web page.
time = (containers[0].find_all('div', class_="css-4c4ojb")) + (soup.find_all('div', class_="css-do6t5g"))
time[0].text

'8 days ago'

### For now, we have extracted elements we need in one container/box. <font color='green'>We **need to scrape all containers and save all data in CSV file**.</font>

### Scraping whole page

In [13]:
import requests
from bs4 import BeautifulSoup
import csv

# Make a GET request to the URL
url = 'https://wuzzuf.net/search/jobs/?q=data+analyst&a=navbl'
response = requests.get(url)

# Create a BeautifulSoup object from the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the job containers on the page
job_containers = soup.find_all('div', class_='css-1gatmva e1v1l3u10')

# Create empty lists to store the job information
job_titles = []
company_names = []
company_locations = []
job_skills = []
job_types = []
job_posted_times = []

# Open the CSV file and write the header row
with open('data analyst.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['job_titles', 'company_locations', 'company_names', 'job_skills', 'job_types', 'time'])
    
    # Loop through each job container and extract the job information
    for container in job_containers:
        # Extract the job title
        title = container.find('h2', class_='css-m604qf').text
        job_titles.append(title)

        # Extract the company name and location
        company_name = container.find_all('a', class_='css-17s97q8')[0].text.strip()
        company_names.append(company_name)
        
        # Extract the company location
        try:
            company_location = container.find_all('span', class_='css-5wys0k')[0].text.strip()
        except IndexError:
            company_location = 'nan'
        company_locations.append(company_location)

        # Extract the job skills
        skills = container.find('div', class_='css-y4udm8').text.strip()
        job_skills.append(skills)

        # Extract the job type
        job_type = container.find('span', class_='css-1ve4b75 eoyjyou0').text.strip()
        job_types.append(job_type)

        # Extract the job posted time
        try:
            job_posted_time = ((container.find_all('div', class_='css-4c4ojb')) + (soup.find_all('div', class_='css-do6t5g')))[0].text.strip()
        except IndexError:
            job_posted_time = 'nan'
        job_posted_times.append(job_posted_time)
        
        # Write the job information to the CSV file
        writer.writerow([title, company_location, company_name, skills, job_type, job_posted_time])


In [14]:
data = pd.read_csv('data analyst.csv')
data.head(15)

Unnamed: 0,job_titles,company_locations,company_names,job_skills,job_types,time
0,Data Analyst,"New Cairo, Cairo, Egypt",ITCan -,Full TimeExperienced · 3 - 5 Yrs of Exp · Anal...,Full Time,8 days ago
1,Data Analyst,"Sheraton, Cairo, Egypt",London International Patient Services -,Full TimeEntry Level · 1 - 2 Yrs of Exp · IT/S...,Full Time,8 days ago
2,Data Analyst,"Dokki, Giza, Egypt",Safa International Travel -,Full TimeExperienced · 3 - 5 Yrs of Exp · Busi...,Full Time,8 days ago
3,Data Analyst,"Maadi, Cairo, Egypt",AL-Matar -,Full TimeEntry Level · 2 - 4 Yrs of Exp · IT/S...,Full Time,8 days ago
4,Senior Data Analyst,"Smart Village, Giza, Egypt",Fawry for Banking Technology and Electronic Pa...,Full TimeExperienced · 2 - 5 Yrs of Exp · Busi...,Full Time,8 days ago
5,Electronic Data Interchange (EDI) Analyst (Flu...,"Cairo, Egypt",FlairsTech -,Full TimeExperienced · 1 - 3 Yrs of Exp · IT/S...,Full Time,8 days ago
6,Medical Data Analyst-Private Hospital Background,"New Cairo, Cairo, Egypt",Confidential -,Full TimeExperienced · 3 - 5 Yrs of Exp · Medi...,Full Time,4 hours ago
7,Data Analyst,"Katameya, Cairo, Egypt",Othaim -,Full TimeExperienced · 3 - 15 Yrs of Exp · IT/...,Full Time,8 days ago
8,Data Analyst,"Heliopolis, Cairo, Egypt",Confidential -,Full TimeExperienced · 2 - 6 Yrs of Exp · Busi...,Full Time,8 days ago
9,Data Analyst,"Zamalek, Cairo, Egypt",Al Ahly capital holding - Al Ahly Tamkeen -,Full TimeExperienced · 1 - 3 Yrs of Exp · IT/S...,Full Time,8 days ago


In [15]:
data.shape

(15, 6)

### Scraping multiple pages

In [16]:
# Make a GET request to the URL
url = 'https://wuzzuf.net/search/jobs/?q=data+analyst&a=navbl'
response = requests.get(url)

# Create a BeautifulSoup object from the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the job containers on the page
job_containers = soup.find_all('div', class_='css-1gatmva e1v1l3u10')

# Create empty lists to store the job information
job_titles = []
company_names = []
company_locations = []
job_skills = []
job_types = []
job_posted_times = []

# Open the CSV file and write the header row
with open('all data analyst jobs.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['job_titles', 'company_locations', 'company_names', 'job_skills', 'job_types', 'time'])
    
    for i in range(1,15):
        url = f'https://wuzzuf.net/search/jobs/?q=data+analyst&a=navbl={i}'
        response = requests.get(url)


        containers = soup.find_all('div', {'class':'resItemBox resItemBoxBooks exactMatch'})

    
        # Loop through each job container and extract the job information
        for container in job_containers:
            # Extract the job title
            title = container.find('h2', class_='css-m604qf').text
            
            job_titles.append(title)

            # Extract the company name and location
            company_name = container.find_all('a', class_='css-17s97q8')[0].text.strip()
            company_names.append(company_name)

            # Extract the company location
            try:
                company_location = container.find_all('span', class_='css-5wys0k')[0].text.strip()
            except IndexError:
                company_location = 'nan'
            company_locations.append(company_location)

            # Extract the job skills
            skills = container.find('div', class_='css-y4udm8').text.strip()
            job_skills.append(skills)

            # Extract the job type
            job_type = container.find('span', class_='css-1ve4b75 eoyjyou0').text.strip()
            job_types.append(job_type)

            # Extract the job posted time
            try:
                job_posted_time = ((container.find_all('div', class_='css-4c4ojb')) + (soup.find_all('div', class_='css-do6t5g')))[0].text.strip()
            except IndexError:
                job_posted_time = 'nan'
            job_posted_times.append(job_posted_time)

            # Write the job information to the CSV file
            writer.writerow([title, company_location, company_name, skills, job_type, job_posted_time])


In [17]:
df = pd.read_csv('all data analyst jobs.csv')
df.shape

(210, 6)

### Let's see data

In [18]:
df.head()

Unnamed: 0,job_titles,company_locations,company_names,job_skills,job_types,time
0,Data Analyst,"New Cairo, Cairo, Egypt",ITCan -,Full TimeExperienced · 3 - 5 Yrs of Exp · Anal...,Full Time,8 days ago
1,Data Analyst,"Sheraton, Cairo, Egypt",London International Patient Services -,Full TimeEntry Level · 1 - 2 Yrs of Exp · IT/S...,Full Time,8 days ago
2,Data Analyst,"Dokki, Giza, Egypt",Safa International Travel -,Full TimeExperienced · 3 - 5 Yrs of Exp · Busi...,Full Time,8 days ago
3,Data Analyst,"Maadi, Cairo, Egypt",AL-Matar -,Full TimeEntry Level · 2 - 4 Yrs of Exp · IT/S...,Full Time,8 days ago
4,Senior Data Analyst,"Smart Village, Giza, Egypt",Fawry for Banking Technology and Electronic Pa...,Full TimeExperienced · 2 - 5 Yrs of Exp · Busi...,Full Time,8 days ago


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   job_titles         210 non-null    object
 1   company_locations  210 non-null    object
 2   company_names      210 non-null    object
 3   job_skills         210 non-null    object
 4   job_types          210 non-null    object
 5   time               210 non-null    object
dtypes: object(6)
memory usage: 10.0+ KB


### We can remove `-` from Company Names column

In [20]:
# Define a lambda function to remove hyphens from company_names column
remove_hyphen = lambda x: x.replace('-', '')

# Apply the lambda function to the company_names column
df['company_names'] = df['company_names'].apply(remove_hyphen)

# Print the updated dataframe
df.head()

Unnamed: 0,job_titles,company_locations,company_names,job_skills,job_types,time
0,Data Analyst,"New Cairo, Cairo, Egypt",ITCan,Full TimeExperienced · 3 - 5 Yrs of Exp · Anal...,Full Time,8 days ago
1,Data Analyst,"Sheraton, Cairo, Egypt",London International Patient Services,Full TimeEntry Level · 1 - 2 Yrs of Exp · IT/S...,Full Time,8 days ago
2,Data Analyst,"Dokki, Giza, Egypt",Safa International Travel,Full TimeExperienced · 3 - 5 Yrs of Exp · Busi...,Full Time,8 days ago
3,Data Analyst,"Maadi, Cairo, Egypt",ALMatar,Full TimeEntry Level · 2 - 4 Yrs of Exp · IT/S...,Full Time,8 days ago
4,Senior Data Analyst,"Smart Village, Giza, Egypt",Fawry for Banking Technology and Electronic Pa...,Full TimeExperienced · 2 - 5 Yrs of Exp · Busi...,Full Time,8 days ago
