In [81]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup as bsp
from selenium import webdriver

import re
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [26]:
# for testing
browser = webdriver.Chrome()
browser.get("https://wuzzuf.net/jobs/careers/Nakheel-Egypt-85138")
html_content = browser.page_source
jsoup = bsp(html_content, "html.parser")
browser.quit()

In [44]:
def nullify(test):
    if re.compile(r'\b\s*not specified\s*\b', re.IGNORECASE).search(test) or re.compile(r'\b\s*confidential\s*\b', re.IGNORECASE).search(test):
        return True
    return False

In [45]:
def get_title(soup):
    title = soup.find("h1", class_ = "css-f9uh36")
    return title.text

In [46]:
def get_description(soup):
    description = soup.select(".css-ghicub > .css-1uobp1k")
    description = re.sub(r'([a-z,.,:])([A-Z])', r'\1 \2', description[0].text)
    return description

In [None]:
def get_requirements(soup):
    reqs = soup.select(".css-ghicub > .css-1t5f0fr")  #css-1t5f0fr
    if len(reqs)>0:
        reqs = re.sub(r'([a-z,.,:])([A-Z])', r'\1 \2', reqs[0].text)
        return reqs
    return None
#     if len(reqs)>0:
#         return reqs[0].text
#     return None
#     if len(reqs):
#         listings = reqs[0].find_all('li')
#         if len(listings):
#             requirements = ""
#             for req in listings:
#                 requirements += "\n" + req.text
#             return requirements[1:]
#         else:
#             listings = reqs[0].find_all("p")
#             if len(listings):
#                 requirements = ""
#                 for req in listings:
#                     requirements += "\n" + req.text
#                 return requirements[1:]
#     else:
#         return None
print(get_requirements(jsoup))

In [48]:
def get_date(soup):  # it has to exist
    jdate = soup.find("span", class_ = "css-182mrdn")
    date = jdate.text.split(' ')
    current_date = datetime.now()
    if date[2] in ('minute', 'minutes', 'hour', 'hours'):
        posted_date = current_date
    elif date[2] == 'day' or date[2] == 'days':
        posted_date = current_date - relativedelta(days=int(date[1]))
    else:
        posted_date = current_date - relativedelta(months = int(date[1]))
    
    return posted_date.strftime('%Y-%m-%d')

In [49]:
def get_views(soup):
    views = soup.find_all(class_ = "css-bs44nc")
    if len(views):
        return (int(views[0].text), int(views[1].text))
    else:
        return (None, None)

In [None]:
def get_details(soup):
    details = soup.select(".css-47jx3m > .css-4xky9y")
    # experience
    
    pattern = r'\s*\d+\s*' #this is a number
    
    # Loop through the list of strings and extract numbers
    for s in details[0]:
        match = re.search(pattern, s)
        if match:
            number_str = match.group().strip()  # Remove leading/trailing spaces
            number = int(number_str)
            break
    
    if nullify(details[0].text):
        experience_min = None
        experience_max = None
    else:
        exp = details[0].text.split(' ')
        pattern = r'\s*\d+\s*'
        
        numbers = []

        for s in exp:
            match = re.search(pattern, s)
            if match:
                number_str = match.group().strip()  # Remove leading/trailing spaces
                number = int(number_str)
                numbers.append(number)
        if len(numbers) > 1:  
            experience_min = int(exp[0])
            experience_max = int(exp[2])
        elif re.compile(r'\b\s*more\s*\b', re.IGNORECASE).search(exp[0]):
            experience_min = numbers[0]
            experience_max = None
        elif re.compile(r'\b\s*less\s*\b', re.IGNORECASE).search(exp[0]):
            experience_min = None
            experience_max = numbers[0]
        else:
            experience_min = None
            experience_max = None

    # career_level
    career_level = details[1].text
    if nullify(career_level):
        career_level = None
        
    # education_level
    education_level = details[2].text
    if nullify(education_level):
        education_level = None
    
    # gender
    if len(details) == 5:
        if re.compile(r'\b\s*male\s*\b', re.IGNORECASE).search(details[3].text):
            gender = 'M'
        else:
            gender = 'F'
        del details[3]
    else:
        gender = None
        
    
    # salary
    salary = details[3].text
    if nullify(salary):
        salary_min = None
        salary_max = None
        hidden_salary = True
    else:
        salary = salary.split(' ')
        salary_min = int(salary[0])
        salary_max = int(salary[2])
        hidden_salary = False
    return (experience_min, experience_max, career_level, education_level, salary_min, salary_max, hidden_salary, gender)
get_details(jsoup)

In [51]:
def get_vacancies(soup):
    vacancies = soup.find("span", class_ = "css-ixb653")
    vacancies = vacancies.text.split(' ')
    
    # Regular expression pattern to match integers with optional leading/trailing spaces
    pattern = r'\s*\d+\s*'
    
    # Loop through the list of strings and extract numbers
    for s in vacancies:
        match = re.search(pattern, s)
        if match:
            number_str = match.group().strip()  # Remove leading/trailing spaces
            number = int(number_str)
            break
    return number

In [None]:
def get_company_name(soup):
    company = soup.find('a', class_ = "css-p7pghv")
    if company is None:
        return "Confidential"
    return company.text
print(get_company_name(jsoup))

In [53]:
# job_posting
def job_posting(jsoup):
    dic={}
#     try:
    job_title = get_title(jsoup)
    dic["job_title"] = job_title
#     print(job_title)

    description = get_description(jsoup)
#     print(description)
    dic["description"] = description

    requirements = get_requirements(jsoup)
#     print(requirements)
    dic["requirements"] = requirements

    post_date = get_date(jsoup)
#     print(post_date)
    dic["post_date"] = post_date

    (viewed, considered) = get_views(jsoup)
    dic["viewed"] = viewed
    dic["considered"] = considered
    (experience_min, experience_max, career_level, education_level, salary_min, salary_max, hidden_salary, gender) = get_details(jsoup)
    # print(experience_min, experience_max, career_level, education_level, salary_min, salary_max, hidden_salary, gender)
    dic["experience_min"] = experience_min
    dic["experience_max"] = experience_max
    dic["career_level"] = career_level
    dic["education_level"] = education_level
    dic["salary_min"] = salary_min
    dic["salary_max"] = salary_max
    dic["hidden_salary"] = hidden_salary
    dic['gender'] = gender

    vacancies = (get_vacancies(jsoup))
#     print(vacancies)
    dic["vacancies"] = vacancies

    company = get_company_name(jsoup)
#     print(company)
    dic["company"] = company
    return dic

In [54]:
def JPSoup(browser, href):
    browser.get(href)
    html_content = browser.page_source
    jsoup = bsp(html_content, "html.parser")
    return jsoup

In [55]:
def get_type(soup):
    jtype = soup.find("div", class_ = "css-11rcwxl")
    jtype = jtype.find_all("span", class_ = "css-ja0r8m eoyjyou0")
    return jtype
# print(get_type(jsoup)[0].text)

In [56]:
def get_category(soup):
    category = soup.find("div", class_ = "css-13sf2ik").find("ul", class_ = "css-h5dsne")
    category = category.find_all("span", class_ = "css-158icaa")
    return category

In [57]:
def get_skill(soup):
    skill = soup.find("div", class_ = "css-s2o0yh").find_all("a", class_ = "css-g65o95")
    return skill

In [None]:
def cname(soup):
    company = soup.find('h1', class_ = "css-12s37jy")
    return company.text.strip()
print(cname(jsoup))

In [59]:
def cprofile(soup):
    profile = soup.find('div', id = 'profile-section').find('div').find_all('span', class_ ="css-1xhj18k")
    attributes = ['Location', 'Founded', 'Industry', 'Company Size']
    data = {}
    sector = []
    for p in profile:
        box = p.select('span')
        att = box[0].text.strip()[:-1]
        if att == 'Location':
            info = box[1].text.strip().split(',')
            country = info[-1].strip()
            if len(info) > 1:
                city = info[-2].strip()
            else:
                city = None
            data['country'] = country
            data['city'] = city
            
        elif att == 'Founded':
            info = box[1].text.strip()
            year = str(info)
            data['found_date'] = year + '-01-01'
            
        elif att == 'Company Size':
            info = box[1].text.split(' ')
            numbers = []
            for i in info:
                match = re.search(r'\s*\d+\s*', i)
                if match:
                    number_str = match.group().strip()  # Remove leading/trailing spaces
                    number = int(number_str)
                    numbers.append(number)
            if len(numbers) > 1:  
                size_min = numbers[0]
                size_max = numbers[1]
            elif re.compile(r'\b\s*more\s*\b', re.IGNORECASE).search(info[0]):
                size_min = numbers[0]
                size_max = None
            elif re.compile(r'\b\s*less\s*\b', re.IGNORECASE).search(info[0]):
                size_min = None
                size_max = numbers[0]
            else:
                size_min = None
                size_max = None
            data['size_min'] = size_min
            data['size_max'] = size_max
        
        elif att == "Industry":
            info = box[1].select('a')
            for i in info:
                sector.append(i.text)
    return (data, sector)
print(cprofile(jsoup))

({'country': 'Egypt', 'city': 'Cairo', 'found_date': '2008-01-01', 'size_min': None, 'size_max': None}, ['Computer Software', 'Information Technology Services'])


In [60]:
def cdetails(soup):
    details = soup.find('div', id = 'profile-section')
    if len(details):
        details = details.find('p')
        if details:
            return details.text
    return None
print(cdetails(jsoup))

ICIS is an Information technology company in Cairo/ Egypt and it is a branch of ICIS Company in Riyadh/ Saudi Arabia which serving the Saudi market since 2008,
It provides enterprise software solutions, IT consulting, Systems Security Management, Data Masking, and many other IT products and services,
ICIS is the Golden Oracle Partner. 


In [62]:
page_number = 0
concat = ""
page_link = "https://wuzzuf.net/search/jobs/?a=hpb&filters%5Broles%5D%5B0%5D=IT%2FSoftware%20Development&q="
links = set()
link_name = {}
link_name['company'] = 'Confidential'
dic_company = {}
dic_company['company'] = 'Confidential'
dic_sector = {}
errors = []
cerrors = []
terrors = []
caterrors = []
serrors = []

job_posting_data = pd.DataFrame()
job_posting_type_data = pd.DataFrame()
job_posting_category_data = pd.DataFrame()
job_posting_skill_data = pd.DataFrame()

company_data = pd.DataFrame()
company_sector_data = pd.DataFrame()
# fake one for confidential company

for page_number in range(130):
    browser = webdriver.Chrome()
    jobssoup = JPSoup(browser, page_link+concat)
    jobs_locations = jobssoup.select(".css-5wys0k")  
    jobs_links = jobssoup.find_all("h2", class_ = "css-m604qf")


    # jloc = jobs_locations[0]
    # jlink = jobs_links[0]
    # print(jlink, jloc)
    # soup = JPSoup(browser, jlink.a.get('href'))
    # job_title = get_title(soup)
    # print(jsoup.find(class_ = "css-9iujih").find('a').get('href'))

    i = 1
    for jloc, jlink in zip(jobs_locations, jobs_links):
        jloc = jloc.text
        jloc = jloc.split(', ')
        country = jloc[-1][:-1]
        city = jloc[-2]
        if len(jloc) > 2:
            district = jloc[0] 
        else:
            district = "" #NULL

        ##
        print(i)
        print(jlink.a.get('href'))
        links.add(jlink.a.get('href'))
        
        # job_posting_table
        try:
            soup = JPSoup(browser, jlink.a.get('href')) #need to do something with the link
            dic = job_posting(soup)
        except Exception as e:
            print("error in job link: " + jlink.a.get('href'))
            errors.append(jlink.a.get('href'))
            continue
        
        try:
            if dic['company'] != 'Confidential':
                lnk = soup.find(class_ = "css-9iujih").find('a').get('href') #the link of the company
                if lnk not in links:
                    csoup = JPSoup(browser, lnk) #soup for the company
                    company_name = cname(csoup)
                    links.add(lnk) 
                    link_name[lnk] = company_name
                    dic_company['company'] = company_name #or link_name[lnk]
                    dic_company['description'] = cdetails(csoup)
                    company_df = pd.DataFrame(dic_company, index=[0])
                    company_data = pd.concat([company_data, company_df], ignore_index = True)

                    (dic_company, sector) = cprofile(csoup)
                    for s in sector:
                        dic_sector['company'] = company_name #or link_name[lnk]
                        dic_sector['sector'] = s
                        sector_df = pd.DataFrame(dic_sector, index=[0])
                        company_sector_data = pd.concat([company_sector_data, sector_df], ignore_index=True)          


                dic['company'] = link_name[lnk]
        except Exception as ee:
            print("error in company link: " + lnk + "---- when scrapping job link: " + jlink.a.get('href'))
            cerrors.append(lnk)
            continue


        dic['country'] = country
        dic['city'] = city
        dic['district'] = None if district == '' else district

        post_df = pd.DataFrame(dic, index=[0])
        job_posting_data = pd.concat([job_posting_data, post_df], ignore_index=True)


        #job_posting_type
        try:
            
            jtypes = get_type(soup)      
            dic2={}
            dic2['company'] = dic['company'] ## here
            dic2['job_title'] = dic['job_title']
            dic2['country'] = dic['country']
            dic2['city'] = dic['city']
            dic2['district'] = dic['district']
            for jtype in jtypes:
                dic2['job_type'] = jtype.text
                type_df = pd.DataFrame(dic2, index=[0])
                job_posting_type_data = pd.concat([job_posting_type_data, type_df], ignore_index=True)
                
        except Exception as eee:
            print("error in job type with link: " + jlink.a.get('href'))
            terrors.append(jlink.a.get('href'))  


        # job_posting_category
        try:
            del dic2['job_type']
            jcategories = get_category(soup)
            for jcategory in jcategories:
                dic2['job_category'] = jcategory.text
                category_df = pd.DataFrame(dic2, index=[0])
                job_posting_category_data = pd.concat([job_posting_category_data, category_df], ignore_index=True)
            
        except Exception as eeee:
            print("error in job category with link: " + jlink.a.get('href'))
            caterrors.append(jlink.a.get('href'))

        #job_posting_skill
        try:
            del dic2['job_category']
            jskills = get_skill(soup)
            for jskill in jskills:
                dic2['skill'] = jskill.text
                skill_df = pd.DataFrame(dic2, index=[0])
                job_posting_skill_data = pd.concat([job_posting_skill_data, skill_df], ignore_index=True)
        except Exception as eeeee:
            print("error in job skill with link: " + jlink.a.get('href'))
            serrors.append(jlink.a.get('href'))

#         print(dic['company'])
        if i == 15:
            break
        i = i+1
#         time.sleep(3)
    page_number = page_number+1
    concat = "&start="+str(page_number)
    time.sleep(10)



browser.quit()  

1
https://wuzzuf.net/jobs/p/bwe16Trm8JtM-Full-Stack-Developer-Sketch-tech-Alexandria-Egypt
2
https://wuzzuf.net/jobs/p/360mloRE7C3p-Senior-Backend-Developer-NET-Core-Cairo-Egypt
3
https://wuzzuf.net/jobs/p/B13dTQcyln10-Dot-Net-Web-Developer-Tidal-Cairo-Egypt
4
https://wuzzuf.net/jobs/p/pP71wWVJmrE4-Full-Stack-wordpress-Developer-Osolutions-Cairo-Egypt
5
https://wuzzuf.net/jobs/p/N0Nh5pzRst71-Senior-Odoo-Developer---Onsite-Cairo-Egypt
6
https://wuzzuf.net/jobs/p/9LiqyHcnSQ72-Front-End-Developer-Safa-Soft-Cairo-Egypt
7
https://wuzzuf.net/jobs/p/4xPg2JmGPWEf-IT-Clerk-Nahdet-Misr-Publishing-Group-Giza-Egypt
8
https://wuzzuf.net/jobs/p/YX8Gvbm9KdEk-SAP-SD-CX-function-consultant-Nahdet-Misr-Publishing-Group-Giza-Egypt
9
https://wuzzuf.net/jobs/p/OtBNQ02hxdAW-Senior-Backend-Developer-PHP-iStoria-Cairo-Egypt
10
https://wuzzuf.net/jobs/p/m83efEO24ZYE-UX-Designer-Safa-Soft-Cairo-Egypt
11
https://wuzzuf.net/jobs/p/MU7wIRNPCWqm-C-Qt-Developer-Fruitful-Solutions-Cairo-Egypt
12
https://wuzzuf.net/jo

In [68]:
errors

['https://wuzzuf.net/internship/IdwWnYHUoNvp-PHP-Developer-Internship---Remote-Elev8Assessments-Giza-Egypt',
 'https://wuzzuf.net/internship/FfEIWW2gAFYu-Quality-Control-Engineer-Internship-WUZZUF-Cairo-Egypt',
 'https://wuzzuf.net/internship/1qa8eZCttPig-Backend-Developer-Internship-WUZZUF-Cairo-Egypt',
 'https://wuzzuf.net/internship/VR1mvWphjOJY-It-Help-Desk-Intern-Sutherland-Cairo-Egypt']

In [69]:
cerrors

['https://wuzzuf.net/jobs/careers/Nakheel-Egypt-85138']

In [70]:
# for testing
browser = webdriver.Chrome()
browser.get(cerrors[0])
html_content = browser.page_source
jsoup = bsp(html_content, "html.parser")
browser.quit()

In [63]:
job_posting_data.to_csv("job_posting.csv", sep=',', encoding='utf-8', index=False)
job_posting_data

Unnamed: 0,job_title,description,requirements,post_date,viewed,considered,experience_min,experience_max,career_level,education_level,salary_min,salary_max,hidden_salary,gender,vacancies,company,country,city,district
0,Full Stack Developer,full stack Developer using php 3+Yrs of Exp We...,,2023-11-01,,,1,2,Experienced (Non-Manager),,,,True,,1,Sketch-tech,Egypt,Alexandria,Glim
1,Senior Backend Developer (.NET Core),"Review, test and debug team members’ code Des...",Work experience as a Senior Java Developer or ...,2023-11-01,,,5,7,Experienced (Non-Manager),,,,True,,1,Confidential,Egypt,Cairo,
2,Dot Net Web Developer,Handle web-based systems Work on Microsoft sta...,"Bachelor Degree in Computer Sciences, Software...",2023-11-01,,,2,,Entry Level (Junior Level / Fresh Grad),Bachelor's Degree,,,True,M,5,Tidal,Egypt,Cairo,
3,Full Stack wordpress Developer,We Are looking for Experienced & professional ...,Proven experience in full-stack development wi...,2023-11-01,,,2,5,Experienced (Non-Manager),,,,True,,1,Osolutions,Egypt,Cairo,Sheraton
4,Senior Odoo Developer - (Onsite),Responsibilities : Understand customer require...,5-7 years of work experience of Odoo/ Open ERP...,2023-11-01,,,7,10,Experienced (Non-Manager),Bachelor's Degree,80000,100000,False,,3,Confidential,Egypt,Cairo,New Nozha
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1940,Digital Sales Executive,Establish and maintain current client and pote...,Experience in outdoor sales in any service fie...,2023-09-01,29,0,1,2,Entry Level (Junior Level / Fresh Grad),Bachelor's Degree,,,True,,4,Be Group,Egypt,Cairo,Nasr City
1941,Service Desk Engineer,Testing and analyzing IT system and software p...,"Bachelor's degree in computer science, informa...",2023-09-01,5,0,2,3,Experienced (Non-Manager),,,,True,,1,ITS,Egypt,Cairo,Nasr City
1942,Quality Assurance Specialist,Regarding Internal Audit: To follow up with au...,Organizational and planning Communication Exc...,2023-09-01,0,0,2,5,Experienced (Non-Manager),,,,True,,1,ITS,Egypt,Cairo,Nasr City
1943,IT Help Desk Coordinator,Serve as the first point of contact for employ...,"Bachelor’s degree in computer science, compute...",2023-09-01,72,24,1,3,Entry Level (Junior Level / Fresh Grad),Bachelor's Degree,,,True,,1,FlexFilms Egypt,Egypt,Giza,6th of October


In [64]:
job_posting_type_data.to_csv("job_posting_type.csv", sep=',', encoding='utf-8', index=False)
job_posting_type_data

Unnamed: 0,company,job_title,country,city,district,job_type
0,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,Full Time
1,Confidential,Senior Backend Developer (.NET Core),Egypt,Cairo,,Full Time
2,Tidal,Dot Net Web Developer,Egypt,Cairo,,Full Time
3,Tidal,Dot Net Web Developer,Egypt,Cairo,,Shift Based
4,Osolutions,Full Stack wordpress Developer,Egypt,Cairo,Sheraton,Full Time
...,...,...,...,...,...,...
2079,Be Group,Digital Sales Executive,Egypt,Cairo,Nasr City,Full Time
2080,ITS,Service Desk Engineer,Egypt,Cairo,Nasr City,Full Time
2081,ITS,Quality Assurance Specialist,Egypt,Cairo,Nasr City,Full Time
2082,FlexFilms Egypt,IT Help Desk Coordinator,Egypt,Giza,6th of October,Full Time


In [65]:
job_posting_category_data.to_csv("job_posting_category.csv", sep=',', encoding='utf-8', index=False)
job_posting_category_data

Unnamed: 0,company,job_title,country,city,district,job_category
0,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,IT/Software Development
1,Confidential,Senior Backend Developer (.NET Core),Egypt,Cairo,,IT/Software Development
2,Confidential,Senior Backend Developer (.NET Core),Egypt,Cairo,,Engineering - Telecom/Technology
3,Tidal,Dot Net Web Developer,Egypt,Cairo,,IT/Software Development
4,Tidal,Dot Net Web Developer,Egypt,Cairo,,Engineering - Telecom/Technology
...,...,...,...,...,...,...
4569,FlexFilms Egypt,IT Help Desk Coordinator,Egypt,Giza,6th of October,Installation/Maintenance/Repair
4570,FlexFilms Egypt,IT Help Desk Coordinator,Egypt,Giza,6th of October,IT/Software Development
4571,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,IT/Software Development
4572,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,Quality


In [71]:
job_posting_skill_data.to_csv("job_posting_skill.csv", sep =',', encoding ='utf-8', index=False)
job_posting_skill_data

Unnamed: 0,company,job_title,country,city,district,skill
0,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,Full Stack
1,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,Full Stack Developer
2,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,Computer Science
3,Sketch-tech,Full Stack Developer,Egypt,Alexandria,Glim,Software Development
4,Confidential,Senior Backend Developer (.NET Core),Egypt,Cairo,,Software Development
...,...,...,...,...,...,...
23621,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,Software
23622,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,Quality Control
23623,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,Information Technology (IT)
23624,Arabic Localizer,Software Analyst/Tester (Fresh) - Alexandria,Egypt,Alexandria,San Stefano,Computer Science


In [66]:
company_data.to_csv("company.csv", sep=',', encoding='utf-8', index=False)
company_data

Unnamed: 0,company,description,country,city,found_date,size_min,size_max
0,Sketch-tech,,,,,,
1,Tidal,Tidal is one of the Software Company. Its main...,Egypt,,,,
2,Osolutions,We Help businesses to sell more & more… \n\nAl...,Egypt,Cairo,2010-01-01,,
3,Safa Soft,Safa Soft.\n\nSafa Soft is a multinational com...,Egypt,Cairo,,,
4,Nahdet Misr Publishing Group,Nahdet Misr Publishing House was established b...,Egypt,Cairo,2008-01-01,,
...,...,...,...,...,...,...,...
690,Advansys for Trading & Contracting,Advansys trading and contracting was establish...,Egypt,Cairo,2006-01-01,,
691,URANIUM,"URANIUM is a Software House company, specializ...",Egypt,Cairo,2001-01-01,,
692,Dreem Mashreq Foods,"Dreem, was founded in the late 70s under the n...",Egypt,Cairo,2007-01-01,,
693,Be Group,"Founded in 2009, Be Group has showed a fast gr...",Egypt,Alexandria,,,


In [67]:
company_sector_data.to_csv("company_sector.csv", sep=',', encoding='utf-8', index=False)
company_sector_data

Unnamed: 0,company,sector
0,Tidal,Computer Software
1,Osolutions,Marketing and Advertising
2,Safa Soft,Marketing and Advertising
3,Safa Soft,Computer Software
4,Safa Soft,Information Technology Services
...,...,...
754,Dreem Mashreq Foods,FMCG
755,Be Group,Marketing and Advertising
756,FlexFilms Egypt,Manufacturing
757,FlexFilms Egypt,Packaging and Containers


In [82]:
%%capture
pip install faker

In [88]:
from faker import Faker
import random

In [89]:
faker = Faker()
district_names = ["Downtown", "Uptown", "Central", "West End", "East Side", "Riverside"]



In [125]:
## user
user_dic = {}
user_data = pd.DataFrame()
for _ in range(500):
    user_dic['email'] = faker.email()
    user_dic['username'] = faker.name()
    user_dic['password'] = faker.password()
    user_dic['country'] = faker.country()
    user_dic['city'] = faker.city() 
    user_dic['district'] = random.choice(district_names)
    user_dic['experience'] = faker.random_int()%17
    user_dic['no_experience'] = random.choice([0, 1])
    user_dic['GPA'] = round(random.uniform(0.0, 4.0), 2)
    user_dic['phone_number'] = faker.phone_number()
    BOD = faker.date_of_birth()
    user_dic['BOD'] = BOD.strftime("%Y, %m, %d")
    user_dic['gender'] = random.choice(['M', 'F'])
    user_dic['first_name'] = faker.first_name()
    user_dic['middle_name'] = faker.last_name()
    user_dic['last_name'] = faker.last_name()
    user_df = pd.DataFrame(user_dic, index=[0])
    user_data = pd.concat([user_data, user_df], ignore_index=True)
    

In [124]:
# This is for generating insert statements for user_skill table
email_addresses = user_data.iloc[:, 0]
email_list = email_addresses.tolist()

skills = ['Computer Science', 'Information Technology (IT)', 'Software', 'Software Development', 'Software Engineering']
for email in email_list:
    for skill in skills:
        sql = 'INSERT IGNORE INTO user_skill(email, skill) VALUES("' + email +'", "' + skill +'");'
        print(sql)
    

INSERT IGNORE INTO user_skill(email, skill) VALUES("michaelhahn@example.com", "Computer Science");
INSERT IGNORE INTO user_skill(email, skill) VALUES("michaelhahn@example.com", "Information Technology (IT)");
INSERT IGNORE INTO user_skill(email, skill) VALUES("michaelhahn@example.com", "Software");
INSERT IGNORE INTO user_skill(email, skill) VALUES("michaelhahn@example.com", "Software Development");
INSERT IGNORE INTO user_skill(email, skill) VALUES("michaelhahn@example.com", "Software Engineering");
INSERT IGNORE INTO user_skill(email, skill) VALUES("stephen40@example.org", "Computer Science");
INSERT IGNORE INTO user_skill(email, skill) VALUES("stephen40@example.org", "Information Technology (IT)");
INSERT IGNORE INTO user_skill(email, skill) VALUES("stephen40@example.org", "Software");
INSERT IGNORE INTO user_skill(email, skill) VALUES("stephen40@example.org", "Software Development");
INSERT IGNORE INTO user_skill(email, skill) VALUES("stephen40@example.org", "Software Engineering")

In [126]:
user_data.to_csv('user.csv', sep =',', encoding ='utf-8', index=False)
user_data

Unnamed: 0,email,username,password,country,city,district,experience,no_experience,GPA,phone_number,BOD,gender,first_name,middle_name,last_name
0,austinjesse@example.net,Colleen Vaughan,dm25GYGf@),Ecuador,Lake Jamesberg,Downtown,6,1,3.53,929.395.5255,"1945, 09, 27",F,Nicholas,Ochoa,Williams
1,onealjesse@example.org,Steven Carlson,Scd1FzfO)i,Saudi Arabia,South Keith,Central,11,0,1.06,+1-585-634-4686,"1980, 12, 30",F,Douglas,Johns,Weber
2,tylerwatson@example.org,Dylan Rios,ZEVvBvaJ)9,French Polynesia,South Johnny,Uptown,15,0,1.56,+1-483-250-1802x168,"1984, 02, 27",F,Jesus,Henson,Lee
3,patellaura@example.org,Jeremy Welch,(O6aT!TcdA,Jersey,Romanchester,Downtown,11,0,3.17,387-342-6374x40705,"1945, 05, 01",F,Albert,Warren,Collins
4,tinabarron@example.com,Charles Gonzalez,(E1XjAlx4$,Burkina Faso,Lake Davidberg,Uptown,3,0,3.61,278-702-6607x3389,"1999, 05, 27",M,Danielle,Morris,Vincent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,daniel48@example.com,Troy Cooper,o_1QQjhiTz,El Salvador,Weberville,Downtown,1,1,1.98,9729361280,"2001, 09, 15",F,Scott,Mcdowell,Barrett
496,laurabond@example.com,Jessica Harris,$7HbygAnEQ,Guinea,South Jacob,East Side,11,0,1.89,+1-454-367-7366x3394,"1919, 04, 28",M,Nathan,Lee,Silva
497,feliciamiller@example.com,Michael Knight,#+jz0gKbf),Liechtenstein,Vanessamouth,Uptown,2,0,3.25,+1-983-670-5781,"1954, 07, 07",M,Alicia,Foster,Bowman
498,shannonmichelle@example.net,Shawn Cole,$1KTc*v8Pl,Eritrea,Mitchellfurt,Riverside,5,1,3.42,001-958-833-2365x5677,"2023, 08, 06",F,Juan,Glass,Peters


In [118]:
user_skill_data.to_csv('user_skill.csv', sep =',', encoding ='utf-8', index=False)
user_skill_data

Unnamed: 0,skill,email
0,Computer Science,michaelhahn@example.com
1,Software Engineering,stephen40@example.org
2,Software Engineering,rachaelhunt@example.net
3,Software Engineering,nicholas45@example.com
4,Computer Science,rubioalexander@example.com
...,...,...
95,Software Development,ddiaz@example.net
96,Software,bbailey@example.net
97,Software,michaelsmith@example.org
98,Information Technology (IT),kevin48@example.org
