# Freelancer Scraper

In [10]:
#pip install --upgrade selenium
#!pip install webdriver-manager

In [19]:
#Import the necessary packages needed to build the freelancer bot and navigate the pages
import re
import pandas as pd
from time import sleep
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [279]:
class FreelancerScraper:
    
    def __init__(self,num_pages,job_or_field = None,url= None):
        
        #Check if a job or webpage to extract from was inputted
        if (job_or_field is None) and (url is None):
            raise ValueError('Please provide one of job_or_field or url for scraping')
            
            
        #Store the number of pages to extract from
        self.num_pages = num_pages
        
        #Note the job to scrape on and the job field
        if url:
            self.url = url
            self.job_or_field = self.url.split('/')[-1]
        
        elif job_or_field:
            self.job_or_field = job_or_field
            self.url = f'https://www.freelancer.com/freelancers/united-states/{self.job_or_field}'
            
        #List to store the scraped user profile links
        self.user_profiles = []
        
        #List to store scraped user profile info
        self.users_info = []
        
        
    def get_usernames(self):
        
        #Create browser instance
        self.browser = webdriver.Chrome(ChromeDriverManager().install())
        
        #Get the starting webpage and wait sometime for elements to load
        self.browser.get(self.url)
        sleep(3)
        
        #See if online selected, and deselect. Also wait for sometime for elements to load
        try:
            self.browser.find_element(By.ID,"selected-online").click()
            sleep(3)
        except:
            pass
        
        #Click the next button num_pages times
        for _ in range(self.num_pages):
            
            #Get the profile links for everyone on the page
            usernames = self.browser.find_elements(By.CLASS_NAME,"find-freelancer-username")

            for user in usernames:
                self.user_profiles.append(user.get_attribute('href'))
                
            #NAVIGATION TO NEXT PAGE
            
            #Find all the page navigation buttons
            pg_nav_btns = self.browser.find_elements(By.CSS_SELECTOR,"a[data-target='pagination']")
            
            #Find and click on the last one (next page button) 
            pg_nav_btns[-1].click()
            sleep(3)
        
        #Close the browser when done
        self.browser.quit()  
   
    def extract_all(self):
        
        """
        Multi-page username and user profile link scraping:
        """

        #Variable to track if the page iterations should be terminated
        terminate = False

        #Start search from page 1
        pg = 1
        
        #Iterate through pages until no more results found
        while not terminate:
            
            self.browser.get(self.url+'/'+str(pg))
            sleep(3)
        
        
            #See if online selected, and deselect
            browser.find_element(By.ID,"selected-online").click()
            sleep(3)

            #Get the profile links for everyone on the page
            usernames = browser.find_elements(By.CLASS_NAME,"find-freelancer-username")

            for user in usernames:
                user_profiles.append(user.get_attribute('href'))

        
        
    def get_user_profile_info(self,user_profile_link):
        
        """
        Scrapes and inputted user profile link
        """
        
        self.browser = webdriver.Chrome(ChromeDriverManager().install())
        self.browser.get(user_profile_link)
        sleep(3)
        
        #Dictionary to store the user's info
        user_info = {}
        
        #Store the job_or_field
        user_info['job_or_field'] = self.job_or_field

        #Get name
        user_info['name'] = self.browser.find_element(By.CSS_SELECTOR,"h3[data-color='dark']").text
        
        #Store the profile link
        user_info['profile_link'] = user_profile_link

        #Get tagline
        user_info['tagline'] = self.browser.find_element(By.CSS_SELECTOR,"h2[data-color='mid']").text

        ###### Get description
        description_box = self.browser.find_element(By.CSS_SELECTOR,"fl-text[data-max-lines='15']")

        #Try to click on the read more button and extract text
        try:
            description_box.find_element(By.CLASS_NAME,'ReadMoreButton').click()
            user_info['user_description'] = description_box.text
        #If error is thrown because text doesn't need to be expanded, then we can just pull direct text 
        except:
            user_info['user_description'] = description_box.text
        ######################    

        #Get location
        user_info['location'] = self.browser.find_element(By.CSS_SELECTOR,"fl-col[class='SupplementaryInfo']").text

        #Get join date
        user_info['join_date'] = self.browser.find_element(By.XPATH, "//*[contains(text(),'Joined')]").text.replace('Joined ','')

        #Hourly rate
        user_info['hourly_rate'] = re.findall('\$\d+',self.browser.find_element(By.XPATH, "//*[contains(text(),'USD')]").text)[0]

        #Get pay grade
        user_info['pay_grade'] = self.browser.find_element(By.CSS_SELECTOR,"div[data-size='xxsmall").text

        #######Get rating container
        rating_container = self.browser.find_elements(By.CSS_SELECTOR,"fl-bit[class ='RatingContainer']")[1]
        

        #Get average rating and number of reviews
        user_info['avg_rating'], reviews = rating_container.text.split('\n')
        user_info['num_reviews'] = re.findall('\d+',reviews)[0]
        ##################################

        #Get number of recommendations
        user_info['num_recommendations']=re.findall('\d+',self.browser.find_element(By.CSS_SELECTOR,"fl-col[class='RecommendationsText']").text)[0]

        ### GET STATS ON PERFORMANCE
        pct_jobs_completed, pct_on_budget, pct_on_time, repeat_hire_rate = [item.text.\
                                                                            replace('%','') for item in self.browser.\
                                                                            find_elements(By.CSS_SELECTOR,"fl-text[class='ReputationItemAmount']")]
        
        user_info['pct_jobs_completed'] = pct_jobs_completed
        user_info['pct_on_budget'] = pct_on_budget
        user_info['pct_on_time'] = pct_on_time
        user_info['repeat_hire_rate'] = repeat_hire_rate
        ##############################

        #Add the extracted info the list for all users
        self.users_info.append(user_info)
        
        #Close the browser when done
        self.browser.close()  
        
    
    def run(self):
        
        #Extract all the usernames for the particular role
        self.get_usernames()
        
        #Get all the info for each of the usernames
        for url in self.user_profiles:
            self.get_user_profile_info(url)
            sleep(4)
    
        #Return all the scraped data
        return self.users_info



        
        

rating_container = self.browser.find_element(By.CSS_SELECTOR,"span[class='IconContent']")

In [280]:
finance_da_url = 'https://www.freelancer.com/freelancers/united-states/finance-data-analytics'
scraper = FreelancerScraper(num_pages=6,url= finance_da_url)
scraped_df = scraper.run()



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.

Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.

In [282]:
scraped_pd_df = pd.DataFrame(scraped_df)
scraped_pd_df.head()

Unnamed: 0,job_or_field,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,repeat_hire_rate
0,finance-data-analytics,Michael S.,https://www.freelancer.com/u/stallonem,Experienced Market Researcher,I have over 12 years of experience helping bra...,"Wallkill, United States","December 7, 2020",$115,7.2,5.0,1,2,100,100,100,100.0
1,finance-data-analytics,Teena V.,https://www.freelancer.com/u/TeenaVernekar,Virtual Assistant - Financial Accounting & Bil...,I am self-motivated and precise individual wit...,"Charlotte, United States","September 11, 2017",$15,7.8,5.0,7,8,100,100,100,33.0
2,finance-data-analytics,Tatiana L.,https://www.freelancer.com/u/TatianaLLL,"US Taxes, Bookkeeping, IRS Tax Debt Resolution.","Hello, my name is Tatiana Loughman & I'm:\n - ...","Heath, United States","October 27, 2017",$99,5.7,5.0,28,3,100,100,95,20.0
3,finance-data-analytics,Megan M.,https://www.freelancer.com/u/meganimcauliffe,Data Analyst/Atmospheric Scientist,Skilled data analyst and R programmer for vari...,"Willmar, United States","October 11, 2021",$15,2.8,5.0,1,0,100,100,100,
4,finance-data-analytics,Helen Y.,https://www.freelancer.com/u/hyu289,Front End Developer,I have three years of working experience in le...,"Madison, United States","November 7, 2019",$30,4.0,5.0,7,0,100,100,100,33.0


In [283]:
scraped_pd_df.to_csv('finance-data-analytics.csv',index=False)

# Test Runs

In [216]:
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get('https://www.freelancer.com/freelancers/united-states/finance-data-analytics')
sleep(3)

#See if online selected, and deselect
browser.find_element(By.ID,"selected-online").click()
sleep(3)

browser.find_elements(By.CSS_SELECTOR,"a[data-target='pagination']")



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


[<selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="88900fab-f10c-439c-b600-308c7129d20d")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="851fffee-d1bb-4e0e-8b0e-2207019e8f4e")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="c1809a49-fad7-485a-9b93-aac18e249884")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="4fa2ac52-c5b1-43a1-9e8e-48c096329f58")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="0a820fa3-f6a7-4c46-ba09-1b4b9f7132eb")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="0e664080-ea96-4334-9541-928cdcce4aab")>,
 <selenium.webdriver.remote.webelement.WebElement (session="2d297fcd75185b2995af615ffb0c719a", element="05e8ab99-ba07-4a3b-a22f-4f

In [222]:
for i in range(3-):
    print(i)

0
1
2


In [218]:
browser.find_elements(By.CSS_SELECTOR,"a[data-target='pagination']")[-1].click()

In [None]:
brow

In [22]:
user_profiles = []

browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get('https://www.freelancer.com/freelancers/united-states/finance-data-analytics')
sleep(3)

#See if online selected, and deselect
browser.find_element(By.ID,"selected-online").click()
sleep(3)


#Get the profile links for everyone on the page
usernames = browser.find_elements(By.CLASS_NAME,"find-freelancer-username")

for user in usernames:
    user_profiles.append(user.get_attribute('href'))




Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


In [23]:
user_profiles

['https://www.freelancer.com/u/stallonem',
 'https://www.freelancer.com/u/TeenaVernekar',
 'https://www.freelancer.com/u/TatianaLLL',
 'https://www.freelancer.com/u/meganimcauliffe',
 'https://www.freelancer.com/u/hyu289',
 'https://www.freelancer.com/u/Nsarker1242',
 'https://www.freelancer.com/u/marcus01001',
 'https://www.freelancer.com/u/michaelhanley00',
 'https://www.freelancer.com/u/palwasham8',
 'https://www.freelancer.com/u/LeahAnson']

In [251]:
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get('https://www.freelancer.com/u/TatianaLLL')
sleep(3)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


In [252]:
#Get rating container
rating_container = browser.find_element(By.CSS_SELECTOR,"span[class='IconContent']")

#Get average rating and number of reviews
avg_rating, reviews = rating_container.text.split('\n')
num_reviews = re.findall('\d+',reviews)[0]

ValueError: not enough values to unpack (expected 2, got 1)

In [277]:
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get('https://www.freelancer.com/u/michaelhanley00')
sleep(3)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/koredeakande/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


In [278]:
browser.find_elements(By.CSS_SELECTOR,"fl-bit[class ='RatingContainer']")[1].text.split('\n')

['5.0', '(16 reviews)']

In [270]:
a = browser.find_element_by_xpath("//span[contains(text(),'reviews')]")
a.find_element(By.CSS_SELECTOR,"fl-bit[class ='ValueBlock ng-star-inserted']").text

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"fl-bit[class ='ValueBlock ng-star-inserted']"}
  (Session info: chrome=98.0.4758.109)


In [264]:
a = browser.find_element(By.CSS_SELECTOR,"fl-rating[data-type='stars']")
a.find_element(By.CSS_SELECTOR,"fl-bit[class ='ValueBlock ng-star-inserted']").text

''

In [198]:
user_info = {}

#Get name
name = browser.find_element(By.CSS_SELECTOR,"h3[data-color='dark']").text

#Get tagline
tagline = browser.find_element(By.CSS_SELECTOR,"h2[data-color='mid']").text

###### Get description
description_box = browser.find_element(By.CSS_SELECTOR,"fl-text[data-max-lines='15']")

#Try to click on the read more button and extract text
try:
    description_box.find_element(By.CLASS_NAME,'ReadMoreButton').click()
    user_description = description_box.text
#If error is thrown because text doesn't need to be expanded, then we can just pull direct text 
except:
    user_description = description_box.text
######################    

#Get location
location = browser.find_element(By.CSS_SELECTOR,"fl-col[class='SupplementaryInfo']").text

#Get join date
join_date = browser.find_element(By.XPATH, "//*[contains(text(),'Joined')]").text.replace('Joined ','')

#Hourly rate
hourly_rate = re.findall('\$\d+',browser.find_element(By.XPATH, "//*[contains(text(),'USD')]").text)[0]

#Get pay grade
pay_grade = browser.find_element(By.CSS_SELECTOR,"div[data-size='xxsmall").text

#Get rating container
rating_container = browser.find_element(By.CSS_SELECTOR,"span[class='IconContent']")

#Get average rating and number of reviews
avg_rating, reviews = rating_container.text.split('\n')
num_reviews = re.findall('\d+',reviews)[0]

#Get number of recommendations
num_recommendations=re.findall('\d+',browser.find_element(By.CSS_SELECTOR,"fl-col[class='RecommendationsText']").text)[0]

### GET STATS ON PERFORMANCE
pct_jobs_completed, pct_on_budget, pct_on_time, repeat_hire_rate = [item.text.\
                                                                    replace('%','') for item in browser.\
                                                                    find_elements(By.CSS_SELECTOR,"fl-text[class='ReputationItemAmount']")]
##############################








'2'

['100', '100', '100', '100']

In [188]:
user_description

'I have over 12 years of experience helping brands make informed business decisions. My clients have ranged from small businesses to major global brands like L’Oréal, Marriott, Colgate, Citibank, and Walmart. Across all clients, my job has always been to feed data & insights into every decision, ensuring that a brand’s efforts are effective, efficient, measurable, and always improving.\n\nWhile my background has taught me a breadth of invaluable technical skills, my true passion lies in the art and the people side of market research. I love to transform a complex dataset into a clear narrative, to help a client address a challenge by designing a custom methodology, to unlock a deeper or unexpected human insight that helps inspire a creative idea. These are the things that make me tick, and throughout my career, it has been this passion to collaborate and my relentless pursuit of solutions that has set me apart from others in my field.\n\nMy skillset includes the following:\n- Writing, 

In [161]:
num_reviews

'1'

In [150]:
rating_container.find_element_by_xpath("/span")

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/span"}
  (Session info: chrome=98.0.4758.109)


<selenium.webdriver.remote.webelement.WebElement (session="bb0b74bc1911489eb664a8a111533d1b", element="459beca7-34c6-463d-8028-b60fb6624639")>

In [109]:
re.findall('\\d+',browser.find_element_by_xpath(recommendations_xpath).text)

['2']

In [None]:
ids = driver.find_elements_by_xpath('//*[@id]')
for ii in ids:
    #print ii.tag_name
    print ii.get_attribute('id')    # id name as string

In [None]:
find_element_by_tag_name()