# Freelancer Scraper - search by keyword

**Goal**: scrape results from Freelancer.com when using a specific search query.

In [None]:
#Import the necessary packages needed to build the freelancer bot and navigate the pages
import re
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
class FreelancerScraper:
    """
    Class that scrapes for talent listings on Freelancer.com based on a
    specified search query.

    Typical use: create an instance with a search term, call run(), then
    export the collected records with get_pd_dataframe() or
    get_raw_pd_dataframe().

    Attributes set by the constructor:
        search        -- the search term
        url           -- listing page the search starts from
        max_rows      -- upper bound on the number of listings to collect
        user_profiles -- set of profile links found by get_all_usernames()
        users_info    -- list of per-profile dicts added by get_user_profile_info()
    """

    # keys of a user record that hold lists; get_pd_dataframe() expands each
    # of them into one column per entry
    _LIST_FIELDS = ("certifications", "verifications", "top_skills", "freelancer_badges")

    def __init__(self,search=None, max_rows=float("inf")):
        """
        Initializes the scraper.

        :search: -- the search term to use when searching for listings.
        :max_rows: -- the maximum number of rows to fetch (defaults to infinity, which
                      fetches the total number of rows available)
        :raises ValueError: -- when no search term is given.
        """

        # check if a search term was provided
        if search is None:
            raise ValueError('Please provide a search term for scraping.')

        self.search = search

        # focusing only on freelancers in the United States
        self.url = 'https://www.freelancer.com/freelancers/united-states/'

        # sets a hard limit on the amount of rows the scraper should fetch.
        # This is needed when there are a lot of results and you don't want
        # the scraper to go through all of them
        self.max_rows = max_rows

        # set of scraped user profile links (a set, so duplicates collapse)
        self.user_profiles = set()

        # list of scraped user profile info dicts
        self.users_info = []

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_options():
        """Returns the Chrome options shared by every browser this class starts."""
        # options for selenium that allow it to run in DeepNote without
        # having to open any physical Chrome tabs
        options = Options()
        options.add_argument("--headless")
        options.add_argument('--no-sandbox')
        return options

    @staticmethod
    def _safe_quit(browser):
        """Shuts the browser down; a failure to do so is reported, not raised."""
        if browser is None:
            return
        try:
            # quit() (unlike close()) also ends the driver session
            browser.quit()
        except WebDriverException as e:
            print("Failed to shut down the browser cleanly:", e)

    @staticmethod
    def _parse_result_amount(text):
        """
        Extracts the result count from the "result-amount" label.

        The count is taken from the second space-separated token; thousands
        separators (",") are removed so that "1,234" parses as 1234.
        """
        return int(text.split(" ")[1].replace(",", ""))

    @staticmethod
    def _parse_skill(skill_text):
        """
        Turns the text of one skill element into a (skill, count) tuple.

        "Name\\n12" gives ("Name", 12); a skill without a count gives (text, 0).
        """
        name, newline, count = skill_text.partition('\n')
        if not newline:
            return (skill_text, 0)
        return (name, int(count))

    @staticmethod
    def _slug(name, sep=" "):
        """Builds a column-name fragment: *sep* becomes "_" and the text is lowercased."""
        return name.replace(sep, "_").lower()

    # ------------------------------------------------------------------
    # step 1: collect the profile links
    # ------------------------------------------------------------------

    def get_all_usernames(self):
        """
        Fetches all usernames (and profile links) associated with the search query.
        It stores the retrieved user profile links in self.user_profiles.

        Stops once max_rows listings or the reported total have been seen, or
        when a page has no listings or no pagination buttons. The browser is
        shut down even if an error occurs; the error itself still propagates.
        """

        print("Fetching usernames...")

        # create browser instance (also kept on self.browser, as before)
        browser = webdriver.Chrome(options=self._build_options())
        self.browser = browser

        try:
            # get the starting webpage and wait sometime for elements to load
            browser.get(self.url)
            sleep(3)

            # see if the online filter is selected, and deselect.
            # Also wait for sometime for elements to load
            try:
                browser.find_element(By.ID,"selected-online").click()
                sleep(3)
            except WebDriverException:
                # the filter is not there (or not clickable): nothing to deselect
                pass

            # input our search term into the search bar
            # and press ENTER to run the search
            search_input = browser.find_element(By.ID, "freeSearchInput")
            search_input.clear()
            search_input.send_keys(self.search)
            search_input.send_keys(Keys.RETURN)

            # wait for results
            sleep(3)

            # get the total results found (to calculate a stopping point)
            result_amount_text = browser.find_element(By.CLASS_NAME, "result-amount").text
            result_amount = self._parse_result_amount(result_amount_text)

            print("Found", result_amount, "total results.")

            usernames_scraped = 0

            # while we still have usernames left to scrape, continue retrieving info
            while usernames_scraped < result_amount and usernames_scraped < self.max_rows:

                # get the profile links for everyone on the page
                usernames = browser.find_elements(By.CLASS_NAME,"find-freelancer-username")

                # an empty page would never advance the counter; stop instead of looping forever
                if not usernames:
                    break

                for user in usernames:
                    # honour max_rows inside a page as well
                    if usernames_scraped >= self.max_rows:
                        break
                    usernames_scraped += 1

                    # add profile link to the set of profiles to fetch later
                    profile_link = user.get_attribute('href')
                    if profile_link:
                        self.user_profiles.add(profile_link)

                print("Usernames fetched:", usernames_scraped)

                # done: no need to load another page
                if usernames_scraped >= result_amount or usernames_scraped >= self.max_rows:
                    break

                # NAVIGATION TO NEXT PAGE

                # find all the page navigation buttons
                pg_nav_btns = browser.find_elements(By.CSS_SELECTOR,"a[data-target='pagination']")
                if not pg_nav_btns:
                    break

                # find and click on the last one (next page button)
                pg_nav_btns[-1].click()

                # wait for next page to load
                sleep(3)
        finally:
            # close the browser when done, also when something went wrong
            self._safe_quit(browser)

    # ------------------------------------------------------------------
    # step 2: scrape a single profile
    # ------------------------------------------------------------------

    def get_user_profile_info(self,user_profile_link):
        """
        Scrapes an inputted user profile link.

        On success the profile's info dict is appended to self.users_info.
        Any error while scraping is printed and the profile is skipped, so
        this method does not raise for a broken profile page. The browser
        started for the profile is always shut down.
        """

        print("Getting profile information from", user_profile_link, "...")

        browser = None
        try:
            browser = webdriver.Chrome(options=self._build_options())
            self.browser = browser

            # get profile page
            browser.get(user_profile_link)
            sleep(2)

            user_info = self._scrape_profile(browser, user_profile_link)

            # add the extracted info to the list for all users
            self.users_info.append(user_info)

            print("Fetched user info from", user_profile_link)

        except Exception as e:

            print("Failed to fetch user profile:", user_profile_link)
            print(e)

        finally:
            # previously the browser was only closed on success, which left
            # one browser running for every failed profile
            self._safe_quit(browser)

    def _scrape_profile(self, browser, user_profile_link):
        """Reads every field of the loaded profile page into a dict (all-or-nothing)."""
        css = By.CSS_SELECTOR

        # dictionary to store the user's info; insertion order is the column
        # order of the exported DataFrames
        user_info = {
            # store the search query used
            'search_query': self.search,
            # get name
            'name': browser.find_element(css,"h3[data-color='dark']").text,
            # store the profile link
            'profile_link': user_profile_link,
        }

        user_info['freelancer_badges'] = self._scrape_badges(browser)
        user_info['verifications'] = self._scrape_verifications(browser)

        # get tagline
        user_info['tagline'] = browser.find_element(css,"h2[data-color='mid']").text

        user_info['user_description'] = self._scrape_description(browser)
        user_info['certifications'] = self._scrape_certifications(browser)
        user_info['top_skills'] = self._scrape_top_skills(browser)

        # get location
        user_info['location'] = browser.find_element(css,"fl-col[class='SupplementaryInfo']").text

        # get join date
        joined = browser.find_element(By.XPATH, "//*[contains(text(),'Joined')]").text
        user_info['join_date'] = joined.replace('Joined ','')

        # hourly rate: first "$<digits>" in the first element mentioning USD
        usd_text = browser.find_element(By.XPATH, "//*[contains(text(),'USD')]").text
        user_info['hourly_rate'] = re.findall(r'\$\d+', usd_text)[0]

        # get pay grade
        user_info['pay_grade'] = browser.find_element(css,"div[data-size='xxsmall']").text

        # get rating container: the second match on the page is the one used
        # NOTE(review): presumably the first match belongs to another widget -- confirm
        rating_container = browser.find_elements(css,"fl-bit[class ='RatingContainer']")[1]

        # get average rating and number of reviews
        avg_rating, reviews = rating_container.text.split('\n')
        user_info['avg_rating'] = avg_rating
        user_info['num_reviews'] = re.findall(r'\d+', reviews)[0]

        # get number of recommendations
        recommendations = browser.find_element(css,"fl-col[class='RecommendationsText']").text
        user_info['num_recommendations'] = re.findall(r'\d+', recommendations)[0]

        # get stats on performance; exactly four figures are expected,
        # anything else raises and the profile is skipped by the caller
        stats = [item.text.replace('%','') for item in
                 browser.find_elements(css,"fl-text[class='ReputationItemAmount']")]
        pct_jobs_completed, pct_on_budget, pct_on_time, repeat_hire_rate = stats

        user_info['pct_jobs_completed'] = pct_jobs_completed
        user_info['pct_on_budget'] = pct_on_budget
        user_info['pct_on_time'] = pct_on_time
        user_info['repeat_hire_rate'] = repeat_hire_rate

        return user_info

    @staticmethod
    def _scrape_badges(browser):
        """Returns the profile's badge types, or [] when the badge area cannot be read."""
        try:
            badge_container = browser.find_element(By.CSS_SELECTOR,"fl-bit[class ='NameContainer-badges']")
            badges = [x.get_attribute('data-type')
                      for x in badge_container.find_elements(By.CSS_SELECTOR,"fl-badge")]
        except WebDriverException:
            return []

        # replace membership with plus-membership
        return ['plus-membership' if x == 'membership' else x for x in badges]

    @staticmethod
    def _scrape_verifications(browser):
        """Returns [(label, is_verified), ...] from the verifications panel."""
        verifications_list = browser.find_element(By.CSS_SELECTOR,
                                                  "app-user-profile-verifications")

        ver_items = verifications_list.find_elements(By.CSS_SELECTOR,
                                                     "fl-bit[class='ProfileVerificationItem-label']")

        verifications = []
        for item in ver_items:
            # an icon whose data-color is "success" marks a passed verification
            icon = item.find_element(By.CSS_SELECTOR,"div[class ='IconContainer']")
            verifications.append((item.text, icon.get_attribute('data-color') == 'success'))
        return verifications

    @staticmethod
    def _scrape_description(browser):
        """Returns the profile description, expanded via "read more" when available."""
        description_box = browser.find_element(By.CSS_SELECTOR,"fl-text[data-max-lines='15']")

        # try to click on the read more button; if that fails because the text
        # doesn't need to be expanded, the direct text is already complete
        try:
            description_box.find_element(By.CLASS_NAME,'ReadMoreButton').click()
        except WebDriverException:
            pass

        return description_box.text

    @staticmethod
    def _scrape_certifications(browser):
        """Returns one tuple of text lines per certification, or [] if unreadable."""
        try:
            certifications_list = browser.find_element(By.CSS_SELECTOR,"app-user-profile-exams")

            cert_items = certifications_list.find_elements(By.CSS_SELECTOR,
                                                           "fl-bit[class='UserProfileExams-item ng-star-inserted']")

            return [tuple(x.text.split('\n')) for x in cert_items]
        except WebDriverException:
            return []

    def _scrape_top_skills(self, browser):
        """Expands the skill list fully and returns [(skill, count), ...]."""
        # 1. click the view more button until all top skills appear
        while True:
            try:
                browser.find_element(By.XPATH, "//button[text()=' View More ']").click()
            except WebDriverException:
                # the view more button disappeared -- stop trying to click
                break
            sleep(0.5)

        # 2. extract the top skills; a skill listed without a number gets count 0
        return [self._parse_skill(skill.text)
                for skill in browser.find_elements(By.CSS_SELECTOR,"fl-bit[class ='UserProfileSkill']")]

    # ------------------------------------------------------------------
    # export
    # ------------------------------------------------------------------

    def get_pd_dataframe(self):
        """
        Exports data to a Pandas DataFrame

        The list-valued fields (certifications, verifications, top skills and
        badges) are replaced by one column per entry:
        "certifications_<name>", "verification_<name>", "skill_<name>" and
        "badge_<name>". A list field that is missing or None is treated as empty.
        """

        export_to_pd_dict_array = []

        # turn list features into individual columns
        for user in self.users_info:

            # format is "certifications_certification_name": value
            # (True when the certification carries no value)
            certifications = {}
            for cert in user.get("certifications") or []:
                if not cert:
                    continue
                cert_name, *rest = cert
                if not rest:
                    val = True
                elif len(rest) == 1:
                    val = rest[0]
                else:
                    # more than two text lines: keep all of the remaining text
                    val = " ".join(rest)
                certifications[f"certifications_{self._slug(cert_name)}"] = val

            # format is "verification_verification_name": value
            verifications = {f"verification_{self._slug(name)}": val
                             for name, val in user.get("verifications") or []}

            # format is "skill_skill_name": value
            skills = {f"skill_{self._slug(name)}": val
                      for name, val in user.get("top_skills") or []}

            # format is "badge_badge_name": True
            badges = {f"badge_{self._slug(name, '-')}": True
                      for name in user.get("freelancer_badges") or []}

            # keep everything except the columns that we've just processed
            user_copy = {key: val for key, val in user.items()
                         if key not in self._LIST_FIELDS}

            export_to_pd_dict_array.append({
                **user_copy,
                **certifications,
                **verifications,
                **skills,
                **badges
            })

        # create dataframe and return
        return pd.DataFrame(export_to_pd_dict_array)


    def get_raw_pd_dataframe(self):
        """
        Exports the data as a Pandas DataFrame without doing any pre-processing
        """
        return pd.DataFrame(self.users_info)


    def run(self):
        """
        Runs the full scrape: collects the profile links for the search query,
        then scrapes every collected profile (pausing between profiles).
        """

        # extract all the usernames for the particular search query
        self.get_all_usernames()

        # get all the info for each of the usernames
        for users_fetched, url in enumerate(self.user_profiles, start=1):
            self.get_user_profile_info(url)
            print("Users fetched:", users_fetched)
            sleep(2.5)

        print("All done!")
  
        

In [None]:
# Full run for the "designer" query: collect the profile links first, then
# scrape every collected profile. max_rows limits how many listings the
# link-collection step goes through.
scraper = FreelancerScraper(search="designer", max_rows=5000)
scraper.run()

Fetched user info from https://www.freelancer.com/u/brownfreelance
Users fetched: 1187
Getting profile information from https://www.freelancer.com/u/saadimran8892 ...
Fetched user info from https://www.freelancer.com/u/saadimran8892
Users fetched: 1188
Getting profile information from https://www.freelancer.com/u/Shaiguy88 ...
Fetched user info from https://www.freelancer.com/u/Shaiguy88
Users fetched: 1189
Getting profile information from https://www.freelancer.com/u/EffectiveKeY ...
Fetched user info from https://www.freelancer.com/u/EffectiveKeY
Users fetched: 1190
Getting profile information from https://www.freelancer.com/u/mysticpixels ...
Failed to fetch user profile: https://www.freelancer.com/u/mysticpixels
Message: no such element: Unable to locate element: {"method":"css selector","selector":"h2[data-color='mid']"}
  (Session info: headless chrome=90.0.4430.212)
Stacktrace:
#0 0x561fa3b757f9 <unknown>
#1 0x561fa3b153b3 <unknown>
#2 0x561fa385d016 <unknown>
#3 0x561fa389181e 

Users fetched: 4284
Getting profile information from https://www.freelancer.com/u/larrybrazil ...
Fetched user info from https://www.freelancer.com/u/larrybrazil
Users fetched: 4285
Getting profile information from https://www.freelancer.com/u/designbyfrances ...
Fetched user info from https://www.freelancer.com/u/designbyfrances
Users fetched: 4286
Getting profile information from https://www.freelancer.com/u/jameshauge ...
Fetched user info from https://www.freelancer.com/u/jameshauge
Users fetched: 4287
Getting profile information from https://www.freelancer.com/u/MoeAgha ...
Fetched user info from https://www.freelancer.com/u/MoeAgha
Users fetched: 4288
Getting profile information from https://www.freelancer.com/u/Cjmcpherson ...
Fetched user info from https://www.freelancer.com/u/Cjmcpherson
Users fetched: 4289
Getting profile information from https://www.freelancer.com/u/iyamidesignweb ...
Fetched user info from https://www.freelancer.com/u/iyamidesignweb
Users fetched: 4290
Gett

In [None]:
# Build both exports up front: the processed frame (list fields expanded into
# columns) and the raw frame (records exactly as scraped) ...
scraped_pd_df = scraper.get_pd_dataframe()
scraped_pd_df_raw = scraper.get_raw_pd_dataframe()

# ... then preview the first rows of each, processed frame first.
print(scraped_pd_df.head())

print(scraped_pd_df_raw.head())

  search_query        name                                 profile_link  \
0     designer    Milen S.  https://www.freelancer.com/u/MsCaddServices   
1     designer   Jeremy C.         https://www.freelancer.com/u/Conescu   
2     designer  Nichole W.       https://www.freelancer.com/u/NicholeMW   
3     designer   Robert H.     https://www.freelancer.com/u/rhoenig1277   
4     designer     Shea L.      https://www.freelancer.com/u/blaqsupply   

                                             tagline  \
0              Designer with Wide BIM and CAD Skills   
1                       Strategy and Creative Leader   
2                                    www.nicalan.com   
3  Web Developer, Designer, Wordpress, Graphic Ar...   
4       Helping people establish an online presence.   

                                    user_description  \
0  I am a building and interior designer with wid...   
1  I am an award-winning designer with 30 years o...   
2  I am a full service graphic designer with

In [None]:
# Write both frames to CSV without the index column. The file name is the
# search term with spaces replaced by underscores; the raw export gets a
# "_raw" suffix.
scraped_pd_df.to_csv(f'{"_".join(scraper.search.split(" "))}.csv',index=False)
scraped_pd_df_raw.to_csv(f'{"_".join(scraper.search.split(" "))}_raw.csv',index=False)

## Quick test runs on individual profiles

### a. User with badges & certifications

In [None]:
# Scrape one known profile directly, skipping the search step
# (get_user_profile_info does not read max_rows).
scraper = FreelancerScraper(search="designer", max_rows=5000)
scraper.get_user_profile_info('https://www.freelancer.com/u/TatianaLLL')
# bare expression so the notebook displays the collected records
scraper.users_info

Getting profile information from https://www.freelancer.com/u/TatianaLLL ...
Fetched user info from https://www.freelancer.com/u/TatianaLLL


[{'search_query': 'designer',
  'name': 'Tatiana L.',
  'profile_link': 'https://www.freelancer.com/u/TatianaLLL',
  'freelancer_badges': ['plus-membership', 'preferred-freelancer', 'verified'],
  'verifications': [('Preferred Freelancer', True),
   ('Identity Verified', True),
   ('Payment Verified', True),
   ('Phone Verified', True),
   ('Email Verified', True),
   ('Facebook Connected', True)],
  'tagline': 'US Taxes, Bookkeeping, IRS Tax Debt Resolution.',
  'user_description': "Hello, my name is Tatiana Loughman & I'm:\n - an Enrolled Agent,\n - Chartered Economist, \n - QBO ProAdvisor,\n- Authorized E-File Provider.\n\nI'm enrolled to practice before The IRS.\nI have 10+ years of experience. \nI have many satisfied clients.\nI'm very diligent & dedicated to each client.\nI have a Masters in Economics, majoring in Accounting & Auditing;\n Bachelors in Criminal Justice;\n variety of certificates.\n\nI offer IRS Representation:\n- Offer in Compromise,\n- Penalties Abatement,\n- IRS

In [None]:
# Processed export of the single test profile: list fields become one
# column per certification / verification / skill / badge.
processed_df_test = scraper.get_pd_dataframe()
processed_df_test

Unnamed: 0,search_query,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,...,skill_personal_income_tax,skill_tax_accounting,skill_us_taxation,skill_startup_consulting,skill_tax_preparation,skill_workday_compensation,skill_workday_payroll,badge_plus_membership,badge_preferred_freelancer,badge_verified
0,designer,Tatiana L.,https://www.freelancer.com/u/TatianaLLL,"US Taxes, Bookkeeping, IRS Tax Debt Resolution.","Hello, my name is Tatiana Loughman & I'm:\n - ...","Heath, United States","October 27, 2017",$99,5.7,5.0,...,0,0,0,0,0,0,0,True,True,True


In [None]:
# Raw export of the same record: list fields stay as lists in single columns.
raw_df_test = scraper.get_raw_pd_dataframe()
raw_df_test

Unnamed: 0,search_query,name,profile_link,freelancer_badges,verifications,tagline,user_description,certifications,top_skills,location,join_date,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,repeat_hire_rate
0,designer,Tatiana L.,https://www.freelancer.com/u/TatianaLLL,"[plus-membership, preferred-freelancer, verified]","[(Preferred Freelancer, True), (Identity Verif...","US Taxes, Bookkeeping, IRS Tax Debt Resolution.","Hello, my name is Tatiana Loughman & I'm:\n - ...","[(US English 1, 92%), (Preferred Freelancer Pr...","[(Accounting, 29), (Intuit QuickBooks, 27), (F...","Heath, United States","October 27, 2017",$99,5.7,5.0,28,3,100,100,95,20


### b. User without badges & certifications

In [None]:
# Same single-profile check with a fresh scraper, this time on a profile
# used as the "no badges / no certifications" case.
scraper = FreelancerScraper(search="designer", max_rows=5000)
scraper.get_user_profile_info('https://www.freelancer.com/u/TeenaVernekar')
# bare expression so the notebook displays the collected records
scraper.users_info

Getting profile information from https://www.freelancer.com/u/TeenaVernekar ...
Fetched user info from https://www.freelancer.com/u/TeenaVernekar


[{'search_query': 'designer',
  'name': 'Teena V.',
  'profile_link': 'https://www.freelancer.com/u/TeenaVernekar',
  'freelancer_badges': None,
  'verifications': [('Preferred Freelancer', False),
   ('Identity Verified', True),
   ('Payment Verified', True),
   ('Phone Verified', True),
   ('Email Verified', True),
   ('Facebook Connected', False)],
  'tagline': 'Virtual Assistant - Financial Accounting & Billing',
  'user_description': 'I am self-motivated and precise individual with excellent skills in Financial Accounting, Billing, Lead Management, Virtual Assistance, Internet Research & Data Entry & Analysis, Software Quality Assurance & Testing.\n\nI was full time employed with Standard Chartered Bank for close to 5 years in the role of Portfolio Relationship Manager, MIS Officer, Quality Control Reviewer. Currently working as a freelancer. \n\nMy expertise lies in,\n• Internet Research, Web scrapping, Data Analytics, Data Mining, Content Management\n• Financial Accounting, Bill

---

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>