# Data Cleaner

**`Goal:`** Clean the data extracted from freelancer.com in preparation for gender identification, exploratory data analysis (EDA), modeling, and matching.

### 1. Import packages

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from glob import glob

### 2. Load the data

In [None]:
# Collect the CSV files in the raw-data folder, skipping any path that contains '_raw'
file_paths = list(filter(lambda csv_path: '_raw' not in csv_path, glob("../data/raw/*.csv")))
file_paths

['../data/raw/copywriter.csv',
 '../data/raw/software_engineer.csv',
 '../data/raw/designer.csv',
 '../data/raw/accountant.csv']

### 3. Exploring the data to inform cleaning

In [None]:
# Load the first file in file_paths so its columns can be inspected before cleaning
df = pd.read_csv(filepath_or_buffer=file_paths[0])
df.head(n=5)

Unnamed: 0,search_query,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,...,skill_electric_repair,skill_handyman,skill_order_processing,skill_analytics,skill_comics,skill_public_speaking,skill_political_science,skill_activecampaign,skill_automotive_engineering,skill_influencer_marketing
0,copywriter,Marty P.,https://www.freelancer.com/u/mpekar,Very experienced advertising copywriter,I'm a semi-retired advertising copywriter with...,"Greenwich, United States","July 18, 2011",$20,3.6,5.0,...,,,,,,,,,,
1,copywriter,Cat A.,https://www.freelancer.com/u/Cadduci,"Marketing, Business and Administrative Support",My name is Cat and I am a Marketing and Commun...,"Hatboro, United States","August 28, 2021",$15,0.0,0.0,...,,,,,,,,,,
2,copywriter,Jonathan G.,https://www.freelancer.com/u/JG24,"Copy Editor, Copywriter, and Marketing Specialist","Wordsmith, editing extraordinaire, and dedicat...","Warwick, United States","January 26, 2016",$20,0.0,0.0,...,,,,,,,,,,
3,copywriter,Melanie B.,https://www.freelancer.com/u/Melwritesindy,"Creative Writer, Copywriter, Mom",I studied Creative Writing and English Educati...,"Indianapolis, United States","December 14, 2021",$50,0.0,0.0,...,,,,,,,,,,
4,copywriter,Brittney G.,https://www.freelancer.com/u/satisfiedsoulcre,Copywriter & Brand Content Creator,I am your secret weapon to bettering your bran...,"Memphis, United States","April 12, 2021",$25,0.0,0.0,...,,,,,,,,,,


#### a. Observing non-skill columns to inform data cleaning

In [None]:
# Display only the columns whose name does not contain 'skill'
df[[name for name in df.columns if not ('skill' in name)]]

Unnamed: 0,search_query,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,...,certifications_spanish_1,certifications_german_1,certifications_joomla!_1,certifications_drupal_1,certifications_mysql_1,certifications_seo_2,certifications_general_orientation_exam_1,certifications_uk_english_1,certifications_seo_1,certifications_academic_writing_2
0,copywriter,Marty P.,https://www.freelancer.com/u/mpekar,Very experienced advertising copywriter,I'm a semi-retired advertising copywriter with...,"Greenwich, United States","July 18, 2011",$20,3.6,5.0,...,,,,,,,,,,
1,copywriter,Cat A.,https://www.freelancer.com/u/Cadduci,"Marketing, Business and Administrative Support",My name is Cat and I am a Marketing and Commun...,"Hatboro, United States","August 28, 2021",$15,0.0,0.0,...,,,,,,,,,,
2,copywriter,Jonathan G.,https://www.freelancer.com/u/JG24,"Copy Editor, Copywriter, and Marketing Specialist","Wordsmith, editing extraordinaire, and dedicat...","Warwick, United States","January 26, 2016",$20,0.0,0.0,...,,,,,,,,,,
3,copywriter,Melanie B.,https://www.freelancer.com/u/Melwritesindy,"Creative Writer, Copywriter, Mom",I studied Creative Writing and English Educati...,"Indianapolis, United States","December 14, 2021",$50,0.0,0.0,...,,,,,,,,,,
4,copywriter,Brittney G.,https://www.freelancer.com/u/satisfiedsoulcre,Copywriter & Brand Content Creator,I am your secret weapon to bettering your bran...,"Memphis, United States","April 12, 2021",$25,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,copywriter,Theodore G.,https://www.freelancer.com/u/greenequail,Writer and Illustrator,I am a great writer and drawer of great pictur...,"Scottsdale, United States","August 6, 2020",$20,4.7,4.9,...,,,,,,,,,,
659,copywriter,Luminacht,https://www.freelancer.com/u/Luminacht,Professional Editor,"I am a skilled and experienced Editor, Proofre...","Kissimmee, United States","October 5, 2020",$12,0.0,0.0,...,,,,,,,,,,
660,copywriter,Emma D.,https://www.freelancer.com/u/emmadenning132,Copywriter and Editor,I have been Copywriting as a side hustle for a...,"Bryn Mawr, United States","November 10, 2020",$11,0.0,0.0,...,,,,,,,,,,
661,copywriter,Jonathan C.,https://www.freelancer.com/u/tcoffin014,"Email Marketing Copywriter, Mechanical Engineer","Jack of all trades. Mechanical engineer, car a...","Denham Springs, United States","January 10, 2022",$150,0.0,0.0,...,,,,,,,,,,


In [None]:
# List the distinct values stored in one certification column
df['certifications_foundation_vworker_member'].unique()

array([nan, True], dtype=object)

#### b. Noting down to-do's
- ~Get first name~
- ~Drop tagline and user_description~
- ~Drop duplicates and missing data~
- ~Convert join date to more workable version~
- ~Remove '%' from certification columns and reformat as int or float. Maybe also rename columns to `pct_certifications.....`~
- ~Fill badge_plus_membership NA with False~
- ~Maybe? : Remove 'United States' from location (since everything is in the US)~


### 4. Define data cleaning and merging function

In [None]:
def _percent_strings_to_float(series):
    """Strip '%' from the string entries of ``series`` and cast the result to float."""
    return series.replace('%', '', regex=True).astype('float')


def data_cleaner_and_merger(df_list):
    """
    Merge a list of dataframes and clean the combined dataframe.

    Cleaning steps, in order:
      1. Concatenate all frames row-wise with a fresh integer index.
      2. Drop the 'tagline', 'user_description' and 'repeat_hire_rate' columns.
      3. Drop fully duplicated rows.
      4. Drop rows whose 'hourly_rate' is missing.
      5. Replace 'join_date' with 'join_date_from_earliest': the number of days
         between each join date and the earliest join date in the data.
      6. If every hourly rate contains a literal '$', strip it and cast to float.
      7. Strip a trailing last-name initial (e.g. ' P.') from 'name'.
      8. Strip '%' from every column whose name contains 'certifications', cast
         it to float, and prefix the column name with 'pct_'.
      9. Fill missing values in the three badge columns with False.
     10. Keep only the part of 'location' before the first comma.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to merge. After concatenation the columns referenced above
        ('tagline', 'user_description', 'repeat_hire_rate', 'hourly_rate',
        'join_date', 'name', 'location' and the three 'badge_*' columns)
        must be present.

    Returns
    -------
    pandas.DataFrame
        The merged and cleaned dataframe. The index is not reset after rows
        are dropped.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the merged frame.
    """

    # 1. Merge the dataframes into one big dataframe
    df = pd.concat(df_list, ignore_index=True)

    # 2. Drop columns that a) have too much missing data or b) are irrelevant
    df_clean = df.drop(['tagline', 'user_description', 'repeat_hire_rate'], axis=1)

    # 3. Drop duplicates (a no-op when there are none, so no pre-check is needed)
    df_clean = df_clean.drop_duplicates()

    # 4. Drop rows with missing data for hourly_rate column
    df_clean = df_clean.dropna(subset=['hourly_rate'])

    # 5. Convert join date into time on freelancer (baseline: earliest join date)
    join_dates = pd.to_datetime(df_clean['join_date'])
    df_clean['join_date_from_earliest'] = (join_dates - join_dates.min()).dt.days
    df_clean = df_clean.drop(['join_date'], axis=1)

    # 6. If all hourly rates are given in dollars, remove dollar sign and convert to float.
    #    '$' must be matched literally (regex=False): as a regular expression it is the
    #    end-of-string anchor, which matches every string and removes nothing.
    rates = df_clean['hourly_rate']
    if rates.str.contains('$', regex=False).all():
        df_clean['hourly_rate'] = rates.str.replace('$', '', regex=False).astype('float')

    # 7. Remove last name initial. The pattern is anchored to the end of the string and
    #    the dot is escaped so that only a trailing " X." is removed; the middle of a
    #    multi-word name such as "Mary Ann S." is left intact.
    df_clean['name'] = df_clean['name'].str.replace(r'\s+\w\.\s*$', '', regex=True)

    # 8. Remove '%' from certification columns, reformat as float, and rename columns with 'pct_'
    cert_columns = [col for col in df_clean.columns if 'certifications' in col]
    for col in cert_columns:
        try:
            df_clean[col] = _percent_strings_to_float(df_clean[col])
        except (ValueError, TypeError):
            # The column holds values that cannot be cast to float; retry with the
            # string 'True' treated as missing. The column is only overwritten if the
            # retry succeeds, so a failed retry leaves it untouched.
            print(f"error encountered in {col}.")
            try:
                df_clean[col] = _percent_strings_to_float(df_clean[col].replace('True', np.nan))
                print(f"replacing 'True' with NaN. If no further error for column, issue resolved\n")
            except (ValueError, TypeError):
                print(f"error unresolved in {col}. proceeding without resolution\n")

    # Certification columns are renamed even when their conversion failed
    df_clean = df_clean.rename(columns={col: 'pct_' + col for col in cert_columns})

    # 9. Fill missing badges with False
    for badge in ('badge_plus_membership', 'badge_preferred_freelancer', 'badge_verified'):
        df_clean[badge] = df_clean[badge].fillna(False)

    # 10. Remove 'United States' from location (keep the text before the first comma)
    df_clean['location'] = df_clean['location'].str.split(',').str[0]

    return df_clean

    

### 5. Run data cleaner on data files

In [None]:
#Load the data files into pandas dataframes
# Read every path in file_paths into its own dataframe, keeping the same order
files = list(map(pd.read_csv, file_paths))
files[0].head(n=5)

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,search_query,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,...,skill_electric_repair,skill_handyman,skill_order_processing,skill_analytics,skill_comics,skill_public_speaking,skill_political_science,skill_activecampaign,skill_automotive_engineering,skill_influencer_marketing
0,copywriter,Marty P.,https://www.freelancer.com/u/mpekar,Very experienced advertising copywriter,I'm a semi-retired advertising copywriter with...,"Greenwich, United States","July 18, 2011",$20,3.6,5.0,...,,,,,,,,,,
1,copywriter,Cat A.,https://www.freelancer.com/u/Cadduci,"Marketing, Business and Administrative Support",My name is Cat and I am a Marketing and Commun...,"Hatboro, United States","August 28, 2021",$15,0.0,0.0,...,,,,,,,,,,
2,copywriter,Jonathan G.,https://www.freelancer.com/u/JG24,"Copy Editor, Copywriter, and Marketing Specialist","Wordsmith, editing extraordinaire, and dedicat...","Warwick, United States","January 26, 2016",$20,0.0,0.0,...,,,,,,,,,,
3,copywriter,Melanie B.,https://www.freelancer.com/u/Melwritesindy,"Creative Writer, Copywriter, Mom",I studied Creative Writing and English Educati...,"Indianapolis, United States","December 14, 2021",$50,0.0,0.0,...,,,,,,,,,,
4,copywriter,Brittney G.,https://www.freelancer.com/u/satisfiedsoulcre,Copywriter & Brand Content Creator,I am your secret weapon to bettering your bran...,"Memphis, United States","April 12, 2021",$25,0.0,0.0,...,,,,,,,,,,


In [None]:
# Merge all loaded dataframes and clean the combined result
cleaned_df = data_cleaner_and_merger(df_list=files)

error encountered in certifications_us_english_1.
replacing 'True' with NaN. If no further error for column, issue resolved

error encountered in certifications_freelancer_orientation_1.
replacing 'True' with NaN. If no further error for column, issue resolved

error encountered in certifications_employer_orientation_exam_1.
replacing 'True' with NaN. If no further error for column, issue resolved



### 6. Write merged and cleaned dataframe to CSV 

In [None]:
# Save the cleaned dataframe without writing the index as a column
cleaned_df.to_csv(path_or_buf='../data/interim/cleaned-merge.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>