In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw freelancer dataset, decoding the file as latin1.
RAW_DATA_PATH = "global_freelancers_raw.csv"
df = pd.read_csv(RAW_DATA_PATH, encoding="latin1")

# ***Data Exploration***

In [127]:
# Preview the first rows of the raw data to see how each column is formatted.
df.head()

Unnamed: 0,freelancer_ID,name,gender,age,country,language,primary_skill,years_of_experience,hourly_rate (USD),rating,is_active,client_satisfaction
0,FL250001,Ms. Nicole Kidd,f,52.0,Italy,Italian,Blockchain Development,11.0,100,,0,
1,FL250002,Vanessa Garcia,FEMALE,52.0,Australia,English,Mobile Apps,34.0,USD 100,3.3,1,84%
2,FL250003,Juan Nelson,male,53.0,Germany,German,Graphic Design,31.0,50,0.0,N,71%
3,FL250004,Amanda Spencer,F,38.0,Australia,English,Web Development,4.0,$40,1.5,N,90%
4,FL250005,Lynn Curtis DDS,female,53.0,Germany,German,Web Development,27.0,30,4.8,0,83%


In [128]:
# (rows, columns) of the raw frame.
df.shape

(1000, 12)

In [129]:
# Raw column names, exactly as they appear in the CSV header.
df.columns

Index(['freelancer_ID', 'name', 'gender', 'age', 'country', 'language',
       'primary_skill', 'years_of_experience', 'hourly_rate (USD)', 'rating',
       'is_active', 'client_satisfaction'],
      dtype='object')

In [130]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,years_of_experience,rating
count,970.0,949.0,899.0
mean,40.509278,11.340358,2.51257
std,11.942605,9.68061,1.546599
min,20.0,0.0,0.0
25%,31.0,3.0,1.4
50%,41.0,9.0,2.6
75%,51.0,17.0,3.8
max,60.0,41.0,5.0


In [131]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include="O")

Unnamed: 0,freelancer_ID,name,gender,country,language,primary_skill,hourly_rate (USD),is_active,client_satisfaction
count,1000,1000,1000,1000,1000,1000,906,911,824
unique,1000,992,10,21,16,10,18,8,81
top,FL251000,Amy Lee,FEMALE,South Korea,English,DevOps,40,1,68%
freq,1,2,115,68,215,112,94,190,23


In [132]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_ID        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  970 non-null    float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  949 non-null    float64
 8   hourly_rate (USD)    906 non-null    object 
 9   rating               899 non-null    float64
 10  is_active            911 non-null    object 
 11  client_satisfaction  824 non-null    object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


In [133]:
# Count missing values in each raw column.
df.isna().sum()

freelancer_ID            0
name                     0
gender                   0
age                     30
country                  0
language                 0
primary_skill            0
years_of_experience     51
hourly_rate (USD)       94
rating                 101
is_active               89
client_satisfaction    176
dtype: int64

In [134]:
# Preview the last rows of the raw data.
df.tail()

Unnamed: 0,freelancer_ID,name,gender,age,country,language,primary_skill,years_of_experience,hourly_rate (USD),rating,is_active,client_satisfaction
995,FL250996,Albert Wilcox,Male,56.0,Turkey,Turkish,DevOps,13.0,100,0.0,no,68%
996,FL250997,Cheryl Norris,f,26.0,Germany,German,Blockchain Development,6.0,USD 40,2.8,N,82
997,FL250998,Kathy Watkins,female,37.0,Japan,Japanese,Data Analysis,15.0,75,,False,94%
998,FL250999,John Obrien,m,46.0,Russia,Russian,Machine Learning,22.0,100,2.8,yes,97
999,FL251000,Dawn Green,Female,36.0,Mexico,Spanish,UI/UX Design,18.0,$20,1.7,1,72


# ***Data Cleaning***

### 1. Formatting the column names

In [135]:
# Cleaning column Names
# Title-case every column name; a name containing a space keeps only its first
# whitespace-separated token (so "hourly_rate (USD)" becomes "Hourly_Rate").
col_names = [
    name.split()[0].title() if " " in name else name.title()
    for name in df.columns
]
df.columns = col_names
df.columns

Index(['Freelancer_Id', 'Name', 'Gender', 'Age', 'Country', 'Language',
       'Primary_Skill', 'Years_Of_Experience', 'Hourly_Rate', 'Rating',
       'Is_Active', 'Client_Satisfaction'],
      dtype='object')

### 2. Removing Duplicate rows

In [136]:
# Drop rows that are identical across every column (no `subset` is given),
# keeping the first occurrence; the index is not reset.
df = df.drop_duplicates()

### 3. Handling NaN values

In [137]:
# Missing-value count per column before imputation.
df.isna().sum()

Freelancer_Id            0
Name                     0
Gender                   0
Age                     30
Country                  0
Language                 0
Primary_Skill            0
Years_Of_Experience     51
Hourly_Rate             94
Rating                 101
Is_Active               89
Client_Satisfaction    176
dtype: int64

In [138]:
# Handling NaN values in Age column
# Missing ages are replaced with the median of the observed ages.
age_median = df["Age"].median()
df["Age"] = df["Age"].fillna(age_median)

In [139]:
# Handling NaN values in Years_Of_Experience column
# Missing values are replaced with the median of the observed values.
experience_median = df["Years_Of_Experience"].median()
df["Years_Of_Experience"] = df["Years_Of_Experience"].fillna(experience_median)

In [140]:
# Handling NaN values in Hourly_Rate column
# Raw values mix plain numbers with currency decorations ("100", "$40", "USD 100").
# Pull out the first numeric token, keeping an optional fractional part: deleting
# every non-digit character would also delete the decimal point and turn "$40.50"
# into 4050. Thousands separators are removed first so "1,000" still parses as 1000.
rate_text = df["Hourly_Rate"].astype(str).str.replace(",", "", regex=False)
rate_number = rate_text.str.extract(r"(\d+(?:\.\d+)?)", expand=False)
# Values without any digits (including missing ones) become NaN instead of raising.
df["Hourly_Rate"] = pd.to_numeric(rate_number, errors="coerce").astype("float")
# Impute missing/unparseable rates with the median of the parsed rates.
df["Hourly_Rate"] = df["Hourly_Rate"].fillna(df["Hourly_Rate"].median())

In [141]:
# Handling NaN values in Rating column
# Missing ratings are replaced with the median of the observed ratings.
rating_median = df["Rating"].median()
df["Rating"] = df["Rating"].fillna(rating_median)

In [142]:
# Handling NaN values in Is_Active column
# Normalise whitespace and case so equivalent spellings compare equal.
df["Is_Active"] = df["Is_Active"].str.strip().str.lower()
print(df["Is_Active"].unique())
# Every textual spelling is rewritten to the string "0" or "1"; values that are
# already "0"/"1" pass through unchanged.
inactive_tokens = ("n", "false", "no")
active_tokens = ("y", "true", "yes")
replacements = {
    **dict.fromkeys(inactive_tokens, "0"),
    **dict.fromkeys(active_tokens, "1"),
}
df["Is_Active"] = df["Is_Active"].replace(replacements)
# Fill missing flags with the most frequent value, then store the flag as an integer.
most_common_flag = df["Is_Active"].mode()[0]
df["Is_Active"] = df["Is_Active"].fillna(most_common_flag).astype(int)
df["Is_Active"].dtypes

['0' '1' 'n' 'false' 'true' 'yes' 'y' nan 'no']


dtype('int64')

In [143]:
# Handling NaN values in Client_Satisfaction column
df["Client_Satisfaction"] = df["Client_Satisfaction"].str.strip()
print(df["Client_Satisfaction"].unique())
# Values appear both with and without a trailing "%" ("84%", "82"). Remove everything
# except digits and the decimal point; keeping "." means a fractional score such as
# "84.5%" parses as 84.5 rather than 845.
df["Client_Satisfaction"] = df["Client_Satisfaction"].replace(r"[^0-9.]", "", regex=True)
# Anything that still is not a valid number is coerced to NaN.
df["Client_Satisfaction"] = pd.to_numeric(df["Client_Satisfaction"], errors="coerce")
# Impute missing scores with the mean of the observed scores, rounded to a whole number.
df["Client_Satisfaction"] = df["Client_Satisfaction"].fillna(round(df["Client_Satisfaction"].mean()))

[nan '84%' '71%' '90%' '83%' '94%' '76%' '77%' '86%' '93%' '70%' '69%'
 '60%' '87%' '75%' '68%' '65%' '100%' '92' '89%' '62%' '82' '81%' '63%'
 '67%' '80%' '74%' '85%' '79%' '72%' '64' '88' '96%' '96' '81' '61%' '97%'
 '64%' '73%' '88%' '72' '92%' '82%' '93' '83' '78' '95%' '80' '87' '66%'
 '78%' '68' '91%' '97' '60' '70' '99%' '76' '86' '95' '74' '100' '73' '67'
 '77' '98%' '71' '85' '91' '94' '84' '90' '62' '65' '75' '63' '61' '66'
 '99' '79' '69' '89']


In [144]:
# Confirm that no missing values remain after imputation.
df.isna().sum()

Freelancer_Id          0
Name                   0
Gender                 0
Age                    0
Country                0
Language               0
Primary_Skill          0
Years_Of_Experience    0
Hourly_Rate            0
Rating                 0
Is_Active              0
Client_Satisfaction    0
dtype: int64

### 4. Cleaning The Inconsistent Data

In [151]:
# Freelancer Id column
# Remove leading/trailing whitespace from every identifier.
freelancer_ids = df["Freelancer_Id"]
df["Freelancer_Id"] = freelancer_ids.str.strip()

In [156]:
# Name column
# Remove stray symbols while keeping everything that legitimately occurs in a name:
# letters of any alphabet (`\w` is Unicode-aware for str patterns), digits, spaces,
# periods, apostrophes and hyphens. An ASCII-only class such as [^a-zA-Z0-9 ] would
# corrupt names like "José", "O'Brien" or "Smith-Jones". Underscores match `\w`,
# so they are removed explicitly.
# Whitespace is collapsed and trimmed *after* the removal so that deleting a symbol
# cannot leave doubled or trailing spaces behind.
df["Name"] = (
    df["Name"]
    .str.replace(r"[^\w\s.'-]|_", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [160]:
# Gender Column
# Collapse case/whitespace variants, then map the short and long spellings onto
# two canonical labels. Values outside the mapping are left as they are.
df["Gender"] = df["Gender"].str.lower().str.strip()
print(df["Gender"].unique())
canonical_labels = {"Female": ("f", "female"), "Male": ("m", "male")}
replacements = {
    token: label
    for label, tokens in canonical_labels.items()
    for token in tokens
}
df["Gender"] = df["Gender"].replace(replacements)
print(df["Gender"].unique())

['female' 'male']
['Female' 'Male']


In [165]:
# Country column
# Inspect the distinct values to look for spelling or casing variants.
df["Country"].unique()
# The values listed in the output are already consistent, so no cleaning is applied.

array(['Italy', 'Australia', 'Germany', 'Netherlands', 'Indonesia',
       'United States', 'Turkey', 'United Kingdom', 'Argentina', 'Japan',
       'India', 'Brazil', 'South Korea', 'Russia', 'Canada', 'France',
       'Egypt', 'South Africa', 'China', 'Mexico', 'Spain'], dtype=object)

In [167]:
# Language column
# Inspect the distinct values to look for spelling or casing variants.
df["Language"].unique()
# The values listed in the output are already consistent, so no cleaning is applied.

array(['Italian', 'English', 'German', 'Dutch', 'Indonesian', 'Turkish',
       'Spanish', 'Japanese', 'Hindi', 'Portuguese', 'Korean', 'Russian',
       'French', 'Arabic', 'Afrikaans', 'Mandarin'], dtype=object)

In [170]:
# Primary Skill column
# Inspect the distinct values to look for spelling or casing variants.
df["Primary_Skill"].unique()
# The values listed in the output are already consistent, so no cleaning is applied.

array(['Blockchain Development', 'Mobile Apps', 'Graphic Design',
       'Web Development', 'AI', 'Data Analysis', 'UI/UX Design',
       'Cybersecurity', 'DevOps', 'Machine Learning'], dtype=object)