In [60]:
import kaggle
import os
import pandas as pd
import zipfile
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

## Download Data

In [61]:
os.makedirs("datasets", exist_ok=True)

In [62]:
#!/bin/bash
!kaggle datasets download -d urvishahir/global-freelancers-raw-dataset -p datasets/

Dataset URL: https://www.kaggle.com/datasets/urvishahir/global-freelancers-raw-dataset
License(s): CC0-1.0
global-freelancers-raw-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [63]:
# Ekstrak file zip
with zipfile.ZipFile("datasets/global-freelancers-raw-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("datasets/global-freelance")

## 1. Data Understanding

In [64]:
df = pd.read_csv('datasets/global-freelance/global_freelancers_raw.csv')

In [65]:
print("Jumlah data:", df.shape[0])

Jumlah data: 1000


In [66]:
print("Tipe data:")
print(df.dtypes)

Tipe data:
freelancer_ID           object
name                    object
gender                  object
age                    float64
country                 object
language                object
primary_skill           object
years_of_experience    float64
hourly_rate (USD)       object
rating                 float64
is_active               object
client_satisfaction     object
dtype: object


In [67]:
print("Jumlah nilai yang hilang:")
print(df.isnull().sum())

Jumlah nilai yang hilang:
freelancer_ID            0
name                     0
gender                   0
age                     30
country                  0
language                 0
primary_skill            0
years_of_experience     51
hourly_rate (USD)       94
rating                 101
is_active               89
client_satisfaction    176
dtype: int64


In [68]:
df.select_dtypes('number').columns

Index(['age', 'years_of_experience', 'rating'], dtype='object')

In [69]:
for column in df.select_dtypes('number').columns:  # Loop melalui fitur numerik
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"\nOutlier pada kolom {column}:")
    print(outliers)


Outlier pada kolom age:
Empty DataFrame
Columns: [freelancer_ID, name, gender, age, country, language, primary_skill, years_of_experience, hourly_rate (USD), rating, is_active, client_satisfaction]
Index: []

Outlier pada kolom years_of_experience:
    freelancer_ID                    name  gender   age         country  \
135      FL250136           Thomas Ingram    MALE  60.0    South Africa   
325      FL250326          Jennifer Allen  FEMALE  58.0           Spain   
559      FL250560           Rodney Wilson    male  60.0       Argentina   
601      FL250602              Ryan Ewing    male  60.0   United States   
629      FL250630        Angela Blackwell       F  60.0          Turkey   
760      FL250761              John White       m  60.0           China   
953      FL250954  Miss Kimberly Marshall       F  59.0  United Kingdom   
964      FL250965            Latoya Brown  Female  59.0  United Kingdom   
985      FL250986              Bradley Wu       m  60.0           Spain   


In [70]:
print("Deskripsi statistik:")
df.describe()

Deskripsi statistik:


Unnamed: 0,age,years_of_experience,rating
count,970.0,949.0,899.0
mean,40.509278,11.340358,2.51257
std,11.942605,9.68061,1.546599
min,20.0,0.0,0.0
25%,31.0,3.0,1.4
50%,41.0,9.0,2.6
75%,51.0,17.0,3.8
max,60.0,41.0,5.0


## 2. Membersihkan Data

### Memperbaiki kesalahan pada data

In [71]:
# Kolom "gender"
df['gender'] = df['gender'].apply(lambda x: x.lower())
mapper = {
    'f': 'female',
    'm': 'male',
    'male': 'male',
    'female': 'female'
}
df['gender'] = df['gender'].map(mapper, na_action='ignore')

In [72]:
# hourly_rate
df['hourly_rate (USD)'] = df['hourly_rate (USD)'].str.replace(r"\$|USD", "", regex=True).str.strip()
df['hourly_rate (USD)'] = pd.to_numeric(df['hourly_rate (USD)'])

In [73]:
# is_active
mapper = {
    '0': 'no',
    '1': 'yes',
    'N': 'no',
    'Y': 'yes',
    'False': 'no',
    'True': 'yes'
}
df['is_active'] = df['is_active'].map(mapper)

In [74]:
# client_satisfaction
df['client_satisfaction'] = df['client_satisfaction'].str.replace(r"\%", "", regex=True).str.strip()
df['client_satisfaction'] = pd.to_numeric(df['client_satisfaction'])

### Mengisi nilai hilang

In [75]:
df.head()

Unnamed: 0,freelancer_ID,name,gender,age,country,language,primary_skill,years_of_experience,hourly_rate (USD),rating,is_active,client_satisfaction
0,FL250001,Ms. Nicole Kidd,female,52.0,Italy,Italian,Blockchain Development,11.0,100.0,,no,
1,FL250002,Vanessa Garcia,female,52.0,Australia,English,Mobile Apps,34.0,100.0,3.3,yes,84.0
2,FL250003,Juan Nelson,male,53.0,Germany,German,Graphic Design,31.0,50.0,0.0,no,71.0
3,FL250004,Amanda Spencer,female,38.0,Australia,English,Web Development,4.0,40.0,1.5,no,90.0
4,FL250005,Lynn Curtis DDS,female,53.0,Germany,German,Web Development,27.0,30.0,4.8,no,83.0


In [76]:
df.isna().sum()

freelancer_ID            0
name                     0
gender                   0
age                     30
country                  0
language                 0
primary_skill            0
years_of_experience     51
hourly_rate (USD)       94
rating                 101
is_active              262
client_satisfaction    176
dtype: int64

In [77]:
# age
df['age'].fillna(df['age'].mean(), inplace=True)

# years_of_experience
df['years_of_experience'].fillna(df['years_of_experience'].mean(), inplace=True)

# hourly_rate
df['hourly_rate (USD)'].fillna(df['hourly_rate (USD)'], inplace=True)

# rating
df['rating'].fillna(df['rating'].mean(), inplace=True)

# is_active
df['is_active'].fillna(df['is_active'].mode(), inplace=True)

# client_satistaction
df['client_satisfaction'].fillna(df['client_satisfaction'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['years_of_experience'].fillna(df['years_of_experience'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

### Memperbaiki outlier

In [78]:
for column in df.select_dtypes('number').columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)

In [79]:
print("Deskripsi statistik setelah pembersihan:")
print(df.describe())

Deskripsi statistik setelah pembersihan:
               age  years_of_experience  hourly_rate (USD)       rating  \
count  1000.000000          1000.000000         906.000000  1000.000000   
mean     40.509278            11.325358          52.461369     2.512570   
std      11.761920             9.386168          27.323742     1.466335   
min      20.000000             0.000000          20.000000     0.000000   
25%      31.000000             3.000000          30.000000     1.500000   
50%      40.509278            10.000000          40.000000     2.512570   
75%      51.000000            17.000000          75.000000     3.700000   
max      60.000000            38.000000         100.000000     5.000000   

       client_satisfaction  
count          1000.000000  
mean             79.269417  
std              10.433554  
min              60.000000  
25%              71.000000  
50%              79.269417  
75%              87.000000  
max             100.000000  
