# 1. Import Libraries

In [51]:
import numpy as np # Numeric
import pandas as pd # table
from datetime import datetime # date 
import matplotlib.pyplot as plt # Visualization
import seaborn as sns # Visualization
import sklearn # ML
from currency_converter import CurrencyConverter # CurrencyRates

# 2. Loading data

In [52]:
# Import csv file
df = pd.read_csv("Data_Science_Fields_Salary_Categorization.csv")

# 3. Data Check

In [53]:
# Get information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            607 non-null    int64 
 1   Working_Year          607 non-null    int64 
 2   Designation           607 non-null    object
 3   Experience            607 non-null    object
 4   Employment_Status     607 non-null    object
 5   Salary_In_Rupees      607 non-null    object
 6   Employee_Location     607 non-null    object
 7   Company_Location      607 non-null    object
 8   Company_Size          607 non-null    object
 9   Remote_Working_Ratio  607 non-null    int64 
dtypes: int64(3), object(7)
memory usage: 47.5+ KB


In [54]:
# Check header column
df.columns

Index(['Unnamed: 0', 'Working_Year', 'Designation', 'Experience',
       'Employment_Status', 'Salary_In_Rupees', 'Employee_Location',
       'Company_Location', 'Company_Size', 'Remote_Working_Ratio'],
      dtype='object')

In [55]:
# Check first 10 data row
df.head(10)

Unnamed: 0.1,Unnamed: 0,Working_Year,Designation,Experience,Employment_Status,Salary_In_Rupees,Employee_Location,Company_Location,Company_Size,Remote_Working_Ratio
0,1,2020,Data Scientist,MI,FT,6352272.0,DE,DE,L,0
1,2,2020,Machine Learning Scientist,SE,FT,20688070.0,JP,JP,S,0
2,3,2020,Big Data Engineer,SE,FT,8674985.0,GB,GB,M,50
3,4,2020,Product Data Analyst,MI,FT,1591390.0,HN,HN,S,0
4,5,2020,Machine Learning Engineer,SE,FT,11935425.0,US,US,L,50
5,6,2020,Data Analyst,EN,FT,5729004.0,US,US,L,100
6,7,2020,Lead Data Scientist,SE,FT,15118205.0,US,US,S,100
7,8,2020,Data Scientist,MI,FT,2843416.0,HU,HU,L,50
8,9,2020,Business Data Analyst,MI,FT,10741883.0,US,US,L,100
9,10,2020,Lead Data Engineer,SE,FT,9946188.0,NZ,NZ,S,50


In [56]:
# Check for any missing values
df.isnull().sum()

Unnamed: 0              0
Working_Year            0
Designation             0
Experience              0
Employment_Status       0
Salary_In_Rupees        0
Employee_Location       0
Company_Location        0
Company_Size            0
Remote_Working_Ratio    0
dtype: int64

# 4. Data Cleaning

In [57]:
# Convert all column names to lowercase
df.columns = df.columns.str.lower()
df

Unnamed: 0,unnamed: 0,working_year,designation,experience,employment_status,salary_in_rupees,employee_location,company_location,company_size,remote_working_ratio
0,1,2020,Data Scientist,MI,FT,6352272.00,DE,DE,L,0
1,2,2020,Machine Learning Scientist,SE,FT,20688070.00,JP,JP,S,0
2,3,2020,Big Data Engineer,SE,FT,8674985.00,GB,GB,M,50
3,4,2020,Product Data Analyst,MI,FT,1591390.00,HN,HN,S,0
4,5,2020,Machine Learning Engineer,SE,FT,11935425.00,US,US,L,50
...,...,...,...,...,...,...,...,...,...,...
602,603,2022,Data Engineer,SE,FT,12253703.00,US,US,M,100
603,604,2022,Data Engineer,SE,FT,10025757.00,US,US,M,100
604,605,2022,Data Analyst,SE,FT,10264466.00,US,US,M,0
605,606,2022,Data Analyst,SE,FT,11935425.00,US,US,M,100


In [58]:
# Drop unused column
df = df.drop(columns=['unnamed: 0'])

In [59]:
# Replace values in Designation
def title_job(title):
    if 'data scientist' in title.lower() or 'machine learning' in title.lower():
        return 'data scientist'
    elif 'data engineer' in title.lower():
        return 'data engineer'
    elif 'analyst' in title.lower() or 'analytic' in title.lower() or 'bi' in title.lower():
        return 'data analyst'
    else:
        return 'other'

df['job_role'] = df['designation'].apply(title_job)

df = df.drop(columns=['designation'])

In [60]:
# Check unique value in Experience column
unique_experience = df['experience'].unique()
print(unique_experience)

['MI' 'SE' 'EN' 'EX']


In [61]:
#  Replace values in experience-level column
df['experience'] = df['experience'].replace('EN', 'Entry-Level')
df['experience'] = df['experience'].replace('EX', 'Experienced')
df['experience'] = df['experience'].replace('MI', 'Mid-Level')
df['experience'] = df['experience'].replace('SE', 'Senior')

In [62]:
# Check unique value in Employment_Status column
unique_employment_status = df['employment_status'].unique()
print(unique_employment_status)

['FT' 'CT' 'PT' 'FL']


In [63]:
#Replace values in employment_type column
df['employment_status'] = df['employment_status'].replace('FT', 'Full-Time')
df['employment_status'] = df['employment_status'].replace('CT', 'Contractor')
df['employment_status'] = df['employment_status'].replace('FL', 'Freelancer')
df['employment_status'] = df['employment_status'].replace('PT', 'Part-Time')

In [64]:
# Check unique value in company_size column
unique_company_size = df['company_size'].unique()
print(unique_company_size)

['L' 'S' 'M']


In [65]:
#Replace values in Company size column
df['company_size'] = df['company_size'].replace('L', "Large")
df['company_size'] = df['company_size'].replace('M', "Medium")
df['company_size'] = df['company_size'].replace('S', "Small")

In [66]:
df

Unnamed: 0,working_year,experience,employment_status,salary_in_rupees,employee_location,company_location,company_size,remote_working_ratio,job_role
0,2020,Mid-Level,Full-Time,6352272.00,DE,DE,Large,0,data scientist
1,2020,Senior,Full-Time,20688070.00,JP,JP,Small,0,data scientist
2,2020,Senior,Full-Time,8674985.00,GB,GB,Medium,50,data engineer
3,2020,Mid-Level,Full-Time,1591390.00,HN,HN,Small,0,data analyst
4,2020,Senior,Full-Time,11935425.00,US,US,Large,50,data scientist
...,...,...,...,...,...,...,...,...,...
602,2022,Senior,Full-Time,12253703.00,US,US,Medium,100,data engineer
603,2022,Senior,Full-Time,10025757.00,US,US,Medium,100,data engineer
604,2022,Senior,Full-Time,10264466.00,US,US,Medium,0,data analyst
605,2022,Senior,Full-Time,11935425.00,US,US,Medium,100,data analyst


In [73]:
type(df['salary_in_rupees'][0])

str

In [69]:
df['salary_in_rupees'] = df['salary_in_rupees'].astype(int)

NameError: name 'int64' is not defined

In [None]:
# int(df['salary_in_rupees'])
# Create a CurrencyRates object
c = CurrencyConverter()

# Convert salary from rupee(INR) to THB
conversion_rate = c.convert(1, 'INR', 'THB')  # Get conversion rate from 1 INR to THB
print(conversion_rate)
# Rename column
df.rename(columns={'salary_in_rupees': 'salary_in_THB'}, inplace=True)

# Salary values from INR to THB
df['salary_in_THB'] = df['salary_in_THB'] * conversion_rate

# Display the updated DataFrame
df.head(2)

In [None]:
# Create work experience year column
current_year = datetime.now().year # Get the current year

df['work_experience'] = current_year - df['working_year'] # Calculate work experience

df.head(5)

# 4. Data Exploration

In [None]:
df.describe()

In [None]:

df