# Read Data

In [54]:
import pandas as pd
data = pd.read_csv('ds_salaries.csv', header=None)
data.columns = ['work_year', 'experience_level', 'employment_type', 'job_title',
                'salary', 'salary_currency', 'salary_in_usd',
                'employee_residence', 'remote_ratio', 'company_location','company_size']

data = data.drop(['work_year'],axis=1)
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
data.head()

Number of instances = 3755
Number of attributes = 10


Unnamed: 0,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


# Missing Data

In [55]:
import numpy as np
from sklearn.impute import SimpleImputer

data = data.replace('?',np.NaN)
imputer = SimpleImputer(strategy='mean')

print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))

print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col,data[col].isna().sum()))

Number of instances = 3755
Number of attributes = 10
Number of missing values:
	experience_level: 0
	employment_type: 0
	job_title: 0
	salary: 0
	salary_currency: 0
	salary_in_usd: 0
	employee_residence: 0
	remote_ratio: 0
	company_location: 0
	company_size: 0


In [56]:
#if missing value

#data2 = data['employee_residence']
#print('Before replacing missing values:')
#print(data2[20:25])
#data2 = data2.fillna(data2.median())
#print('\nAfter replacing missing values:')
#print(data2[20:25])

In [57]:
print('Number of rows in original data = %d' % (data.shape[0]))
data2 = data.dropna()
print('Number of rows after discarding missing values = %d' % (data2.shape[0]))

Number of rows in original data = 3755
Number of rows after discarding missing values = 3755


# Sampling

In [58]:
from sklearn.model_selection import train_test_split
import random

features = ['salary', 'salary_in_usd', 'remote_ratio'] #特徵列
data[features] = imputer.fit_transform(data[features]) #轉換數值

X = data[features]
y = data["salary_in_usd"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #test_size測試資料比例

In [59]:
# 使用 sample_without_replacement()
sampled_data = random.sample(list(X_train.index), k=100)  #100個樣本

# 取樣
sampled_X_train = X_train.loc[sampled_data]
sampled_y_train = y_train.loc[sampled_data]

sampled_X_train.head()

Unnamed: 0,salary,salary_in_usd,remote_ratio
2152,243000.0,243000.0,100.0
1215,180000.0,180000.0,100.0
2674,191475.0,191475.0,100.0
1290,110000.0,110000.0,100.0
2095,165000.0,165000.0,0.0


# Binarize

In [77]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=60000) #二元化分界

binary_data = binarizer.transform(data['salary_in_usd'].values.reshape(-1, 1))
data['salary_binary'] = binary_data

  data['salary_binary'] = binary_data


In [81]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data['salary_in_usd'].values.reshape(-1, 1))

encoded_columns = encoder.get_feature_names_out(['salary_in_usd'])
data[encoded_columns] = encoded_data.toarray()

print(encoded_columns)

['salary_in_usd_5132.0' 'salary_in_usd_5409.0' 'salary_in_usd_5679.0' ...
 'salary_in_usd_423834.0' 'salary_in_usd_430967.0'
 'salary_in_usd_450000.0']
