# Read Data

In [272]:
import pandas as pd
import numpy as np

data = pd.read_csv('ds_salaries.csv', header=None)
data.columns = ['work_year', 'experience_level', 'employment_type', 'job_title',
                'salary', 'salary_currency', 'salary_in_usd',
                'employee_residence', 'remote_ratio', 'company_location','company_size']

#data = data.drop(['work_year'],axis=1) #刪除
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
data.head()

Number of instances = 3755
Number of attributes = 11


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


# Missing Data

In [248]:
from sklearn.impute import SimpleImputer

data = data.replace('?',np.NaN)
imputer = SimpleImputer(strategy='mean')

print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))

print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col,data[col].isna().sum()))

Number of instances = 3755
Number of attributes = 11
Number of missing values:
	work_year: 0
	experience_level: 0
	employment_type: 0
	job_title: 0
	salary: 0
	salary_currency: 0
	salary_in_usd: 0
	employee_residence: 0
	remote_ratio: 0
	company_location: 0
	company_size: 0


In [249]:
#if missing value

#data2 = data['employee_residence']
#print('Before replacing missing values:')
#print(data2[20:25])
#data2 = data2.fillna(data2.median())
#print('\nAfter replacing missing values:')
#print(data2[20:25])

In [250]:
print('Number of rows in original data = %d' % (data.shape[0]))
data2 = data.dropna()
print('Number of rows after discarding missing values = %d' % (data2.shape[0]))

Number of rows in original data = 3755
Number of rows after discarding missing values = 3755


# Sampling

In [251]:
from sklearn.model_selection import train_test_split
import random

features = ['salary', 'salary_in_usd', 'remote_ratio'] #特徵列
data[features] = imputer.fit_transform(data[features]) #轉換數值

X = data[features]
y = data["salary_in_usd"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #test_size測試資料比例

In [252]:
# 使用 sample_without_replacement()
sampled_data = random.sample(list(X_train.index), k=100)  #100個樣本

# 取樣
sampled_X_train = X_train.loc[sampled_data]
sampled_y_train = y_train.loc[sampled_data]

sampled_X_train.head()

Unnamed: 0,salary,salary_in_usd,remote_ratio
1013,247500.0,247500.0,0.0
2057,81666.0,81666.0,100.0
606,52500.0,52500.0,100.0
2428,170000.0,170000.0,100.0
894,120000.0,120000.0,100.0


# Binarize

In [253]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=100000) #二元化分界
binary_data = binarizer.transform(data['salary_in_usd'].values.reshape(-1, 1))
data['salary_binary'] = binary_data

print(binary_data)

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [0.]]


In [254]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

categorical_columns = ['salary_currency']
encoded_data = encoder.fit_transform(data[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)

print(encoded_data)

  (0, 7)	1.0
  (1, 19)	1.0
  (2, 19)	1.0
  (3, 19)	1.0
  (4, 19)	1.0
  (5, 19)	1.0
  (6, 19)	1.0
  (7, 19)	1.0
  (8, 19)	1.0
  (9, 19)	1.0
  (10, 19)	1.0
  (11, 19)	1.0
  (12, 19)	1.0
  (13, 19)	1.0
  (14, 19)	1.0
  (15, 19)	1.0
  (16, 19)	1.0
  (17, 19)	1.0
  (18, 19)	1.0
  (19, 19)	1.0
  (20, 19)	1.0
  (21, 19)	1.0
  (22, 19)	1.0
  (23, 19)	1.0
  (24, 19)	1.0
  :	:
  (3730, 7)	1.0
  (3731, 7)	1.0
  (3732, 19)	1.0
  (3733, 19)	1.0
  (3734, 12)	1.0
  (3735, 19)	1.0
  (3736, 19)	1.0
  (3737, 19)	1.0
  (3738, 19)	1.0
  (3739, 7)	1.0
  (3740, 19)	1.0
  (3741, 19)	1.0
  (3742, 19)	1.0
  (3743, 19)	1.0
  (3744, 7)	1.0
  (3745, 19)	1.0
  (3746, 16)	1.0
  (3747, 19)	1.0
  (3748, 7)	1.0
  (3749, 19)	1.0
  (3750, 19)	1.0
  (3751, 19)	1.0
  (3752, 19)	1.0
  (3753, 19)	1.0
  (3754, 12)	1.0


# Discretization

In [255]:
from sklearn.preprocessing import KBinsDiscretizer

n_bins = 5  # 分箱數
strategy = 'uniform'  # 分箱策略: uniform、quantile、kmeans
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='onehot', strategy=strategy)

# 選擇要分箱的特徵列
feature_to_discretize = ['salary_in_usd']

data_binned = discretizer.fit_transform(data[feature_to_discretize])
feature_names = discretizer.get_feature_names_out(feature_to_discretize)

print(data_binned)

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 2)	1.0
  (6, 1)	1.0
  (7, 2)	1.0
  (8, 1)	1.0
  (9, 1)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 2)	1.0
  (14, 1)	1.0
  (15, 1)	1.0
  (16, 0)	1.0
  (17, 1)	1.0
  (18, 1)	1.0
  (19, 1)	1.0
  (20, 1)	1.0
  (21, 3)	1.0
  (22, 1)	1.0
  (23, 2)	1.0
  (24, 1)	1.0
  :	:
  (3730, 0)	1.0
  (3731, 0)	1.0
  (3732, 1)	1.0
  (3733, 2)	1.0
  (3734, 0)	1.0
  (3735, 0)	1.0
  (3736, 0)	1.0
  (3737, 0)	1.0
  (3738, 0)	1.0
  (3739, 0)	1.0
  (3740, 2)	1.0
  (3741, 1)	1.0
  (3742, 1)	1.0
  (3743, 1)	1.0
  (3744, 0)	1.0
  (3745, 1)	1.0
  (3746, 1)	1.0
  (3747, 4)	1.0
  (3748, 0)	1.0
  (3749, 1)	1.0
  (3750, 4)	1.0
  (3751, 1)	1.0
  (3752, 1)	1.0
  (3753, 1)	1.0
  (3754, 1)	1.0


# Standardize

In [256]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

feature_to_scale = ['salary_in_usd']
data[feature_to_scale] = scaler.fit_transform(data[feature_to_scale])

print(data[feature_to_scale])

      salary_in_usd
0         -0.820391
1         -1.706187
2         -1.777563
3          0.593676
4         -0.278686
...             ...
3750       4.352762
3751       0.213009
3752      -0.516603
3753      -0.595909
3754      -0.680528

[3755 rows x 1 columns]


# Normalize

In [257]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

feature_to_scale = ['salary_in_usd']
data[feature_to_scale] = scaler.fit_transform(data[feature_to_scale])

print(data[feature_to_scale])

      salary_in_usd
0          0.181436
1          0.055900
2          0.045784
3          0.381839
4          0.258207
...             ...
3750       0.914581
3751       0.327891
3752       0.224489
3753       0.213250
3754       0.201257

[3755 rows x 1 columns]


# Dimension reduction

In [258]:
from sklearn.decomposition import PCA

features_to_pca = ['salary', 'remote_ratio']

n_components = 2  # 指定降维后的维度
pca = PCA(n_components=n_components)

data_pca = pca.fit_transform(data[features_to_pca])

data[['PCA_1', 'PCA_2']] = data_pca

data.drop(features_to_pca, axis=1, inplace=True)

print(data_pca)

[[-1.10695572e+05 -5.39584513e+01]
 [-1.60695572e+05 -5.40623941e+01]
 [-1.65195572e+05 -5.40717487e+01]
 ...
 [-8.56955717e+04 -5.39064836e+01]
 [-9.06955717e+04 -5.39168776e+01]
 [ 6.80930443e+06  1.04240061e+01]]


# Feature selection

In [269]:
from sklearn.feature_selection import VarianceThreshold


threshold = 0.01  # 方差閾值
variance_threshold = VarianceThreshold(threshold=threshold)
features_to_select = ['work_year', 'salary_in_usd']
selected_data = variance_threshold.fit_transform(data[features_to_select])
selected_feature_names = [feature for feature, is_selected in zip(features_to_select, variance_threshold.get_support()) if is_selected]
data_selected = pd.DataFrame(selected_data, columns=selected_feature_names)
data = pd.concat([data.drop(features_to_select, axis=1), data_selected], axis=1)

print(data_selected)

      work_year  salary_in_usd
0          2023          85847
1          2023          30000
2          2023          25500
3          2023         175000
4          2023         120000
...         ...            ...
3750       2020         412000
3751       2021         151000
3752       2020         105000
3753       2020         100000
3754       2021          94665

[3755 rows x 2 columns]


In [275]:
from sklearn.feature_selection import SelectKBest, f_regression
# 自變量和因變量
X = data[['work_year', 'salary_in_usd', 'remote_ratio']]
y = data['salary']

# 定義
k_best = SelectKBest(score_func=f_regression, k=2)  # 選擇相關性最高的2個

# 選擇
X_new = k_best.fit_transform(X, y)
selected_feature_indices = k_best.get_support(indices=True)
# 索引獲取
selected_features = X.columns[selected_feature_indices]

# 輸出
print(selected_features)

Index(['work_year', 'remote_ratio'], dtype='object')


In [277]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

X = data[['work_year', 'salary_in_usd', 'remote_ratio']]
y = data['salary']

estimator = LinearRegression()
rfe = RFE(estimator, n_features_to_select=1)  # 选择最佳1個特徵

X_new = rfe.fit_transform(X, y)

selected_feature_indices = rfe.support_
selected_features = X.columns[selected_feature_indices]

print(selected_features)

Index(['work_year'], dtype='object')
