In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Location of the raw salary benchmarking workbook. The path is held in a single
# variable so that the path declared here is the path that is actually read
# (previously this variable pointed elsewhere and was never used).
input_file_path = "D:/E/Study Materials/Salary Benchmarking/salary_benchmarking_dataset.xlsx"

# Load the provided dataset
df = pd.read_excel(input_file_path)

# Map 'Salary_Range' to numerical values
def map_salary_range(salary):
    """Translate a salary band label into its ordinal code.

    Args:
        salary: One of the labels 'Low', 'Medium' or 'High'.

    Returns:
        0 for 'Low', 1 for 'Medium', 2 for 'High'.

    Raises:
        KeyError: If ``salary`` is any other hashable value.
    """
    # The position of each label in the tuple is its numeric code.
    code_by_label = {
        label: code for code, label in enumerate(('Low', 'Medium', 'High'))
    }
    return code_by_label[salary]

# Store the numeric code of each salary band next to its text label
salary_labels = df['Salary_Range']
df['Salary_Range_Numeric'] = salary_labels.map(map_salary_range)

# Build the feature matrix for dimensionality reduction:
# one-hot encode the categorical columns (dropping the first level of each),
# then remove both representations of the salary band from the features.
categorical_columns = ['Role', 'Location', 'Education_Level']
salary_columns = ['Salary_Range', 'Salary_Range_Numeric']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
X = df_encoded.drop(columns=salary_columns)
df


Unnamed: 0,Role,Location,Education_Level,Experience_Years,Salary_Range,Salary_Range_Numeric
0,Data Scientist,Remote,High School,6,Medium,1
1,Product Manager,New York,High School,19,Medium,1
2,HR Manager,Austin,Bachelor's,14,High,2
3,Data Scientist,Seattle,High School,10,Low,0
4,Marketing Specialist,San Francisco,Master's,7,Medium,1
...,...,...,...,...,...,...
995,Product Manager,Austin,High School,7,Medium,1
996,HR Manager,Austin,PhD,5,Medium,1
997,Data Scientist,New York,Master's,5,High,2
998,Software Engineer,Remote,PhD,7,High,2


In [4]:

# Rescale every feature column with StandardScaler before running PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce the scaled features to a small number of principal components
num_components = 2  # how many principal components are retained
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

# Attach each component's scores to the original dataset as its own column
# (X_pca.T yields one row per component; numbering starts at 1)
for component_number, component_scores in enumerate(X_pca.T, start=1):
    df[f'PCA_Component_{component_number}'] = component_scores



In [5]:
# Report the explained variance ratio of each retained component and their total
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance Ratio by Component: {explained_variance}")
print(f"Total Explained Variance: {sum(explained_variance):.2f}")

# Write the dataset, now including the PCA columns, to a new workbook
# (index=False leaves the row index out of the file)
output_file_path = "D:/E/Study Materials/Salary Benchmarking/salary_benchmarking_with_pca.xlsx"
df.to_excel(output_file_path, index=False)
print("Dataset with PCA components saved.")


Explained Variance Ratio by Component: [0.11925642 0.11640955]
Total Explained Variance: 0.24
Dataset with PCA components saved.
