In [None]:
# Daily Challenge: Data Handling and Analysis in Python


# What You Will Learn
# Advanced techniques for data normalization, reduction, and aggregation.
# Skills in gathering, exploring, integrating, and cleaning data using Python.
# Proficiency in using Pandas for complex data manipulation.


# Your Task
# Download and import the Data Science Job Salary dataset.
# Normalize the ‘salary’ column using Min-Max normalization which scales all salary values between 0 and 1.
# Implement dimensionality reduction like Principal Component Analysis (PCA) or t-SNE to reduce the number of features (columns) in the dataset.
# Group the dataset by the ‘experience_level’ column and calculate the average and median salary for each experience level (e.g., Junior, Mid-level, Senior).

# Hint:
# As a reminder, normalization is crucial when dealing with data that has different ranges. For example, salary data might have a wide range (e.g., from $20,000 to $200,000). By scaling the data using Min-Max normalization, you make sure that all salary values fall within a consistent range (0 to 1). This is particularly helpful when the data is going to be used in machine learning models, as some algorithms (like k-nearest neighbors or neural networks) perform better when features are normalized. It ensures that the salary feature does not dominate the learning process simply because of its larger scale, making the analysis more balanced.

# Dimensionality reduction helps simplify complex datasets by reducing the number of variables under consideration. This can make the data more manageable and help avoid the curse of dimensionality—a phenomenon where machine learning models struggle when dealing with high-dimensional data.
# PCA, for instance, helps in retaining the most important information (variance) from the dataset while reducing noise and redundancy.
# It can also speed up the training process for models and help in visualizing data in fewer dimensions.

# Aggregating data helps in understanding trends within subgroups of the dataset.
# Calculating average and median salaries for each experience level gives insights into the compensation distribution and disparities across different job levels. This kind of aggregation can help in answering business questions like “How does salary evolve with experience?” or “What is the salary distribution for senior-level roles?”

In [None]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

import os

# Location of the Data Science Job Salary CSV.
# The path can be overridden with the SALARY_CSV environment variable so the
# notebook can run on a machine other than the author's; when the variable is
# not set, the original location is used unchanged.
DATA_PATH = os.environ.get(
    'SALARY_CSV',
    '/Users/teitelbaumsair/Desktop/DI_Bootcamp/W4/D4/DAILY CHALLENGE/datascience_salaries.csv',
)

# Load the raw dataset and show the first rows as a quick sanity check.
df = pd.read_csv(DATA_PATH)
print(df.head())


   Unnamed: 0       job_title   job_type experience_level       location  \
0           0  Data scientist  Full Time           Senior  New York City   
1           2  Data scientist  Full Time           Senior         Boston   
2           3  Data scientist  Full Time           Senior         London   
3           4  Data scientist  Full Time           Senior         Boston   
4           5  Data scientist  Full Time           Senior  New York City   

  salary_currency  salary  
0             USD  149000  
1             USD  120000  
2             USD   68000  
3             USD  120000  
4             USD  149000  


In [None]:
# Min-Max normalization of the 'salary' column into a new
# 'salary_normalized' column.
scaler = MinMaxScaler()

if 'salary' in df.columns:
    # The scaler is given a one-column DataFrame (double brackets), not a Series.
    df['salary_normalized'] = scaler.fit_transform(df[['salary']])

    # Only show the comparison when the normalized column was actually
    # created; printing it unconditionally would raise a KeyError right after
    # the "column not found" message below.
    print("\nSalary Normalized:")
    print(df[['salary', 'salary_normalized']].head())
else:
    print("Error: 'salary' column not found in the dataset!")


Salary Normalized:
   salary  salary_normalized
0  149000           0.601010
1  120000           0.454545
2   68000           0.191919
3  120000           0.454545
4  149000           0.601010


In [None]:
# Reduce the numeric columns of the dataset to two principal components,
# stored back on `df` as 'PCA1' and 'PCA2'.
#
# Columns written by a previous run of this cell are left out of the input so
# that re-running the cell does not feed the old PCA1/PCA2 values back into
# the decomposition.
PCA_COLUMNS = ['PCA1', 'PCA2']
numeric_df = (
    df.select_dtypes(include=[np.number])
    .drop(columns=PCA_COLUMNS, errors='ignore')
)

# NOTE(review): the numeric columns are passed to PCA unscaled, and they appear
# to include the CSV's 'Unnamed: 0' index column as well as both 'salary' and
# 'salary_normalized' -- confirm these are the features you want to decompose.
if numeric_df.shape[1] > 1:
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(numeric_df)

    # One output column per component, in component order.
    df['PCA1'] = pca_result[:, 0]
    df['PCA2'] = pca_result[:, 1]

    print("\nPCA Results:")
    print(df[PCA_COLUMNS].head())
else:
    print("Error: Not enough numeric features for PCA!")

NameError: name 'df' is not defined

In [6]:
# Mean and median salary for each experience level, one row per level.
if 'experience_level' not in df.columns:
    print("Error: 'experience_level' column not found in the dataset!")
else:
    salary_stats = (
        df.groupby('experience_level')['salary']
        .agg(['mean', 'median'])
        # Give the aggregate columns descriptive names.
        .rename(columns={'mean': 'average_salary', 'median': 'median_salary'})
        .reset_index()
    )
    print("\nSalary Statistics by Experience Level:")
    print(salary_stats)


Salary Statistics by Experience Level:
  experience_level  average_salary  median_salary
0            Entry    36111.111111        30000.0
1        Executive    76076.923077        46000.0
2              Mid    51786.885246        51000.0
3           Senior    75088.033012        68000.0
