In [27]:
import pandas as pd
import numpy as np

# Data Visualization
%pip install seaborn
%pip install plotly

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Model 
#import xgboost as xgb
from sklearn.svm import SVR
from tensorflow.keras import layers
from sklearn.tree import  DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model Performance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


2023-06-27 14:41:11.808274: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-27 14:41:13.689066: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-27 14:41:13.691855: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
df = pd.read_csv('./Data/ds_salaries.csv')

In [5]:
df.head()
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

## EDA

In [6]:
categorical = df.columns[df.dtypes== 'object']
labels = {}
for col in categorical:
    labels[col] = df[col].unique()
labels

{'experience_level': array(['SE', 'MI', 'EN', 'EX'], dtype=object),
 'employment_type': array(['FT', 'CT', 'FL', 'PT'], dtype=object),
 'job_title': array(['Principal Data Scientist', 'ML Engineer', 'Data Scientist',
        'Applied Scientist', 'Data Analyst', 'Data Modeler',
        'Research Engineer', 'Analytics Engineer',
        'Business Intelligence Engineer', 'Machine Learning Engineer',
        'Data Strategist', 'Data Engineer', 'Computer Vision Engineer',
        'Data Quality Analyst', 'Compliance Data Analyst',
        'Data Architect', 'Applied Machine Learning Engineer',
        'AI Developer', 'Research Scientist', 'Data Analytics Manager',
        'Business Data Analyst', 'Applied Data Scientist',
        'Staff Data Analyst', 'ETL Engineer', 'Data DevOps Engineer',
        'Head of Data', 'Data Science Manager', 'Data Manager',
        'Machine Learning Researcher', 'Big Data Engineer',
        'Data Specialist', 'Lead Data Analyst', 'BI Data Engineer',
        'Dire

In [7]:
numeric_cols=df.describe().columns


In [8]:
# Extract the value counts for years
data_values = df.work_year.value_counts()

# Create a pie chart
fig = go.Figure(data=go.Pie(
    labels=data_values.index, 
    values=data_values.values, 
    hole=0.4, 
    textinfo='label+percent',
    insidetextorientation='radial',
    marker=dict(
        colors=px.colors.sequential.RdBu, 
        line=dict(
            color='honeydew', 
            width=1
            
        )
    ),
))

# Update layout
fig.update_layout(
    title="Distribution of Data Entries across Years",
    annotations=[dict(text="Year Overview", showarrow=False, font_size=8)],
    height=350
)

# Show the pie chart
fig.show()

## Modeling

In [16]:
processed_df =  df.copy()

#keeping the categorical data
processed_df.drop(axis=1,columns=['work_year', 'job_title', 
                                  'salary_currency', 'salary',
                                    'employee_residence', 'company_location'], inplace=True)

processed_df.tail()

Unnamed: 0,experience_level,employment_type,salary_in_usd,remote_ratio,company_size
3750,SE,FT,412000,100,L
3751,MI,FT,151000,100,L
3752,EN,FT,105000,100,S
3753,EN,CT,100000,100,L
3754,SE,FT,94665,50,L


In [17]:
# Data Preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [50]:
#encode categorical data
# A list to store all the encoder for later use
categorical_cols = processed_df.select_dtypes(include=['object']).columns

cat_encoders = {}

for col in categorical_cols:
    encoder = OrdinalEncoder()
    processed_df[col] =  encoder.fit_transform(processed_df[[col]].to_numpy())
    cat_encoders[col] = encoder
    

    
# Final processed data
processed_df.head()

Unnamed: 0,experience_level,employment_type,salary_in_usd,remote_ratio,company_size
0,3.0,2.0,85847,100,0.0
1,2.0,0.0,30000,100,2.0
2,2.0,0.0,25500,100,2.0
3,3.0,2.0,175000,100,1.0
4,3.0,2.0,120000,100,1.0


In [66]:
#Escalamiento: Comparabilidad y reducción del sesgo

# Initialize Standard Scaler
scaler = StandardScaler()

# Splitted into features and tagert
mirror_data = processed_df.copy()
#mirror_data = mirror_data.drop(['employment_type', 'remote_ratio', 'company_size'], axis=1)
target = mirror_data.pop('salary_in_usd').to_numpy()
features = mirror_data.to_numpy()

# Apply standard scaler
scalled_data = scaler.fit_transform(features)
scalled_data

array([[  0.58573566,   0.02592668,   1.10591825,  -2.34302199],
       [ -0.51784558, -14.95171653,   1.10591825,   2.75877423],
       [ -0.51784558, -14.95171653,   1.10591825,   2.75877423],
       ...,
       [ -2.72500806,   0.02592668,   1.10591825,   2.75877423],
       [ -2.72500806, -14.95171653,   1.10591825,  -2.34302199],
       [  0.58573566,   0.02592668,   0.07674278,  -2.34302199]])

In [67]:
# Generate feature correlation
linear_corr = processed_df.corr(method='pearson').round(2)

# Visualize correlation
corr_heatmap = px.imshow(linear_corr,
                         text_auto=True,
                         labels=dict(x="Features", y="Features",
                                     color="Correlation"),
                         color_continuous_scale=px.colors.sequential.Aggrnyl,
                         title="Linear Correlation (Heatmap)",
                         height=400
                         )

corr_heatmap.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_title="Features",
    yaxis_title="Features",
    coloraxis_colorbar_title="Correlation"
)

corr_heatmap.show()

### Prepare data for processing

In [68]:
# Train & Test split
X_train, X_test, y_train, y_test = train_test_split(
    scalled_data, 
    target, 
    train_size=0.8, 
    test_size=0.2, 
    random_state=42
)

# Create a data frame to maintain performance record
performances = pd.DataFrame(columns=["Model_Name", "MSE", "RMSE", "MAE", "R2_Score"])

In [69]:
def compute_performance(model, test_data):
    """
    Compute performance metrics for a given model.

    Parameters:
        model (object): Trained regression model.
        test_data (tuple): Tuple containing test features (X) and test target (y).

    Returns:
        tuple: Tuple containing the calculated performance metrics (MSE, RMSE, MAE, R^2).

    """
    # Distribute data
    X, y = test_data
    
    # Make predictions
    model_predictions = model.predict(X)
    
    # Compute metrics
    mse = mean_squared_error(y, model_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, model_predictions)
    r2 = r2_score(y, model_predictions)
    
    # Return the performances
    return mse, rmse, mae, r2

In [73]:
from sklearn import svm
from sklearn.pipeline import make_pipeline


clf = svm.SVC()

clf = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='linear'))

clf.fit(X_train,y_train)

In [74]:

# Make predictions on the test data
svr_predictions = clf.predict(X_test)

# Calculate performance metrics
svr_mse, svr_rmse, svr_mae, svr_r2 = compute_performance(model=clf, test_data=[X_test, y_test])

# Store the performance metrics in a DataFrame
performances.loc[1] = ["SVR", svr_mse, svr_rmse, svr_mae, svr_r2]

# Print the performance metrics
print(f"Mean Squared Error: {svr_mse:.2f}")
print(f"Root Mean Squared Error: {svr_rmse:.2f}")
print(f"Mean Absolute Error: {svr_mae:.2f}")
print(f"R-squared: {svr_r2:.2f}")

Mean Squared Error: 4040254772.02
Root Mean Squared Error: 63563.00
Mean Absolute Error: 48893.19
R-squared: -0.02


In [75]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, svr_predictions))

0.03595206391478029


https://www.kaggle.com/code/utkarshsaxenadn/data-science-salaries-eda-data-analysis