In [2]:
import os

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [8]:
# Location of the employee CSV. Set the EMPLOYEE_CSV environment variable to run
# this notebook on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    'EMPLOYEE_CSV',
    'C:/Users/xy200/OneDrive/Desktop/24sping/ds bootcamp/homework6/employee.csv',
)
df = pd.read_csv(DATA_PATH)
print(df.columns)

# Salary spread over the whole dataset: a yardstick for judging how large the
# MAE / MSE values printed at the end of this cell are.
max_salary = df['salary'].max()
min_salary = df['salary'].min()
salary_range = max_salary - min_salary

print(f"max salary: {max_salary}")
print(f"min salary: {min_salary}")
print(f"range of salary: {salary_range}")

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps predict() from failing on a
# category that was not present in the training split.
categorical_features = ['country', 'employment_status', 'job_title', 'is_manager', 'education',
                        'is_education_computer_related', 'certifications']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill missing values with the column mean, then standardize.
numeric_features = ['job_years', 'hours_per_week', 'telecommute_days_per_week']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Route each column group through its transformer. Columns not listed above
# (the printed index also contains 'id' and 'timestamp') are dropped;
# remainder='drop' is the default, spelled out here to make that visible.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop')

# Preprocessing and the linear-regression estimator chained into one pipeline.
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', LinearRegression())])

# 'salary' is the prediction target; every other column is handed to the pipeline.
features = df.drop('salary', axis=1)
target = df['salary']

# The dataset is small, so hold out 30% (rather than 20%) for testing.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

# Fit on the training split only, then predict the held-out rows.
pipeline_lr.fit(X_train, y_train)
y_pred = pipeline_lr.predict(X_test)

# Evaluation metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Linear Regression MAE: {mae}")
print(f"Linear Regression MSE: {mse}")

Index(['id', 'timestamp', 'country', 'employment_status', 'job_title',
       'job_years', 'is_manager', 'hours_per_week',
       'telecommute_days_per_week', 'education',
       'is_education_computer_related', 'certifications', 'salary'],
      dtype='object')
max salary: 10625.0
min salary: 3622.0
range of salary: 7003.0
Linear Regression MAE: 877.2343520345843
Linear Regression MSE: 1381738.0343896605


In [9]:
# Ridge and Lasso reuse the preprocessing recipe from the linear-regression
# pipeline; only the final estimator changes.
def fit_and_score(regressor):
    """Fit `regressor` behind its own copy of the preprocessor and score it on the test split.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn regressor used as the final pipeline step.

    Returns
    -------
    tuple
        (fitted pipeline, test-set predictions, MAE, MSE), where the metrics
        are computed against `y_test`.
    """
    # Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
    # object would refit the instance that pipeline_lr also holds. clone()
    # gives this pipeline an independent, unfitted copy with the same settings.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('regressor', regressor)])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    return (pipeline,
            predictions,
            mean_absolute_error(y_test, predictions),
            mean_squared_error(y_test, predictions))


# Same train/test split as the linear model, so the metrics are directly comparable.
pipeline_ridge, y_pred_ridge, mae_ridge, mse_ridge = fit_and_score(Ridge())
pipeline_lasso, y_pred_lasso, mae_lasso, mse_lasso = fit_and_score(Lasso())

print(f"Ridge Regression MAE: {mae_ridge}")
print(f"Ridge Regression MSE: {mse_ridge}")
print(f"Lasso Regression MAE: {mae_lasso}")
print(f"Lasso Regression MSE: {mse_lasso}")

Ridge Regression MAE: 871.5186719062051
Ridge Regression MSE: 1363719.2100053956
Lasso Regression MAE: 859.9537354716244
Lasso Regression MSE: 1339847.0283128652


Lasso regression has the lowest MAE (and the lowest MSE), so it is the most accurate of the three models on this test split.
For Q3, both ridge regression and lasso regression perform better than the plain linear regression model.