In [1]:
import pandas as pd

In [4]:
# Read the wage dataset (path is relative to the working directory), then
# summarise its columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='predicting_wages.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3835 entries, 0 to 3834
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   female  3835 non-null   int64  
 1   cg      3835 non-null   int64  
 2   sc      3835 non-null   int64  
 3   hsg     3835 non-null   int64  
 4   mw      3835 non-null   int64  
 5   so      3835 non-null   int64  
 6   we      3835 non-null   int64  
 7   ne      3835 non-null   int64  
 8   exp1    3835 non-null   float64
 9   exp2    3835 non-null   float64
 10  exp3    3835 non-null   float64
 11  wage    3835 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 359.7 KB


In [5]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,female,cg,sc,hsg,mw,so,we,ne,exp1,exp2,exp3,wage
0,0,0,0,1,0,0,0,1,33.0,10.89,35.937,11.659091
1,0,1,0,0,0,0,0,1,27.0,7.29,19.683,12.825
2,0,0,1,0,0,0,0,1,13.0,1.69,2.197,5.777027
3,0,1,0,0,0,0,0,1,2.0,0.04,0.008,12.46875
4,1,1,0,0,0,0,0,1,15.0,2.25,3.375,18.525


# Feature Engineering


In [7]:
# Separate the target ('wage') from the predictors (every other column),
# then show the shape of each on its own line.
Y = df['wage']
X = df.drop(columns=['wage'])
print(X.shape, Y.shape, sep='\n')

(3835, 11)
(3835,)


In [8]:
#function to calculate model performance using Rsquared and RMSE
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_predictions(y_true, y_pred):
    """Score a set of predictions against the observed targets.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(r2, rmse)`` tuple: the R-squared score first, then the root
        mean squared error (square root of scikit-learn's MSE).
    """
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
    )

# Splitting the dataset and Scaling 

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# Hold out 20% of the rows as the test set. Shuffling is seeded so the
# same partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

# Report the dimensions of both partitions.
print('Train Shape:', x_train.shape)
print('Test Shape:', x_test.shape)

Train Shape: (3068, 11)
Test Shape: (767, 11)


In [31]:
from sklearn.preprocessing import StandardScaler
# Learn the standardisation statistics from the training split only, then
# apply that same fitted transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


# Fitting a Linear Regression model

In [50]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fitted on the standardised training features.
LinReg = LinearRegression().fit(x_train_scaled, y_train)

# Predict the held-out test split and score it with the shared helper,
# which returns (R-squared, RMSE) in that order.
y_pred = LinReg.predict(x_test_scaled)
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  14.574557857992954
R-squared:  0.08517872503108381


# Random Forest

In [46]:
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor

# Random forest of 2000 shallow (depth-2) trees.
# random_state is fixed (same seed as the other seeded models in this
# notebook) so the bootstrap samples, and therefore the reported metrics,
# are reproducible across runs. Without it every run gives different numbers.
regressor = RandomForestRegressor(n_estimators=2000, max_depth=2, random_state=1)

# Fitting the model
regressor.fit(x_train_scaled, y_train)

# Make predictions on the test data
y_pred = regressor.predict(x_test_scaled)

# Calculate R-squared and RMSE on the testing data with the shared helper
# (it returns them in the order (r2, rmse)).
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  15.111752552907818
R-squared:  0.016498229791426677


# Gradient Boosting

In [33]:
# Gradient boosting regressor: 100 depth-3 trees, learning rate 0.1, seeded.
from sklearn.ensemble import GradientBoostingRegressor

GBreg = GradientBoostingRegressor(
    n_estimators=100,
    random_state=1,
    learning_rate=0.1,
    max_depth=3,
).fit(x_train_scaled, y_train)

# Predict the held-out test split and score it with the shared helper,
# which returns (R-squared, RMSE) in that order.
y_pred = GBreg.predict(x_test_scaled)
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  14.734407778927913
R-squared:  0.06500164130141306


# XGB Regressor

In [34]:
import xgboost as xgb

# XGBoost regressor with squared-error objective: 100 depth-3 trees,
# learning rate 0.1, seeded.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=1,
)
xgb_model.fit(x_train_scaled, y_train)

# Predict the held-out test split and score it with the shared helper,
# which returns (R-squared, RMSE) in that order.
y_pred = xgb_model.predict(x_test_scaled)
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  14.7395843239135
R-squared:  0.06434455194272415


# LightGBM

In [35]:
# LightGBM via its native training API
import lightgbm as lgb

# Hyperparameters passed to lgb.train
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'mae',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 1
}

# Wrap the scaled training features and target in LightGBM's Dataset format
train_data = lgb.Dataset(x_train_scaled, label=y_train)

# Train for 100 boosting rounds
lgb_model = lgb.train(params, train_data, num_boost_round=100)

# Predict the held-out test split and score it with the shared helper,
# which returns (R-squared, RMSE) in that order.
y_pred = lgb_model.predict(x_test_scaled)
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  15.376607404105817
R-squared:  -0.018278398546124786


# Catboost

In [20]:
!pip install catboost 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [36]:
from catboost import CatBoostRegressor

# CatBoost regressor: 1000 iterations, learning rate 0.1, RMSE loss.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    loss_function='RMSE',
)
# verbose=False silences the per-iteration training log.
model.fit(x_train_scaled, y_train, verbose=False)

# Predict the held-out test split and score it with the shared helper,
# which returns (R-squared, RMSE) in that order.
y_pred = model.predict(x_test_scaled)
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  15.90267504847223
R-squared:  -0.0891453665137758


# DecisionTree Regressor

In [48]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regression on the standardised training features.
# random_state is fixed on both trees: scikit-learn permutes the candidate
# features at every split, so ties between equally good splits would
# otherwise be broken differently from run to run.

# Fully grown tree (no depth limit). It is fitted here but not evaluated
# in this cell.
dt = DecisionTreeRegressor(random_state=1)
dt.fit(x_train_scaled, y_train)

# Depth-limited ("pruned") tree; this is the one scored below.
dt_pruned = DecisionTreeRegressor(max_depth=5, random_state=1)
dt_pruned.fit(x_train_scaled, y_train)

# Make predictions on the test data with the depth-limited tree
y_pred = dt_pruned.predict(x_test_scaled)

# Calculate R-squared and RMSE on the testing data with the shared helper
# (it returns them in the order (r2, rmse)).
r2, rmse = evaluate_predictions(y_test, y_pred)

# Print the results
print("RMSE: ", rmse)
print("R-squared: ", r2)

RMSE:  14.895885440001628
R-squared:  0.044395632633592697


# Cross Validation on Linear Regression

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

LinReg = LinearRegression()

# Scale inside the pipeline so that, in every fold, the scaler is fitted on
# that fold's training portion only. Cross-validating on features that were
# already scaled with statistics from the whole training split would leak
# information from each validation fold into preprocessing.
cv_pipeline = make_pipeline(StandardScaler(), LinReg)

# Perform cross-validation with 10 folds on the (unscaled) training split.
# NOTE: these scores are estimated on the training split only, so they are
# not directly comparable with the held-out test-set metrics of the models
# fitted elsewhere in this notebook.
scoring = ['neg_mean_squared_error', 'r2']
cv_scores = cross_validate(cv_pipeline, x_train, y_train, cv=10, scoring=scoring)

# The scorer returns negated MSE per fold: flip the sign of the mean and
# take the square root for RMSE; R-squared is averaged across folds.
rmse_cv = np.sqrt(-cv_scores['test_neg_mean_squared_error'].mean())
r2_cv = cv_scores['test_r2'].mean()

# Print the results
print("Cross-validation RMSE: ", rmse_cv)
print("Cross-validation R-squared: ", r2_cv)

Cross-validation RMSE:  12.433564597565878
Cross-validation R-squared:  0.11489705925835722


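The linear regression model performs best of all the models on the held-out test set: it has the lowest RMSE (≈14.57) and the highest R-squared (≈0.085). Note that the cross-validation scores above were computed on the training split only, so they are not directly comparable with the test-set metrics of the other models.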