In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("wage.csv")

In [5]:
df.head()

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154


In [6]:
df.columns

Index(['year', 'age', 'maritl', 'race', 'education', 'region', 'jobclass',
       'health', 'health_ins', 'logwage', 'wage'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        3000 non-null   int64  
 1   age         3000 non-null   int64  
 2   maritl      3000 non-null   object 
 3   race        3000 non-null   object 
 4   education   3000 non-null   object 
 5   region      3000 non-null   object 
 6   jobclass    3000 non-null   object 
 7   health      3000 non-null   object 
 8   health_ins  3000 non-null   object 
 9   logwage     3000 non-null   float64
 10  wage        3000 non-null   float64
dtypes: float64(2), int64(2), object(7)
memory usage: 257.9+ KB


In [15]:
categorical_columns = ['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']
data_encoded = pd.get_dummies(df, columns=categorical_columns)

data_encoded.head()

Unnamed: 0,year,age,logwage,wage,maritl_1. Never Married,maritl_2. Married,maritl_3. Widowed,maritl_4. Divorced,maritl_5. Separated,race_1. White,...,education_3. Some College,education_4. College Grad,education_5. Advanced Degree,region_2. Middle Atlantic,jobclass_1. Industrial,jobclass_2. Information,health_1. <=Good,health_2. >=Very Good,health_ins_1. Yes,health_ins_2. No
0,2006,18,4.318063,75.043154,1,0,0,0,0,1,...,0,0,0,1,1,0,1,0,0,1
1,2004,24,4.255273,70.47602,1,0,0,0,0,1,...,0,1,0,1,0,1,0,1,0,1
2,2003,45,4.875061,130.982177,0,1,0,0,0,1,...,1,0,0,1,1,0,1,0,1,0
3,2003,43,5.041393,154.685293,0,1,0,0,0,0,...,0,1,0,1,0,1,0,1,1,0
4,2005,50,4.318063,75.043154,0,0,0,1,0,1,...,0,0,0,1,0,1,1,0,1,0


In [17]:
data_encoded.columns

Index(['year', 'age', 'logwage', 'wage', 'maritl_1. Never Married',
       'maritl_2. Married', 'maritl_3. Widowed', 'maritl_4. Divorced',
       'maritl_5. Separated', 'race_1. White', 'race_2. Black',
       'race_3. Asian', 'race_4. Other', 'education_1. < HS Grad',
       'education_2. HS Grad', 'education_3. Some College',
       'education_4. College Grad', 'education_5. Advanced Degree',
       'region_2. Middle Atlantic', 'jobclass_1. Industrial',
       'jobclass_2. Information', 'health_1. <=Good', 'health_2. >=Very Good',
       'health_ins_1. Yes', 'health_ins_2. No'],
      dtype='object')

### Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [19]:
X = data_encoded[['year', 'age', 'maritl_1. Never Married',
       'maritl_2. Married', 'maritl_3. Widowed', 'maritl_4. Divorced',
       'maritl_5. Separated', 'race_1. White', 'race_2. Black',
       'race_3. Asian', 'race_4. Other', 'education_1. < HS Grad',
       'education_2. HS Grad', 'education_3. Some College',
       'education_4. College Grad', 'education_5. Advanced Degree',
       'region_2. Middle Atlantic', 'jobclass_1. Industrial',
       'jobclass_2. Information', 'health_1. <=Good', 'health_2. >=Very Good',
       'health_ins_1. Yes', 'health_ins_2. No']]
y = data_encoded['logwage']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Define the regression algorithms
regressors = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest Regression": RandomForestRegressor()
}
# Define hyperparameters for grid search
param_grid = {
    "Linear Regression": {},
    "Ridge Regression": {"alpha": [0.1, 1.0, 10.0]},
    "Lasso Regression": {"alpha": [0.1, 1.0, 10.0]},
    "Random Forest Regression": {"n_estimators": [100, 200, 300]}
}


In [21]:
# Perform hyperparameter tuning and evaluation for each algorithm
for name, regressor in regressors.items():
    print("Algorithm:", name)
    
    # Hyperparameter tuning using grid search
    grid_search = GridSearchCV(regressor, param_grid[name], cv=5, scoring="neg_mean_absolute_error")
    grid_search.fit(X_train, y_train)
    best_estimator = grid_search.best_estimator_
    
    print("Best Hyperparameters:", grid_search.best_params_)
    
    # Cross-validation
    cv_scores = cross_val_score(best_estimator, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
    cv_mae = -cv_scores.mean()
    print("Cross-Validation MAE:", cv_mae)
    
    # Fit the model
    best_estimator.fit(X_train, y_train)
    
    # Make predictions
    y_pred = best_estimator.predict(X_test)
    
    # Evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R^2) Score:", r2)
    print("----------------------------------------")

Algorithm: Linear Regression
Best Hyperparameters: {}
Cross-Validation MAE: 0.20645623421065623
Mean Absolute Error (MAE): 0.19368169907828042
Mean Squared Error (MSE): 0.06422688492624015
Root Mean Squared Error (RMSE): 0.25343023680342514
R-squared (R^2) Score: 0.415817587474118
----------------------------------------
Algorithm: Ridge Regression
Best Hyperparameters: {'alpha': 10.0}
Cross-Validation MAE: 0.20618902848291323
Mean Absolute Error (MAE): 0.1939528743870088
Mean Squared Error (MSE): 0.06438515797763313
Root Mean Squared Error (RMSE): 0.25374230624323
R-squared (R^2) Score: 0.41437799822567856
----------------------------------------
Algorithm: Lasso Regression
Best Hyperparameters: {'alpha': 0.1}
Cross-Validation MAE: 0.2577572966746864
Mean Absolute Error (MAE): 0.25401352573627906
Mean Squared Error (MSE): 0.10625281202059979
Root Mean Squared Error (RMSE): 0.32596443367428873
R-squared (R^2) Score: 0.03356633074861071
----------------------------------------
Algorithm

Best Model = Linear Regression