In [10]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("wage.csv")

In [3]:
df.head()

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154


In [4]:
df.columns

Index(['year', 'age', 'maritl', 'race', 'education', 'region', 'jobclass',
       'health', 'health_ins', 'logwage', 'wage'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        3000 non-null   int64  
 1   age         3000 non-null   int64  
 2   maritl      3000 non-null   object 
 3   race        3000 non-null   object 
 4   education   3000 non-null   object 
 5   region      3000 non-null   object 
 6   jobclass    3000 non-null   object 
 7   health      3000 non-null   object 
 8   health_ins  3000 non-null   object 
 9   logwage     3000 non-null   float64
 10  wage        3000 non-null   float64
dtypes: float64(2), int64(2), object(7)
memory usage: 257.9+ KB


### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
X = df[['year', 'age', 'maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']]
y = df['wage']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, columns=['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model and fit it to the training data
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the test data and compute the RMSE and R2 score
y_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

coefficients = pd.DataFrame({'Variable': X.columns, 'Coefficient': lr.coef_})
print(coefficients)
print("RMSE:", rmse)
print("R2 score:", r2)
print("Model performance: ", lr.score(X_test, y_test))

                        Variable  Coefficient
0                           year     1.200968
1                            age     0.339230
2        maritl_1. Never Married    -6.241468
3              maritl_2. Married     9.975757
4              maritl_3. Widowed    -4.936131
5             maritl_4. Divorced    -3.550390
6            maritl_5. Separated     4.752233
7                  race_1. White     3.810239
8                  race_2. Black    -2.295863
9                  race_3. Asian     0.850869
10                 race_4. Other    -2.365245
11        education_1. < HS Grad   -21.398640
12          education_2. HS Grad   -14.571051
13     education_3. Some College    -4.480829
14     education_4. College Grad     9.679306
15  education_5. Advanced Degree    30.771213
16     region_2. Middle Atlantic     0.000000
17        jobclass_1. Industrial    -1.794902
18       jobclass_2. Information     1.794902
19              health_1. <=Good    -3.532371
20         health_2. >=Very Good  

###  Random Forest

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)
print("Model performance: ", rf.score(X_test, y_test))

Mean squared error: 1068.9972278076125
Model performance:  0.31431372205484875


### SVR

In [19]:
from sklearn.svm import SVR

In [20]:
# Train a support vector regression model with a radial basis function kernel
svr = SVR(kernel="rbf")
svr.fit(X_train, y_train)

# Test the accuracy of the model on the test set
score = svr.score(X_test, y_test)
print("R-squared value:", score)
print("Model performance: ", svr.score(X_test, y_test))

R-squared value: -0.021859321369520845
Model performance:  -0.021859321369520845
