In [None]:
# Common imports
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Ignore useless warnings (see SciPy issue #5998)
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
# Load the dataset from the subfolder specified in filepath.
filepath = "dataset/adult.data"

# header=None keeps the first line of the file as a data record. Without it
# pandas consumes that line as column names and the first sample is lost.
# NOTE(review): assumes the file has no header row (as the UCI Adult data file
# does) - confirm against the local copy.
df = pd.read_csv(filepath, sep=',', header=None)

# Label the columns 1..N; the following cell maps these integers to real names.
no_columns = len(df.columns)
new_column_names = list(range(1, no_columns + 1))
df.columns = new_column_names
df.head()

In [None]:
# Rename all columns: position k (1-based) receives the k-th name below.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'the-label',
]
df = df.rename(columns=dict(enumerate(column_names, start=1)))
df.info()

In [None]:
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    Each distinct value in a non-numeric column is given an integer code
    starting at 0. Codes are handed out in sorted order of the distinct
    values, so the same input always produces the same encoding (iterating a
    ``set`` of strings does not guarantee a stable order between interpreter
    sessions).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its non-numeric columns replaced by
        integer codes. Integer and float columns are left untouched.
    """
    for column in df.columns.values:
        # Skip any signed/unsigned integer or float dtype, not only the
        # 64-bit ones, so e.g. int32 columns are not re-coded by accident.
        if df[column].dtype.kind in "iuf":
            continue

        column_contents = df[column].values.tolist()

        # Sort on (type name, text) so that columns mixing value types can
        # still be ordered without raising a TypeError.
        ordered_values = sorted(
            set(column_contents),
            key=lambda value: (type(value).__name__, str(value)),
        )
        text_digit_vals = {value: code for code, value in enumerate(ordered_values)}

        # Look codes up from the same list the mapping was built from.
        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

In [None]:
# Swap every non-numeric column for integer codes, then preview the first rows.
df = handle_non_numerical_data(df)
df.head(n=5)

In [None]:
# Pairwise correlations between all columns.
corr_matrix = df.corr()

# Rank every column by its correlation with the label, strongest positive first.
label_correlations = corr_matrix["the-label"]
label_correlations.sort_values(ascending=False)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn. preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# The target attribute is passed through unchanged.
targetAttributes = ["the-label"]

# Every feature attribute is standardized.
scaledAttributes = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
]

# The target transformer is listed first, ahead of the scaled features.
fullPipeline = ColumnTransformer(
    transformers=[
        ("target", 'passthrough', targetAttributes),
        ("scaled", StandardScaler(), scaledAttributes),
    ]
)

dataPrepared = fullPipeline.fit_transform(df)

# Show the prepared dataset (fit_transform returns a NumPy array, not a DataFrame).
dataPrepared

In [None]:
# (rows, columns) of the prepared array.
np.shape(dataPrepared)

In [None]:
# ColumnTransformer emits its outputs in transformer order: the passthrough
# target ("the-label") is column 0, followed by the standardized features.
# Features are therefore every column after the first and the target is
# column 0. Slicing [:, 0:4] / [:, 4] would leak the label into X and use the
# standardized "education" column as the target.
X = dataPrepared[:, 1:]
y = dataPrepared[:, 0]

In [None]:
# One seed shared by the split and the estimator so a re-run reproduces results.
RANDOM_STATE = 13

# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)

# Keyword arguments for the gradient boosting regressor built in the next cell.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Seed the estimator too; otherwise repeated fits are not guaranteed to
    # build identical trees.
    "random_state": RANDOM_STATE,
}

In [None]:
# Build the regressor from the shared hyper-parameters and fit it on the training split.
reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Error measured on the same rows the model was fitted on.
train_predictions = reg.predict(X_train)
mse = mean_squared_error(y_train, train_predictions)
print("The mean squared error (MSE) on train set: {:.4f}".format(mse))

In [None]:
# Error measured on the held-out rows.
test_predictions = reg.predict(X_test)
mse = mean_squared_error(y_test, test_predictions)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

In [None]:
# Test-set error after each boosting stage. The estimator's `loss_` attribute
# was deprecated in scikit-learn 1.1 and removed in 1.3, so the squared-error
# loss is computed directly with mean_squared_error instead.
n_stages = params["n_estimators"]
test_score = np.zeros((n_stages,), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

# 1-based stage numbers shared by both curves.
iterations = np.arange(n_stages) + 1

# Training vs. test error as a function of the number of boosting stages.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title("Deviance")
ax.plot(iterations, reg.train_score_, "b-", label="Training Set Deviance")
ax.plot(iterations, test_score, "r-", label="Test Set Deviance")
ax.legend(loc="upper right")
ax.set_xlabel("Boosting Iterations")
ax.set_ylabel("Deviance")
fig.tight_layout()
plt.show()