In [None]:
# Sanity-check cell: store the greeting string and print it.
greetings = "Assalam-o-Alaikum!"
print(greetings)

### Import Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, FunctionTransformer 
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import mean_squared_error as MSE

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

### Import Dataset

In [None]:
# Read the housing CSV from the working directory.
df = pd.read_csv("housing.csv")
# Preview a sparse sample: every 200th row, selected by position.
df.iloc[::200]

### Dataset Information

In [None]:
# Print the frame's summary: column names, dtypes, non-null counts and memory use.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# DataFrame.describe() summarises the numeric columns by default.
df.describe()

In [None]:
# Bivariate analysis: pairwise column correlations drawn as an annotated heatmap.
correlation = df.corr()

plt.figure(figsize=(18, 7))
sns.heatmap(data=correlation, annot=True, cmap="coolwarm")
plt.title("Correlation", fontsize=15)
plt.show()

**Split Data into Features and Target**

In [None]:
# Target: the MEDV column.
y = df["MEDV"]

# Features: every remaining column, in its original order.
X = df.drop(columns="MEDV")

### Pipelines using sklearn

In [None]:
# Baseline pipeline: standardise the features, then fit a random forest.
# The forest is seeded so the printed RMSE (and any later predictions made
# with this pipeline) are identical after Restart Kernel -> Run All.
rf_pipeline = Pipeline([("scaler", StandardScaler()),
                        ("model", RandomForestRegressor(random_state = 2))])

# 10-fold cross-validation. scikit-learn reports the MSE negated so that
# "larger is better" holds for every scorer.
scores = cross_val_score(rf_pipeline,
                         X,
                         y,
                         scoring = "neg_mean_squared_error",
                         cv = 10)

# Undo the sign flip, take the square root per fold, then average the fold RMSEs.
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final RMSE:", final_avg_rmse)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)

# Train the pipeline on the training portion only.
rf_pipeline.fit(x_train, y_train)

# Predict the held-out rows, then show every 20th actual/predicted pair.
y_pred = rf_pipeline.predict(x_test)
pd.DataFrame({"Actual": y_test, "Predicted": y_pred}).iloc[::20]

In [None]:
# Overlay the density of the held-out targets and of the model's predictions.
plt.figure(figsize = (18, 7))
sns.kdeplot(x = y_test, label = "Actual", color = "yellow", linewidth = 3)
sns.kdeplot(x = y_pred, label = "Predicted", color = "orange", linewidth = 3)
plt.title("Comparison of Actual and Predicted Values", size = 15)
plt.xlabel("Outcome", size = 15)
plt.ylabel("Density", size = 15)
# The `label=` arguments above are only shown once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend(fontsize = 12)
plt.show()

### Incorporating XGBoost into pipelines

In [None]:
# Reload the housing data so this section can be run on its own.
df = pd.read_csv("housing.csv")

y = df["MEDV"]
X = df.drop(columns="MEDV")

# Same layout as the random-forest pipeline, with XGBoost as the regressor.
xgb_pipeline = Pipeline(steps=[
    ("st_scaler", StandardScaler()),
    ("xgb_model", xgb.XGBRegressor()),
])

# 10-fold CV scored with negated MSE.
scores = cross_val_score(
    estimator=xgb_pipeline,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)

# Per-fold RMSE (sign removed, square root taken), averaged over the folds.
final_avg_rmse = np.sqrt(np.abs(scores)).mean()

print("Final XGB RMSE:", final_avg_rmse)

### Tuning XGBoost hyperparameters in a pipeline

In [None]:
# Reload the housing data so this section can be run on its own.
df = pd.read_csv("housing.csv")

X = df[df.columns.drop("MEDV")]
y = df["MEDV"]

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 2)
xgb_pipeline = Pipeline([("st_scaler", StandardScaler()),
                        ("xgb_model", xgb.XGBRegressor())])

# Candidate values to sample from. The "xgb_model__" prefix routes each
# parameter to the pipeline step with that name.
gbm_param_grid = {"xgb_model__subsample": np.arange(0.05, 1, 0.05),
                  "xgb_model__max_depth": np.arange(3, 20, 1),
                  "xgb_model__colsample_bytree": np.arange(0.1, 1.05, 0.05)}

# Try 10 random parameter combinations, each scored with 4-fold CV.
# random_state pins which combinations are drawn; without it every run
# evaluates different candidates and the results below are not reproducible.
randomized_neg_mse = RandomizedSearchCV(estimator = xgb_pipeline,
                                        param_distributions = gbm_param_grid,
                                        n_iter = 10,
                                        scoring = "neg_mean_squared_error",
                                        cv = 4,
                                        random_state = 2)

randomized_neg_mse.fit(X_train, y_train)

# predict() uses the best pipeline found, refit on the full training set (refit=True default).
y_pred = randomized_neg_mse.predict(X_test)
pd.DataFrame({"Actual": y_test, "Predicted": y_pred})[::20]

In [None]:
# Overlay the density of the held-out targets and of the tuned model's predictions.
plt.figure(figsize = (18, 7))
sns.kdeplot(x = y_test, label = "Actual", color = "lightgreen", linewidth = 3)
sns.kdeplot(x = y_pred, label = "Predicted", color = "green", linewidth = 3)
plt.title("Comparison of Actual and Predicted Values", size = 15)
plt.xlabel("Outcome", size = 15)
plt.ylabel("Density", size = 15)
# The `label=` arguments above are only shown once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend(fontsize = 12)
plt.show()

In [None]:
# Hold-out RMSE: square root of the mean squared error between y_test and y_pred.
rmse = MSE(y_test, y_pred) ** 0.5
rmse

### Import Ames Unprocessed Data

In [None]:
# Read the unprocessed Ames CSV; `df` now refers to this table.
df = pd.read_csv("ames_unprocessed_data.csv")
# Preview every 400th row, selected by position.
df.iloc[::400]

### Data Preparation

In [None]:
# Print the frame's summary: column names, dtypes, non-null counts and memory use.
df.info()

In [None]:
# Replace missing LotFrontage entries with 0; all other columns are left untouched.
df = df.fillna({"LotFrontage": 0})

### Preprocessing within a pipeline

In [None]:
# Target: the SalePrice column.
y = df["SalePrice"]

# Features: every remaining column, in its original order.
X = df.drop(columns="SalePrice")

In [None]:
# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

# Pipeline steps: DictVectorizer turns per-row dicts into a dense matrix
# (string values become one-hot columns), then XGBoost regresses on it.
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor()),
]

# Assemble the pipeline from the steps above.
xgb_pipeline = Pipeline(steps)

# DictVectorizer expects one dict per row, hence the "records" conversion.
xgb_pipeline.fit(X_train.to_dict(orient="records"), y_train)

# Predict the held-out rows and show every 80th actual/predicted pair.
y_pred = xgb_pipeline.predict(X_test.to_dict(orient="records"))
pd.DataFrame({"Actual": y_test, "Predicted": y_pred}).iloc[::80]

In [None]:
# Overlay the density of the held-out sale prices and of the model's predictions.
plt.figure(figsize = (18, 7))
sns.kdeplot(x = y_test, label = "Actual", color = "pink", linewidth = 5)
sns.kdeplot(x = y_pred, label = "Predicted", color = "red", linewidth = 3)
plt.title("Comparison of Actual and Predicted Values", size = 15)
plt.xlabel("Outcome", size = 15)
plt.ylabel("Density", size = 15)
# The `label=` arguments above are only shown once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend(fontsize = 12)
plt.show()

In [None]:
# Hold-out RMSE: square root of the mean squared error between y_test and y_pred.
RMSE = MSE(y_test, y_pred) ** 0.5
RMSE

### Cross-validating XGBoost model

In [None]:
# Fill LotFrontage missing values with 0 (other columns are left as they are).
df = df.fillna({"LotFrontage": 0})

# Pipeline steps: one-hot/vectorise the row dicts, then a shallow XGBoost regressor.
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor(objective="reg:squarederror", max_depth=2)),
]

# Assemble the pipeline from the steps above.
xgb_pipeline = Pipeline(steps)

# 10-fold cross-validation on the row dicts, scored with negated MSE.
cross_val = cross_val_score(
    xgb_pipeline,
    X.to_dict(orient="records"),
    y,
    scoring="neg_mean_squared_error",
    cv=10,
)

# Per-fold RMSE (sign removed, square root taken), averaged over the folds.
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val))))

### Kidney disease case study I: Categorical & Numerical Imputer

In [122]:
# Load the kidney-disease table; "?" entries are parsed as missing values.
df = pd.read_csv("chronic_kidney_disease.csv", na_values="?")
# Show the first three rows.
df.head(3)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,1,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,2,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,3,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd


In [None]:
# All names used here (DataFrameMapper, SimpleImputer, FeatureUnion,
# FunctionTransformer, DictVectorizer, Pipeline, xgb, cross_val_score) come from
# the notebook's import cell. CategoricalImputer is deliberately not imported:
# it was removed from sklearn-pandas 2.0 in favour of scikit-learn's SimpleImputer.

# Work on the kidney-disease frame loaded in the previous cell. X and y must be
# rebuilt here; otherwise they still hold the Ames features and SalePrice.
kidney_data = df

# The target must not contain missing labels, otherwise the 0/1 encoding below
# would silently count them as the negative class.
assert kidney_data["class"].notna().all(), "missing values in target column 'class'"

# Features: everything except the target. `id` looks like a plain row
# identifier in the preview above, so it is excluded as well -- TODO confirm.
X = kidney_data.drop(columns = ["id", "class"], errors = "ignore")

# Encode the target as 0/1 (1 = "ckd"): XGBClassifier and the roc_auc scorer
# need numeric binary labels. strip() guards against stray whitespace in labels.
y = kidney_data["class"].str.strip().eq("ckd").astype(int)

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer: median per numeric column.
# Each column is selected as [name] so the imputer receives 2-D input.
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], SimpleImputer(strategy = "median"))
     for numeric_feature in non_categorical_columns],
    input_df = True,
    df_out = True
)

# Apply categorical imputer: most frequent value per categorical column
# (SimpleImputer's "most_frequent" strategy also works on string data).
categorical_imputation_mapper = DataFrameMapper(
    [([category_feature], SimpleImputer(strategy = "most_frequent"))
     for category_feature in categorical_columns],
    input_df = True,
    df_out = True
)

# Combine the numeric and categorical transformations side by side
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])


def to_row_dicts(features):
    """Convert the stacked imputer output into one dict per row for DictVectorizer."""
    return pd.DataFrame(features).to_dict("records")


# Create full pipeline: impute -> row dicts -> vectorise (one-hot for strings) -> classify.
# sort=False keeps DictVectorizer from sorting the feature keys.
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", FunctionTransformer(to_row_dicts)),
    ("vectorizer", DictVectorizer(sort = False)),
    ("clf", xgb.XGBClassifier())
])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring = "roc_auc", cv = 3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))