# Part 1 -- Introduction -- Classification with iris dataset
Based on the tutorial at https://machinelearningmastery.com/machine-learning-in-python-step-by-step/

In [None]:
# Check the versions of libraries
    
# Python version
import sys
print(f"Python: {sys.version}")

# scipy
import scipy
print(f"scipy: {scipy.__version__}")

# numpy
import numpy as np
print(f"numpy: {np.__version__}")

# matplotlib
import matplotlib
import matplotlib.pyplot as plt
print(f"matplotlib: {matplotlib.__version__}")

#seaborn
import seaborn as sns
sns.set(context="notebook")
print(f"seaborn: {sns.__version__}")

# pandas
import pandas as pd
print(f"pandas: {pd.__version__}")

# scikit-learn
import sklearn
print(f"sklearn: {sklearn.__version__}")

## Step 1 -- Get Data

In [None]:
# Load dataset
# also available in scikit learn: from sklearn.datasets import load_iris
data = "data/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
df = pd.read_csv(data, names=names)

In [None]:
df.head()

In [None]:
plt.figure(figsize=(15, 8))

sns.scatterplot(x="sepal-length", y="sepal-width", hue="class", s=80, data=df)

In [None]:
df.info()

In [None]:
df.shape, df.ndim

## Step 2 -- Familiarize yourself with the data and data preparation 

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df['class'].unique()

In [None]:
df["class"].value_counts()

In [None]:
sns.pairplot(df, hue="class")

In [None]:
df.boxplot(figsize=(15,8))

In [None]:
# One figure per measurement: a per-class box plot in the first panel,
# followed by one distribution plot for each of the three iris classes.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# consider migrating to histplot/displot once the seaborn version is pinned.
class_labels = df["class"].unique()
class_colors = ["b", "orange", "green"]

for feature in df.columns[:-1]:
    fig, ax = plt.subplots(1, 4, figsize=(15,5))

    sns.boxplot(x="class", y=feature, data=df, ax=ax[0])

    # Panel 0 holds the box plot, so the class panels start at index 1
    for panel, color in enumerate(class_colors, start=1):
        label = class_labels[panel - 1]
        samples = df.loc[df["class"] == label, feature]
        sns.distplot(samples, kde=True, color=color, ax=ax[panel])

## Step 3 -- Train Model

In [None]:
# Split-out validation dataset
from sklearn.model_selection import train_test_split

# Use all four measurements. For a quick experiment the list can be narrowed
# to ["petal-length", "petal-width"], but the decision-boundary plot further
# down indexes columns 2 and 3 of X_train and therefore needs the full list.
features = ["sepal-length", "sepal-width", "petal-length", "petal-width"]
X = df[features]
y = df["class"]

# Hold out 20% of the rows for the final test; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

In [None]:
Y_train.value_counts()

In [None]:
def train_model(model, X=None, y=None) -> np.ndarray:
    """Cross-validate *model* and return its per-fold accuracy scores.

    Uses stratified 10-fold cross-validation, shuffled with a fixed seed.

    Args:
        model: A scikit-learn classifier.
        X: Feature matrix. Defaults to the notebook-level ``X_train``.
        y: Class labels. Defaults to the notebook-level ``Y_train``.

    Returns:
        Array with one accuracy value per fold.
    """
    # Fall back to the notebook globals so existing ``train_model(model)``
    # calls keep working; pass X / y explicitly once those globals have been
    # rebound to a different dataset.
    if X is None:
        X = X_train
    if y is None:
        y = Y_train

    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    return cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

In [None]:
# Spot Check Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Candidate classifiers, keyed by the short label used in the output and plots
models = {
    'LR': LogisticRegression(solver='liblinear', multi_class='ovr', fit_intercept=True),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(3),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(gamma='auto'),
}

# Cross-validate every candidate; `results` and `names` stay in matching order
results = []
names = []
for name, model in models.items():
    cv_results = train_model(model)
    names.append(name)
    results.append(cv_results)

    mean_acc = round(cv_results.mean(), 3)
    std_acc = round(cv_results.std(), 3)
    print(f'{name} acc: {mean_acc} (std: {std_acc})')

In [None]:
plt.figure(figsize=(15,8))

# One bar per algorithm: the mean of its cross-validation fold accuracies
plt.bar(names, np.mean(results, axis=1))
# Zoom in on the top of the range so small differences are visible
plt.ylim([0.8, 1])

# The bars are cross-validation accuracies, and the count follows the model list
plt.title(f"Mean cross-validation accuracy of {len(names)} algorithms on iris dataset")
plt.ylabel("accuracy")

# Trailing None suppresses the notebook's echo of the last expression
None

In [None]:
# Compare Algorithms
plt.figure(figsize=(15,8))

# One box per algorithm showing the spread of its cross-validation fold scores.
# Tick labels are set separately because the `labels=` keyword of boxplot is
# deprecated in recent matplotlib releases; boxes sit at positions 1..N.
plt.boxplot(results)
plt.xticks(range(1, len(names) + 1), names)
plt.title('Algorithm Comparison')

# Trailing None suppresses the notebook's echo of the last expression
None

In [None]:
help(models['LR'])

In [None]:
model = LogisticRegression(solver='liblinear', multi_class='ovr').fit(X_train, Y_train)

# coef_ has shape (n_classes, n_features). Transpose it (rather than reshape,
# which would scramble the values) so that rows are features and columns are
# classes, matching the index / columns passed below.
pd.DataFrame(model.coef_.T, columns=model.classes_, index=features)

In [None]:
model.classes_

## Step 4 -- Test Data

In [None]:
# take best model and test it on the holdout test set
model = SVC(gamma='auto').fit(X_train, Y_train)

model.score(X_test, Y_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the held-out rows once and reuse the predictions for every metric
Y_predictions = model.predict(X_test)

# Evaluation
test_accuracy = round(accuracy_score(Y_test, Y_predictions), 3)
conf_matrix = confusion_matrix(Y_test, Y_predictions)
report = classification_report(Y_test, Y_predictions)

print(f"Accuracy on test set: {test_accuracy}")
print(f"Confusion matrix:\n {conf_matrix}")
print(f"Classification report:\n {report}")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder once on the training labels so that the mesh predictions and
# the training points share the same class -> integer mapping, even if some
# class never shows up among the predictions.
lb = LabelEncoder().fit(Y_train)

# train on 2D data for plot (columns 2 and 3 of X_train, i.e. features[2:])
# try different kernels: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# (`degree` only takes effect with kernel='poly')
train_2d = X_train.values[:, 2:]
model = SVC(kernel='rbf', degree=1, gamma='auto').fit(train_2d, Y_train)

# create a mesh to plot in, padded by 1 unit around the training data
x_min, x_max = train_2d[:, 0].min() - 1, train_2d[:, 0].max() + 1
y_min, y_max = train_2d[:, 1].min() - 1, train_2d[:, 1].max() + 1

h = .02  # mesh step size
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
plt.figure(figsize=(15, 8))

# Predict the class of every mesh point, encode it as an integer and put the
# result back into the mesh shape for the color plot
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = lb.transform(Z).reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.5)

# Plot also the training points, colored with the same encoding
plt.scatter(train_2d[:, 0], train_2d[:, 1], c=lb.transform(Y_train), s=50)
plt.xlabel(features[2])
plt.ylabel(features[3])
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

# Part 2 -- Regression
Data source: https://stats.oecd.org/Index.aspx?DataSetCode=BLI#

In [None]:
oecd_bli = pd.read_csv("data/oecd_bli_2020.csv")

oecd_bli.head()

In [None]:
oecd_bli.Measure.unique()

In [None]:
oecd_bli[oecd_bli.Country == 'Australia']

In [None]:
oecd_bli[(oecd_bli.Country == 'Australia') & (oecd_bli.Inequality == "Total")]

In [None]:
oecd_bli_pivot = oecd_bli[oecd_bli.Inequality == "Total"].pivot(index="Country", columns="Indicator", values="Value")

oecd_bli_pivot.head()

In [None]:
oecd_bli_pivot.columns

In [None]:
oecd_bli_pivot["Life satisfaction"]

In [None]:
# Countries ordered from highest to lowest life satisfaction
ranked = oecd_bli_pivot.reset_index().sort_values('Life satisfaction', ascending=False)

plt.figure(figsize=(15,8))
chart = sns.barplot(x='Country', y='Life satisfaction', data=ranked)

# Tilt the country names so neighbouring labels do not overlap
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')

# Trailing None suppresses the notebook's echo of the last expression
None

In [None]:
X = oecd_bli_pivot.drop(columns=["Life satisfaction"])
y = oecd_bli_pivot["Life satisfaction"]

X.shape, y.shape

In [None]:
plt.figure(figsize=(15, 8))

plt.plot(X['Household net adjusted disposable income'], y, ".", markersize=20)

plt.xlabel("Household net adjusted disposable income")
plt.ylabel("Life satisfaction")

In [None]:
oecd_bli_pivot[['Household net adjusted disposable income', 'Life satisfaction']].corr()

In [None]:
# Hold out 30% of the countries for testing. The seed is fixed (as for the
# iris split) so the split and every metric computed from it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Import every estimator by name instead of relying on `sklearn.<submodule>`
# attribute access, which only works if some earlier cell already imported
# that submodule.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Impute missing indicator values with the column mean, standardize, then regress.
clf = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    StandardScaler(),
    # Alternative final estimators to try (import the corresponding module first):
    #sklearn.ensemble.BaggingRegressor(base_estimator=sklearn.svm.SVR()),
    #sklearn.ensemble.GradientBoostingRegressor(loss='ls', n_estimators=20, max_depth=2, learning_rate=0.1),
    #sklearn.neighbors.KNeighborsRegressor(n_neighbors=3),
    Ridge(),
)

# 10-fold cross-validation; with no `scoring` given the estimator's own
# score method is used, which is R^2 for regressors.
scores = cross_val_score(clf, X, y, cv=10)

scores.mean(), scores.std()

In [None]:
# Same preprocessing + Ridge regression as a pipeline would do, spelled out
# step by step. Every transformer is fitted on the training split only.
imp = sklearn.impute.SimpleImputer()
mms = sklearn.preprocessing.StandardScaler()  # NOTE: a StandardScaler despite the name
model = sklearn.linear_model.Ridge()

# Fit imputer and scaler on the training data and transform it in one go
X_train_prepared = mms.fit_transform(imp.fit_transform(X_train))
model.fit(X_train_prepared, y_train)

# Apply the already-fitted transformers to the test data, then predict
X_test_prepared = mms.transform(imp.transform(X_test))
y_pred = model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

In [None]:
plt.figure(figsize=(15, 8))

plt.bar(X_test.index, (y_test - y_pred)**2)

plt.ylabel('squared error')

In [None]:
income_col = 'Household net adjusted disposable income'

plt.figure(figsize=(15, 8))

plt.plot(X_test[income_col], y_test, ".", markersize=20, label='true')
plt.plot(X_test[income_col], y_pred, ".", markersize=10, label='pred')

# Label every test country and connect its true value to its prediction.
# The loop variables are deliberately not called `x` / `y`: `y` is the
# notebook-level target Series and must not be overwritten by a scalar here.
for (country, row), predicted in zip(X_test.iterrows(), y_pred):
    income = row[income_col]
    actual = y_test.loc[country]

    plt.annotate(country, xy=(income, actual), xytext=(0, 10), textcoords="offset points", ha='center')
    plt.plot([income, income], [actual, predicted], color="red", zorder=0, linewidth=1)

plt.xlabel("Household net adjusted disposable income")
plt.ylabel("Life satisfaction")
plt.legend(loc="best")

## Step 5 -- Improve Model

In [None]:
# Pairwise correlations between all indicators (pandas default method)
corr = oecd_bli_pivot.corr()

# Mask only the diagonal, i.e. each indicator's correlation with itself
mask = np.eye(corr.shape[0])

# Diverging palette so negative and positive correlations get distinct hues
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on a dedicated figure, with square cells and colors centered at 0
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

In [None]:
corr['Life satisfaction'].sort_values(ascending=False)

In [None]:
# Compute the rank (Spearman) correlation matrix
corr = oecd_bli_pivot.corr(method='spearman')

# Mask only the diagonal, i.e. each indicator's correlation with itself
mask = np.diag(np.ones_like(corr.iloc[:,0]))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# No `vmax` cap: capping at 0.3 would paint every correlation above 0.3 in
# the same color and make this plot incomparable with the Pearson heatmap.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
corr['Life satisfaction'].sort_values(ascending=False)

In [None]:
# Pick the indicators whose rank correlation with life satisfaction is
# strongest. `features` must be (re)defined here: it otherwise still holds the
# iris column names from Part 1, which do not exist in the OECD data.
# Only the training split is used so the test countries do not influence the
# feature choice.
n_selected = 5
target_corr = X_train.corrwith(y_train, method='spearman').abs()
features = target_corr.sort_values(ascending=False).head(n_selected).index.tolist()

model = sklearn.linear_model.Ridge()
imp = sklearn.impute.SimpleImputer()
mms = sklearn.preprocessing.StandardScaler()  # NOTE: a StandardScaler despite the name

# Fit imputer and scaler on the selected training columns only
imp.fit(X_train[features])
mms.fit(imp.transform(X_train[features]))

model.fit(mms.transform(imp.transform(X_train[features])), y_train)

# Apply the already-fitted transformers to the test data, then predict
y_pred = model.predict(mms.transform(imp.transform(X_test[features])))

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

In [None]:
plt.figure(figsize=(15, 8))

plt.bar(X_test.index, (y_test - y_pred)**2)

plt.ylabel('squared error')

In [None]:
# One figure per selected feature: true vs. predicted life satisfaction of the
# test countries, with a red line marking each prediction error.
for feature in features:
    plt.figure(figsize=(15, 8))

    plt.plot(X_test[feature], y_test, ".", markersize=20, label='true')
    plt.plot(X_test[feature], y_pred, ".", markersize=10, label='pred')

    # The loop variables are deliberately not called `x` / `y`: `y` is the
    # notebook-level target Series and must not be overwritten by a scalar.
    for (country, row), predicted in zip(X_test.iterrows(), y_pred):
        feature_value = row[feature]
        actual = y_test.loc[country]

        plt.annotate(country, xy=(feature_value, actual), xytext=(0, 10), textcoords="offset points", ha='center')
        plt.plot([feature_value, feature_value], [actual, predicted], color="red", zorder=0, linewidth=1)

    plt.xlabel(feature)
    plt.ylabel("Life satisfaction")
    plt.legend(loc="best")