In [None]:
%matplotlib notebook
import urllib.request
import gzip
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from numpy.random import seed
from tensorflow import set_random_seed

# Seed NumPy's global random generator so that runs are reproducible
seed(1)
# Seed TensorFlow's random generator so that runs are reproducible
# NOTE(review): `set_random_seed` is imported from the top-level `tensorflow`
# package above; presumably this import only resolves on TensorFlow 1.x -
# confirm the installed version before upgrading
set_random_seed(2)

In [None]:
def load_covtype_dataset():
    '''
    Make the uncompressed Cover Type dataset available in the current
    working directory, downloading it from the UCI repository when it
    is not already there.

    Returns the path (a string) of the local, uncompressed file.
    Any network or I/O error raised while downloading is propagated and
    no partially written dataset file is left behind.
    '''
    # Local import: shutil is only needed by this function
    import shutil

    CURRENT_DIR = os.getcwd()
    COVTYPE_FILENAME = 'covtype.data'
    COVTYPE_DATA_PATH = os.path.join(CURRENT_DIR, COVTYPE_FILENAME)
    COVTYPE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
    if os.path.isfile(COVTYPE_DATA_PATH):
        print('Using local cached copy in', COVTYPE_DATA_PATH)
        return COVTYPE_DATA_PATH

    print('Dataset not found locally. Downloading in', COVTYPE_DATA_PATH)
    # Download into a temporary sibling file and rename it only once the
    # whole archive has been decompressed: an interrupted download must not
    # leave a truncated file that the next run would mistake for the cache
    partial_path = COVTYPE_DATA_PATH + '.part'
    try:
        with urllib.request.urlopen(COVTYPE_URL) as response, \
                gzip.GzipFile(fileobj=response) as uncompressed, \
                open(partial_path, 'wb') as out_file:
            # Stream in chunks instead of holding the whole dataset in memory
            shutil.copyfileobj(uncompressed, out_file)
        os.replace(partial_path, COVTYPE_DATA_PATH)
    finally:
        # After a successful rename the partial file no longer exists;
        # after a failure this removes the incomplete download
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return COVTYPE_DATA_PATH

In [None]:
# Make sure the dataset is available locally and keep the path to the file
covtype_file = load_covtype_dataset()

In [None]:
# The file is read without a header row (header=None), so the columns
# get pandas' default labels for now
df_covtype = pd.read_csv(covtype_file, header=None)

In [None]:
# Summary statistics of every column, as a first sanity check of the data
df_covtype.describe()

In [None]:
def features_renaming(df_covtype):
    '''
    Give every column of the raw dataframe a meaningful label.

    The dataframe is renamed in place and also returned:
    - the first 14 columns get the terrain / wilderness area names
    - the next 40 columns become Soil_Type_1 ... Soil_Type_40
    - the column labelled 54 becomes Cover_Type
    '''
    terrain_names = ['Elevation', 'Aspect', 'Slope',
                     'Horizontal_Distance_To_Hydrology',
                     'Vertical_Distance_To_Hydrology',
                     'Horizontal_Distance_To_Roadways',
                     'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                     'Horizontal_Distance_To_Fire_Points',
                     'Wilderness_Area_1', 'Wilderness_Area_2',
                     'Wilderness_Area_3', 'Wilderness_Area_4']
    soil_names = [f'Soil_Type_{soil_id}' for soil_id in range(1, 41)]
    # Each group of columns is picked by position and renamed in its own step
    for first, last, new_names in ((0, 14, terrain_names),
                                   (14, 54, soil_names)):
        current_names = df_covtype.columns[np.arange(first, last)]
        df_covtype.rename(columns=dict(zip(current_names, new_names)),
                          inplace=True)
    # The target column is addressed by its original label
    df_covtype.rename(columns={54: 'Cover_Type'}, inplace=True)
    return df_covtype

In [None]:
# The column labels are still numeric: give each column a descriptive name
# (features_renaming renames the dataframe in place and returns it)
df_covtype = features_renaming(df_covtype)

In [None]:
# Check the result: the first rows should now show the descriptive labels
df_covtype.head()

In [None]:
# Shrink the dataframe by giving every column a narrower dtype.
# This step is optional, it only keeps the RAM usage low

# Column labels, in dataframe order
df_covtype_headers = df_covtype.columns.values.tolist()
# New dtype of each column, aligned position by position with the labels
features_new_dtype_list = []
for i in range(len(df_covtype_headers)):
    if i in (3, 4):
        # Horizontal_Distance_To_Hydrology and Vertical_Distance_To_Hydrology
        # could fit in float16
        # but float32 works better with mean and std calculations
        new_dtype = 'float32'
    elif i < 10:
        # The other columns among the first ten can have int16
        new_dtype = 'int16'
    else:
        # The remaining columns (binary indicators and the Cover_Type label)
        # hold small non-negative integers, so uint8 is enough
        new_dtype = 'uint8'
    features_new_dtype_list.append(new_dtype)
# {column label: new dtype}, namely {'Elevation': 'int16', ...}
features_new_dtype_mapping = {
    header: dtype_name
    for header, dtype_name in zip(df_covtype_headers, features_new_dtype_list)
}
# Downcast every column in one go
df_covtype = df_covtype.astype(dtype=features_new_dtype_mapping)

In [None]:
# Check the result: dtypes and memory usage after the downcast
df_covtype.info()

In [None]:
# Check the number of samples for each class value
df_covtype.Cover_Type.value_counts()

In [None]:
# Copy the first 15120 rows (the training portion used throughout this
# notebook) onto a new df and perform some Exploratory Data Analysis (EDA).
# Working on a copy keeps df_covtype untouched by the EDA steps
train = df_covtype[:15120].copy()

In [None]:
# Check whether the training rows are already balanced across the classes
# (the original author reports that they are)
train.Cover_Type.value_counts()

In [None]:
def elevation_multiple_covtype_distplot(elevations_df, covtype_mapping):
    '''
    Plot an Elevation's histogram for each Cover Type on the same figure.

    elevations_df: dataframe with (at least) the 'Elevation' and
                   'Cover_Type' columns
    covtype_mapping: dictionary {cover type id (1 to 7): cover type name};
                     one histogram is drawn for each entry and the name
                     is used as its legend label
    '''
    # Color palette for distplot (one color for each cover type id)
    custom_palette = ['#F97EDB', '#49C0EA', '#B49EFC', '#CDAB40',
                      '#F69089', '#76BF3F', '#4CC9A6']
    # Set distplot background (must happen before the axes are created)
    sns.set_style('darkgrid')
    # Custom bins range for evenly spaced hists
    bins = range(1800,4000,60)
    # Grab the axes up front, so that labelling below also works
    # when covtype_mapping is empty
    ax = plt.gca()
    # Iterate through the dictionary to plot a histogram for each cover type
    for covtype_id, covtype_name in covtype_mapping.items():
        # covtype_id goes from 1 to 7
        # thus subtract 1 for indexing custom_palette
        palette_idx = covtype_id - 1
        # Elevation values of the rows belonging to this cover type only
        # (a plain boolean mask: no groupby is needed to filter one class)
        one_covtype_elevation = elevations_df.loc[
            elevations_df['Cover_Type'] == covtype_id, 'Elevation']
        # Plot the Elevation histogram of this cover type
        sns.distplot(one_covtype_elevation,
                     bins=bins,
                     color=custom_palette[palette_idx], label=covtype_name,
                     hist_kws=dict(alpha=0.8, edgecolor="none"),
                     kde=False,
                     ax=ax)

    # Anchor the legend to the right of the plot, outside the axes area
    ax.legend(loc='right', bbox_to_anchor=(1.2, 0.8), ncol=1)
    # Apply proper labeling to the axes
    ax.set(xlabel='Elevation (meters)', ylabel='Count')
    # Avoid cutting off the legend from the figure
    plt.tight_layout()
    # Show the figure (can be omitted in Jupyter Notebooks)
    plt.show()

In [None]:
# Plot an Elevation's histogram for each Cover Type
# to check for possible class separation

# Keep only the two columns the plot needs: Elevation and Cover_Type
elevations = train.loc[:, ['Elevation', 'Cover_Type']]
# Dictionary for mapping each integer target label to its string value
# (the names are used as legend labels)
covtype_label_name_dict = {1: 'Spruce/Fir',
                           2: 'Lodgepole Pine',
                           3: 'Ponderosa Pine',
                           4: 'Cottonwood/Willow',
                           5: 'Aspen',
                           6: 'Douglas-fir',
                           7: 'Krummholz'}
# Author's reading of the resulting plot: classes
# 4 (Willow), 5 (Aspen) and 7 (Krummholz) are easily separable
elevation_multiple_covtype_distplot(elevations, covtype_label_name_dict)

In [None]:
# The elevations slice is no longer needed: drop the reference to it
print('Dereferencing elevations')
del elevations

In [None]:
# The Elevation looks like a promising feature for Cover Type prediction,
# so let's draw a scatter matrix of the continuous features to see which
# of them depend on the Elevation.
# Note that a Chi-Squared Test could have been used to check for features
# dependency on Cover Type, but it is out of scope for this project

# Continuous features shown in the scatter matrix
features_to_check = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
# The same features, in the same order, with names short enough
# to fit on the graph
labels_to_plot = [
    'Elevation', 'Aspect', 'Slope',
    'HD_Hydro', 'VD_Hydro', 'HD_Road',
    'HS_9am', 'HS_Noon', 'HS_3pm',
    'HD_Fire',
]
# {long_label: short_label}
shorter_labels = {long_label: short_label
                  for long_label, short_label in zip(features_to_check, labels_to_plot)}
# Renamed copy of the selected training columns (train itself is untouched)
corr_train = train[features_to_check].rename(columns=shorter_labels)
# Keep the grid of AxesSubplots to tweak labels padding and rotation
axes = scatter_matrix(corr_train, figsize=(12, 8))
# Visit every subplot of the grid, row by row
for ax in axes.ravel():
    # keep the y-axis label horizontal..
    ax.yaxis.label.set_rotation(0)
    # add some padding between the labels and their subgraph..
    ax.xaxis.labelpad = 0
    ax.yaxis.labelpad = 20
    # and hide axes value ranges
    ax.set_yticklabels([])
    ax.set_xticklabels([])

In [None]:
# The most correlated pairs, as read from the scatter matrix by the author:
# - 'HD_Hydro' and 'VD_Hydro'
# - 'HS_Noon' and 'HS_3pm'
# Plot the first pair on its own to zoom on it
corr_train.plot(kind='scatter', x='HD_Hydro', y='VD_Hydro')

In [None]:
# Zoom on the second pair.
# Looks like there are a lot of HS_3pm equal to zero.
# It could help to impute those with the median
corr_train.plot(kind='scatter', x='HS_Noon', y='HS_3pm')

In [None]:
# Don't need corr_train df anymore: drop the reference to it.
# Hopefully the garbage collector will clean it up
print('Dereferencing corr_train')
del corr_train

In [None]:
# Count Hillshade_3pm zeros on the training set (before the imputation)
(train.Hillshade_3pm == 0).astype(int).sum(axis=0)

In [None]:
# Imputing Hillshade_3pm zeros with the median:
# zero is declared as the "missing value" marker of the imputer
simp = SimpleImputer(missing_values=0, strategy='median')
# fit_transform requires X as a numpy array of shape [n_samples, n_features]
# thus the dataframe column is cast to a numpy array and reshaped
# into a single-column 2D array
train.Hillshade_3pm = simp.fit_transform(train.Hillshade_3pm.values.reshape(-1,1))
# The imputer upcasts the df column to float64, we don't need that
train.Hillshade_3pm = train.Hillshade_3pm.astype(np.float32)
# NOTE(review): only `train` (the first 15120 rows) is imputed here - confirm
# whether the remaining rows should be transformed with the same imputer
# Count the zeros again to check the result
(train.Hillshade_3pm == 0).astype(int).sum(axis=0)

In [None]:
# Keep only the rows after the first 15120, i.e. drop from df_covtype
# the rows that were copied into train
df_covtype = df_covtype.iloc[15120:]
# Put the imputed training rows back in front of the remaining rows
df_covtype = pd.concat([train, df_covtype])

In [None]:
# Don't need train df anymore: its rows now live in df_covtype
del train

In [None]:
# Let's compute the Euclidean distance to Hydrology
# out of its horizontal and vertical components
hv_distances_labels = ['Horizontal_Distance_To_Hydrology',
                       'Vertical_Distance_To_Hydrology']
# One row per sample, two columns: (horizontal, vertical)
hv_dist_hydro_arr = df_covtype[hv_distances_labels].values
# Perform the Euclidean distance with respect to the origin
# (i.e. (0;0) where the water is located): the norm of each row
euc_distance_to_hydro = np.linalg.norm(hv_dist_hydro_arr, axis=1)
# Add the new feature 'Distance_To_Hydrology' to the whole dataset
# at column position 3, i.e. just after the 'Slope' feature at index 2,
# rounding each distance to four decimal places
df_covtype.insert(3,
                  'Distance_To_Hydrology',
                  np.around(euc_distance_to_hydro, decimals=4))
# Drop the horizontal and vertical distances, now replaced by the new feature
df_covtype.drop(columns=hv_distances_labels, inplace=True)

In [None]:
# Check the result: Distance_To_Hydrology should follow Slope
df_covtype.head(2)

In [None]:
# Perform one hot encoding of the target via get_dummies
# (one 'CovT_<label>' column for each Cover_Type value),
# then drop the integer target label, leaving the one hot encoded labels only
one_hot_covtype = pd.get_dummies(df_covtype.Cover_Type, prefix='CovT')
df_covtype.drop(columns='Cover_Type', inplace=True)
# The one hot encoded columns are appended after the feature columns
df_covtype_ohe = df_covtype.join(one_hot_covtype)

In [None]:
# Check the resulting shape
df_covtype_ohe.shape

In [None]:
# Don't need df_covtype anymore: df_covtype_ohe replaces it from now on
print('Dereferencing df_covtype')
del df_covtype

In [None]:
# Drop Aspect and Slope since working with angles is tricky
# when computing the mean
df_covtype_ohe.drop(labels=['Aspect', 'Slope'], axis=1, inplace=True)

In [None]:
def train_test_split(df_covtype_ohe, n_train=15120, n_targets=7):
    '''
    Split the one hot encoded dataset onto training set and test set
    according to UCI's repository guidelines.

    df_covtype_ohe: dataframe whose last n_targets columns are the one hot
                    encoded targets and whose other columns are the features
    n_train: number of leading rows that make up the training set;
             every remaining row goes to the test set
    n_targets: number of trailing target columns

    Returns X_train, X_test, y_train, y_test as independent copies.
    '''
    # Everything before the trailing target columns is a feature
    n_features = df_covtype_ohe.shape[1] - n_targets
    # First n_train rows for the training set
    train_rows = df_covtype_ohe.iloc[:n_train]
    # The remaining rows are for the test set: the slice starts exactly
    # at n_train, so that no row is lost between the two sets
    test_rows = df_covtype_ohe.iloc[n_train:]
    X_train = train_rows.iloc[:, :n_features].copy()
    y_train = train_rows.iloc[:, n_features:].copy()
    X_test = test_rows.iloc[:, :n_features].copy()
    y_test = test_rows.iloc[:, n_features:].copy()
    return X_train, X_test, y_train, y_test

# Features (X) and one hot encoded targets (y) of the training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_covtype_ohe)

In [None]:
# Check shapes consistency: each X must have as many rows as its y
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, ' \
      f'y_train: {y_train.shape}, y_test: {y_test.shape}')

In [None]:
# Let's standardize the training set and test set.
# Note that we use the training set ONLY to calculate the mean and standard deviation
# then normalize the training set 
# and finally use the (training) mean and standard deviation to normalize the test set.
# This ensures no data leakage.

def train_test_normalize(X_train, X_test):
    '''
    Standardize the numerical columns of the training set and apply the
    very same transformation to the test set.

    The mean and the standard deviation come from the training set only;
    all the other columns are passed through unchanged.
    Returns the two transformed dataframes (training, test), each with
    the numerical columns first, followed by the untouched ones.
    '''
    # The numerical columns we want to normalize
    numerical_columns = ['Elevation',
                         'Distance_To_Hydrology',
                         'Horizontal_Distance_To_Roadways',
                         'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                         'Horizontal_Distance_To_Fire_Points']
    # Every other column (set difference, original order preserved)
    # is a one-hot encoded feature that must not be scaled
    ohe_features = X_train.columns.difference(other=numerical_columns, sort=False)
    # Statistics of the training set, reused as-is for the test set
    training_numerical = X_train[numerical_columns]
    training_mean = training_numerical.mean()
    training_std = training_numerical.std()

    def _standardize(features):
        # Scale the numerical columns, then put the one-hot encoded
        # features back side-by-side
        scaled = (features[numerical_columns] - training_mean) / training_std
        return pd.concat([scaled, features[ohe_features]], axis=1)

    return _standardize(X_train), _standardize(X_test)

# Standardized versions of the training and test features
X_train_std, X_test_std = train_test_normalize(X_train, X_test)

In [None]:
# Generate one validation set from the normalized training set.
# Since the training set contains 2160 samples for each class,
# let's split according to 75% training set / 25% validation set.
# This yields 1620 samples per class for the new training set
# and 540 per class for the validation set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
# n_splits=1, so the loop body runs once and simply unpacks the row indices
for train_index, valid_index in sss.split(X_train_std, y_train):
    X_train_std_minus_validation = X_train_std.iloc[train_index]
    X_validation = X_train_std.iloc[valid_index]
    y_train_minus_validation = y_train.iloc[train_index]
    y_validation = y_train.iloc[valid_index]

In [None]:
# Convert these df to numpy arrays for later use
# (they are fed to the Keras network below)
X_train_std_minus_validation = X_train_std_minus_validation.to_numpy()
X_validation = X_validation.to_numpy()
y_train_minus_validation = y_train_minus_validation.to_numpy()
y_validation = y_validation.to_numpy()

In [None]:
# Check shapes consistency: each X must have as many rows as its y
print(f'X_train_std w/o validation set: {X_train_std_minus_validation.shape} \n' \
      f'X_validation: {X_validation.shape} \n' \
      f'y_train w/o validation set: {y_train_minus_validation.shape} \n' \
      f'y_validation: {y_validation.shape}')

In [None]:
# Get the number of features for the network input layer
n_features = X_train_std_minus_validation.shape[1]
# Build the network using Keras API
classifier_net = keras.Sequential()
# Input layer has n_features size while the hidden layer has 120 fully connected units
# using ReLU activation function
classifier_net.add(keras.layers.Dense(120, activation='relu', input_dim=n_features))
# Output layer has 7 fully connected units (one for each one hot encoded
# target column) using softmax activation function
classifier_net.add(keras.layers.Dense(7, activation='softmax'))
# Applies Stochastic Gradient Descent, Categorical Crossentropy loss func
# and tracks the categorical accuracy
classifier_net.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
# Check the model architecture
classifier_net.summary()

In [None]:
# Each batch is 10% (i.e. 1134) of the training set size (i.e. 11340)
# Train the network and evaluate it against the validation set at every epoch
clf_output = classifier_net.fit(X_train_std_minus_validation,
                                y_train_minus_validation,
                                epochs=100,
                                batch_size=1134,
                                shuffle=True,
                                validation_data=(X_validation, y_validation),
                                verbose=1)

In [None]:
# Verify classifier on the test set
# I'm using the validation here just for code testing purposes
# TODO: Remember to SUBSTITUTE X_validation with X_test
y_pred_validation = classifier_net.predict(X_validation)
# Reverse one-hot encoding (i.e. going back to categorical variables)
# for the predicted targets: the index of the highest score of each row
y_pred_validation_cat = np.argmax(y_pred_validation, axis=1)
# Do the same for the true targets
y_validation_cat = np.argmax(y_validation, axis=1)

In [None]:
# Check the confusion matrix (true classes first, predicted classes second)
confusion_matrix(y_validation_cat, y_pred_validation_cat)

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

torch.manual_seed(1)

class CovtypeDataset(TensorDataset):
    '''
    TensorDataset built straight from pandas objects: every dataframe
    (or series) passed in becomes one float tensor of the dataset
    '''

    def __init__(self, *dataframes):
        super().__init__(*map(self._df_to_tensor, dataframes))

    def _df_to_tensor(self, df):
        '''Convert a dataframe (or a series, seen as one column) to a float tensor'''
        frame = df.to_frame() if isinstance(df, pd.Series) else df
        return torch.from_numpy(frame.values).float()

# Wrap the standardized features and the one hot encoded targets
# into (features, targets) tensor datasets
train_data = CovtypeDataset(X_train_std, y_train)
test_data = CovtypeDataset(X_test_std, y_test)

# NOTE(review): 15120 is the number of training rows used throughout this
# notebook, so presumably every epoch is made of one single batch - confirm
batch_size = 15120
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
print('# training samples:', len(train_data))
print('# batches:', len(train_loader))

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Classifier(nn.Module):
    '''
    Fully connected classifier with one sigmoid hidden layer.

    n_features: size of each input sample
    n_hidden: number of units of the hidden layer
    n_output: number of scores returned for each sample
    p_dropout: kept in the signature, but no dropout layer is built,
               so the value is currently unused
    '''

    def __init__(self, n_features, n_hidden=120, n_output=7, p_dropout=0.2):
        super().__init__()
        # Input -> hidden, then hidden -> output
        self.hidden = nn.Linear(n_features, n_hidden)
        self.output = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        '''Return the raw output scores (no activation on the last layer)'''
        hidden_activation = F.sigmoid(self.hidden(x))
        return self.output(hidden_activation)

# One input unit for each column of the standardized training set
clf = Classifier(n_features=X_train_std.shape[1])
# Mean squared error between the network scores and the targets
clf_criterion = nn.MSELoss()
# Stochastic Gradient Descent with momentum over all the network parameters
clf_optimizer = optim.SGD(clf.parameters(), lr=0.05, momentum=0.5)

# def pretrain_classifier(clf, data_loader, optimizer, criterion):
#     for x, y in data_loader:
#         clf.zero_grad()
#         p_y = clf(x)
#         loss = criterion(p_y, y)
#         loss.backward()
#         optimizer.step()
#     return clf


# Number of passes over the whole training set
N_CLF_EPOCHS = 2

for epoch in range(N_CLF_EPOCHS):
    for x, y in train_loader:
        # Forward pass and loss of the current batch
        predicted_y = clf(x)
        loss = clf_criterion(predicted_y, y)
        # Backward pass: reset the gradients, backpropagate, update the weights
        clf_optimizer.zero_grad()
        loss.backward()
        clf_optimizer.step()
        # Accuracy: a sample is correct when the highest score falls on the
        # same class as the one hot encoded target. Comparing class indices
        # (instead of every single output unit) yields exactly one
        # hit-or-miss per sample, so the percentage stays within 0-100
        with torch.no_grad():
            predicted_class = predicted_y.argmax(dim=1)
            true_class = y.argmax(dim=1)
            correct = (predicted_class == true_class).float().sum().item()
        print("Epoch {}/{}, Accuracy: {:.3f}".format(epoch+1,
                                                     N_CLF_EPOCHS,
                                                     100. * correct/predicted_y.shape[0]))

#with torch.no_grad():
#    pre_clf_test = clf(test_data.tensors[0])

# Are these the probabilities for the first sample of the test set?
#pre_clf_test.data.numpy()[0]

from sklearn.linear_model import LogisticRegression

# LogisticRegression expects one class label per sample (a 1-D vector),
# while y_train is one hot encoded: go back to one class index per row
# (same decoding used for the Keras predictions, via argmax)
y_train_cat = np.argmax(y_train.to_numpy(), axis=1)

lr = LogisticRegression(C=0.1, random_state=1, solver='liblinear', multi_class='ovr')
lr.fit(X_train_std, y_train_cat)

print('Logistic Regression, training accuracy: %.2f%%' % (100 * lr.score(X_train_std, y_train_cat)))