In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression

## Dimension Reduction by Feature Extraction 
 
* Transforms the features into a lower dimensional feature space.
    - Information about the dataset is compressed.
    - Feature selection keeps the same features, feature extraction creates new features
    - The transformed features then can be used in regression and classification tasks

* Methods
    - Linear Discriminant Analysis (Supervised)
    - Principal Component Analysis (Unsupervised)


#### Wine data

* Classify each wine into one of three classes
* 13 numerical features

In [None]:
# Read the wine table from the relative path 'Wine.csv' and preview its last five rows.
wine = pd.read_csv(filepath_or_buffer='Wine.csv')
wine.tail(n=5)

In [None]:
# Columns 0-12 form the feature matrix; column 13 is taken as the class label.
X, y = wine.iloc[:, :13].values, wine.iloc[:, 13].values

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Linear Discriminant Analysis (LDA)

* LDA is used for classification and feature extraction
    - A **supervised** dimensionality reduction technique for maximizing class separability
        - Uses information provided by the class labels to aid discriminating
    - Extracted feature vectors indicate the direction of maximum class variability and separation

#### Assumptions

* The classes have Normal distributions  
* The classes have identical covariance matrices
* The features are statistically independent of each other.
* LDA for dimension reduction can work fairly well even if these assumptions are violated.
    - LDA classification is also fairly robust to the distribution of the classes

![](LDA.png)
$$\text{Figure 1. Two Classes}$$

* LDA computes the directions (“linear discriminants”) that will represent the axes that maximize the separation between multiple classes.

* Projecting the data onto the X-axis maximizes the class separation.
    - The linear discriminant on the x-axis (LD 1) would do a good job of separating the two normally distributed classes.
* The linear discriminant LD 2 captures a lot of the variance in the dataset but would fail as a good linear discriminant (i.e. projection) since it does not capture any of the information that discriminates the classes

### LDA Algorithm

* LDA computes means and scatter matrices on standardized data. The eigenvalues and eigenvectors of the combined matrices are used to project the data onto the lower dimensional space.

#### 1) Standardize the data

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaling to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

#### 2) Create mean of features grouped by class

*  LDA takes class label information into account, which is represented in the form of the mean vectors

In [None]:
# One mean vector per class, ordered by sorted class label. The labels are
# read from y_train (as the plotting cells already do) rather than from a
# hard-coded [1, 2, 3], so a label missing from the data cannot silently
# produce a NaN mean from an empty selection.
MVs = [X_train_std[y_train == label].mean(axis=0) for label in np.unique(y_train)]
MVs

#### 3) Create the Within Class Scatter Matrix

* Each class's scaled scatter matrix is simply its covariance matrix $S_i$; the within-class scatter matrix is their sum:
$$S_W = \sum_{i=1}^{C} S_i$$

In [None]:
d = X_train_std.shape[1]  # number of features
S_W = np.zeros((d, d))

# Within-class scatter: sum the covariance matrix of every class. The class
# labels come from y_train itself (matching the plotting cells) instead of a
# hard-coded range(1, 4).
for label in np.unique(y_train):
    class_rows = X_train_std[y_train == label]
    # np.cov expects variables in rows, hence the transpose (features x samples).
    class_scatter = np.cov(class_rows.T)
    S_W += class_scatter

S_W.shape

#### 4) Create the Between Class Scatter Matrix

$$S_B = \sum_{i = 1}^Cn_i(m_i - m)(m_i - m)^T$$

$n_i$ is the number of observations in class i  
$m_i$ is the Class mean  
m is the overall mean


In [None]:
overall_mean = np.mean(X_train_std, axis=0)
print(overall_mean.shape)

# MVs holds one mean vector per class in sorted-label order, so pairing it
# with np.unique(y_train) recovers the label of each mean vector without
# relying on `index + 1` arithmetic.
class_labels = np.unique(y_train)
assert len(class_labels) == len(MVs), "expected one mean vector per class label"

S_B = np.zeros((d, d))
for label, mv in zip(class_labels, MVs):
    n = X_train_std[y_train == label].shape[0]  # number of observations in this class
    mean_diff = mv - overall_mean  # 1-D vector; overall_mean keeps the shape printed above
    # n_i * (m_i - m)(m_i - m)^T, written as an outer product of the 1-D difference.
    S_B += n * np.outer(mean_diff, mean_diff)
S_B.shape


#### 5) Compute the eigenvectors and corresponding eigenvalues of the matrix  $S_W^{-1}S_B$ (i.e. $S_B/S_W$)

In [None]:
# Form S_W^{-1} S_B explicitly, then take its eigenvalues and eigenvectors
# (np.linalg.eig returns the eigenvectors as columns).
S_W_inv_S_B = np.linalg.inv(S_W) @ S_B
eigen_vals, eigen_vecs = np.linalg.eig(S_W_inv_S_B)

#### 6) Sort the eigenvalues by decreasing order to rank the corresponding eigenvectors.

In [None]:
# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)
eigen_vals = list(map(lambda x: x[0],eigen_pairs))
eigen_vecs = list(map(lambda x: x[1],eigen_pairs))
list(eigen_vals)

#### 7) Choose the k eigenvectors that correspond to the k largest eigenvalues.

* Construct W:  a d × k transformation matrix  
* The eigenvectors are the columns of W

In [None]:
# Build the d x 2 transformation matrix whose columns are the two leading
# eigenvectors, keeping only the real part of each. The former
# `W = np.zeros((13,2))` pre-allocation was overwritten immediately and
# hard-coded the dimensions, so it is dropped.
W = np.column_stack((eigen_vecs[0].real, eigen_vecs[1].real))
W

#### 8) Project the data onto the new feature subspace using the transformation matrix W, 142x13 X 13x2 = 142x2
$$ X_{new} = X_{train} \cdot{W}$$

In [None]:
# Project the standardized training rows onto the two discriminant directions in W.
X_train_lda = X_train_std @ W
X_train_lda.shape

#### Plot data in new feature space

In [None]:
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']

# Draw one scatter series per class, each with its own colour and marker.
for cls, color, marker in zip(np.unique(y_train), colors, markers):
    in_class = y_train == cls
    ld1 = X_train_lda[in_class, 0]
    # The second discriminant is plotted with its sign flipped.
    # NOTE(review): presumably to match the orientation of the sklearn plot — confirm.
    ld2 = -X_train_lda[in_class, 1]
    plt.scatter(ld1, ld2, c=color, label=cls, marker=marker)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()


### LDA for Dimension Reduction in sklearn

#### Fit and transform the Training data to 2-dimensions

In [None]:

# Request two discriminants explicitly. When n_components is left unset,
# scikit-learn uses min(n_classes - 1, n_features).
model_lda = LDA(n_components=2).fit(X_train_std, y_train)

# Transform both splits with the model fitted on the training data; these
# reduced features feed the second logistic regression.
X_train_lda2 = model_lda.transform(X_train_std)
X_test_lda2 = model_lda.transform(X_test_std)
(X_train_lda2.shape, X_test_lda2.shape)

In [None]:
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']

# Draw one scatter series per class in the space produced by sklearn's LDA.
for cls, color, marker in zip(np.unique(y_train), colors, markers):
    class_points = X_train_lda2[y_train == cls]
    plt.scatter(class_points[:, 0], class_points[:, 1],
                c=color, label=cls, marker=marker)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()

#### Fit a Logistic Regression model using all thirteen predictors

In [None]:
# Baseline: logistic regression trained on all standardized features.
model_lr = LogisticRegression(random_state=0).fit(X_train_std, y_train)

# Predict classes for the held-out test rows.
y_pred = model_lr.predict(X_test_std)

# Confusion matrix of true vs. predicted labels; its diagonal counts the
# correct predictions, so trace / total is the accuracy.
cm = confusion_matrix(y_test, y_pred)
accuracy = cm.trace() / cm.sum()
print(cm)
print(f'\nAccuracy: {accuracy}')


#### Do a Logistic Regression model with the reduced Training data

In [None]:
# Same classifier, trained on the two LDA-derived features instead.
model_lr2 = LogisticRegression(random_state=0).fit(X_train_lda2, y_train)

# Predict classes for the LDA-transformed test rows.
y_pred = model_lr2.predict(X_test_lda2)

# Confusion matrix of true vs. predicted labels; its diagonal counts the
# correct predictions, so trace / total is the accuracy.
cm = confusion_matrix(y_test, y_pred)
accuracy = cm.trace() / cm.sum()
print(cm)
print(f'\nAccuracy: {accuracy}')