### Linear Discriminant Analysis (LDA)
https://sebastianraschka.com/Articles/2014_python_lda.html


Logistic regression was introduced for classification. Unfortunately, like any model, it presents some flaws:
    - When classes are well separated, parameters estimate from logistic regression tend to be unstable
    - When the data set is small, logistic regression is also unstable
    - Not the best to predict more than two classes
That’s where linear discriminant analysis (LDA) comes in handy. It is more stable than logistic regression and widely used to predict more than two classes.

### Importing the libraries


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
df = pd.read_csv('Wine.csv')

### Review data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
Alcohol                 178 non-null float64
Malic_Acid              178 non-null float64
Ash                     178 non-null float64
Ash_Alcanity            178 non-null float64
Magnesium               178 non-null int64
Total_Phenols           178 non-null float64
Flavanoids              178 non-null float64
Nonflavanoid_Phenols    178 non-null float64
Proanthocyanins         178 non-null float64
Color_Intensity         178 non-null float64
Hue                     178 non-null float64
OD280                   178 non-null float64
Proline                 178 non-null int64
Customer_Segment        178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [4]:
df.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [4]:
df.tail()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740,3
174,13.4,3.91,2.48,23.0,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840,3
177,14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560,3


### Split data into the independent vs dependent variables

In [5]:
X = df.iloc[:,[0,13]].values
y = df.iloc[:,13].values

### Split data into train and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 0)

### Feature Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
sc_X = StandardScaler()

In [10]:
X_train = sc_X.fit_transform(X_train)

In [11]:
X_test = sc_X.fit_transform(X_test)

In [12]:
X_train

array([[ 0.87668336,  1.30503589],
       [-0.36659076,  0.02700074],
       [-1.69689407,  0.02700074],
       [ 0.51613387,  1.30503589],
       [ 0.64046128, -1.2510344 ],
       [ 0.92641433, -1.2510344 ],
       [-0.8639004 ,  0.02700074],
       [-0.47848543,  0.02700074],
       [-1.95798163,  0.02700074],
       [ 0.81451966, -1.2510344 ],
       [-0.47848543,  1.30503589],
       [-1.27418086,  0.02700074],
       [-0.91363137,  1.30503589],
       [ 1.63508058, -1.2510344 ],
       [-0.13036867,  1.30503589],
       [ 0.62802854,  1.30503589],
       [ 0.71505773, -1.2510344 ],
       [ 1.68481154,  1.30503589],
       [ 0.90154885, -1.2510344 ],
       [-0.95092959,  0.02700074],
       [ 0.35450823,  1.30503589],
       [ 0.21774808,  1.30503589],
       [ 1.08803996,  0.02700074],
       [-0.1676669 ,  1.30503589],
       [ 1.06317448,  1.30503589],
       [ 0.39180646, -1.2510344 ],
       [ 0.08098793,  0.02700074],
       [-0.5903801 ,  1.30503589],
       [-0.8639004 ,

### Applying LDA

In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [14]:
lda = LDA(n_components = 2)
#n_components : nb of extracted feature we want to get

In [15]:
X_train = lda.fit_transform(X_train,y_train)
#we need y_train since LCA is supervised model

In [16]:
X_test = lda.transform(X_test)

### Fitting Logistic Regression to the Tranining set


In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
classifier = LogisticRegression(random_state=0)

In [19]:
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Predicting the Test set results

In [20]:
y_pred = classifier.predict(X_test)

### Making the confusion matrix

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
cm = confusion_matrix(y_test,y_pred)

In [23]:
cm

array([[14,  0,  0],
       [ 0,  0, 16],
       [ 0,  0,  6]])

### Visualising the  Training set results

In [24]:
from matplotlib.colors import ListedColormap

In [25]:
X_set, y_set = X_train, y_train

In [28]:
X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, stop = X_set[:,0].max()+1, step = 0.01),\
                        np.arange(start = X_set[:,1].min()-1, stop = X_set[:,1].max()+1, step = 0.01))

MemoryError: 

In [None]:
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()] ).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green','blue')))
plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(),X2.max())

In [None]:
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()] ).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green','blue')))
plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(),X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j,1],
                c = ListedColormap(('red','green','blue'))(i),label =j  )
plt.title('Logistic Regression (Training set)')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend()
plt.show()

### Visualising the Test set results

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, stop = X_set[:,0].max()+1, step = 0.01),\
                        np.arange(start = X_set[:,1].min()-1, stop = X_set[:,1].max()+1, step = 0.01))

In [None]:
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()] ).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green','blue')))
plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(),X2.max())

In [None]:
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()] ).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(),X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j,1],
                c = ListedColormap(('red','green','blue'))(i),label =j  )
plt.title('Logistic Regression (Training set)')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend()
plt.show()