In [17]:
# Import required libraries
import numpy as np
import sklearn
import pandas as pd
%matplotlib inline

# Import data

In [13]:
# Load the training data CSV; the first column of the file becomes the row index
train_csv_path = 'normalized_trainingdata.csv'
data = pd.read_csv(train_csv_path, index_col=0)

In [14]:
# Preview the first five rows of the training frame
data.head(n=5)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
392,903507,-0.239005,0.438392,-0.263563,-0.233494,1.227238,-0.550261,-0.745273,-0.592632,2.638762,...,-0.145558,-0.428577,-0.34935,-0.119256,-0.972078,-0.969862,-0.794466,0.510919,-1.092594,M
150,871001501,-0.429974,-0.467354,-0.509636,-0.411892,-1.528609,-1.388717,-0.996507,-0.854272,-1.266571,...,-0.867471,-0.794639,-0.676082,-2.14909,-1.462664,-1.32509,-1.364965,-1.685291,-1.373434,B
117,864729,-0.365432,1.087851,-0.370005,-0.329087,-1.273222,-0.562195,-0.429223,-0.760788,-1.106241,...,0.932695,-0.434338,-0.327802,-0.425402,0.629745,0.632887,0.07484,-0.093767,0.636523,M
38,855133,-0.962562,-0.306445,-0.934186,-0.974342,0.030707,-0.124418,-0.453006,-0.553292,-0.707961,...,0.244852,-1.029215,-1.107727,0.448613,0.022443,-0.241771,-0.226426,-1.003151,-0.074812,M
319,894335,-0.579481,1.836321,-0.605254,-0.57164,-0.880673,-0.777957,-0.618262,-0.786001,-0.723067,...,1.724364,-0.656453,-0.648614,-0.508829,-0.425796,-0.407535,-0.662139,-0.731524,-0.120822,B


In [4]:
# Inspect the dtype pandas inferred for one of the feature columns
fractal_worst = data['fractal_dimension_worst']
fractal_worst.dtype

dtype('float64')

In [5]:
# Import LDA from sklearn
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import model_selection, metrics, grid_search

# Info: http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html#sklearn.discriminant_analysis.LinearDiscriminantAnalysis
# First paper on using LDA in biology: http://www.jstor.org/stable/2983775?seq=1#page_scan_tab_contents



In [6]:
# Separate the target column from the predictors
diagnosis_codes = {'B': 0,'M': 1}  # integer encoding for the diagnosis labels
y_train = data['diagnosis'].map(diagnosis_codes)

# Predictors: every column except the sample id and the label itself
non_feature_cols = ['id','diagnosis']
x_train = data.drop(non_feature_cols, axis=1)

x_train.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
392,-0.239005,0.438392,-0.263563,-0.233494,1.227238,-0.550261,-0.745273,-0.592632,2.638762,-0.284347,...,-0.368913,-0.145558,-0.428577,-0.34935,-0.119256,-0.972078,-0.969862,-0.794466,0.510919,-1.092594
150,-0.429974,-0.467354,-0.509636,-0.411892,-1.528609,-1.388717,-0.996507,-0.854272,-1.266571,-1.025898,...,-0.707782,-0.867471,-0.794639,-0.676082,-2.14909,-1.462664,-1.32509,-1.364965,-1.685291,-1.373434
117,-0.365432,1.087851,-0.370005,-0.329087,-1.273222,-0.562195,-0.429223,-0.760788,-1.106241,-0.380006,...,-0.358623,0.932695,-0.434338,-0.327802,-0.425402,0.629745,0.632887,0.07484,-0.093767,0.636523
38,-0.962562,-0.306445,-0.934186,-0.974342,0.030707,-0.124418,-0.453006,-0.553292,-0.707961,0.166392,...,-1.085057,0.244852,-1.029215,-1.107727,0.448613,0.022443,-0.241771,-0.226426,-1.003151,-0.074812
319,-0.579481,1.836321,-0.605254,-0.57164,-0.880673,-0.777957,-0.618262,-0.786001,-0.723067,-0.260442,...,-0.654762,1.724364,-0.656453,-0.648614,-0.508829,-0.425796,-0.407535,-0.662139,-0.731524,-0.120822


# LDA

In [7]:
# Fit LDA on the full feature set
# (defaults: singular value decomposition solver, no shrinkage)
lda = LinearDiscriminantAnalysis()
model = lda.fit(x_train, y_train)

# Predict on the same rows the model was fitted on
y_train_pred = model.predict(x_train)

# Show the fitted class priors, per-class feature means and coefficients
for fitted_attr in (model.priors_, model.means_, model.coef_):
    print(fitted_attr)

[ 0.63076923  0.36923077]
[[  3.62255072e-02  -2.04853207e-02   3.48593783e-02   3.61335619e-02
   -2.19634107e-02  -1.36864825e-02   1.28616243e-02   1.48732673e-02
   -5.54122889e-02  -5.04830594e-02   2.83661432e-03   5.24653902e-05
    3.69335745e-03   5.99479083e-03  -2.30683874e-02  -2.54353055e-02
    1.36775101e-02   1.81753751e-02   5.59066984e-03  -3.06062163e-02
    2.49865730e-02  -2.64718329e-02   2.55071097e-02   2.50387316e-02
   -3.65706922e-02  -1.27275477e-02   4.41331688e-03   3.05506953e-03
   -3.43907933e-02  -4.72866854e-02]
 [ -6.18852414e-02   3.49957562e-02  -5.95514380e-02  -6.17281682e-02
    3.75208266e-02   2.33810744e-02  -2.19719414e-02  -2.54084983e-02
    9.46626602e-02   8.62418932e-02  -4.84588280e-03  -8.96284226e-05
   -6.30948562e-03  -1.02411010e-02   3.94084951e-02   4.34519802e-02
   -2.33657464e-02  -3.10495990e-02  -9.55072758e-03   5.22856195e-02
   -4.26853954e-02   4.52227145e-02  -4.35746457e-02  -4.27744998e-02
    6.24749324e-02   2.1742

In [8]:
# Report training-set scores for the full-feature LDA predictions
training_scorers = (
    ('(Training) Accuracy:', metrics.accuracy_score),
    ('(Training) Recall:', metrics.recall_score),
    ('(Training) Precision:', metrics.precision_score),
)
for score_label, score_fn in training_scorers:
    print(score_label, score_fn(y_train, y_train_pred))

(Training) Accuracy: 0.654945054945
(Training) Recall: 0.160714285714
(Training) Precision: 0.627906976744


# PCDA 2 components

In [52]:
# PCDA (principal component discriminant analysis): reduce the features with PCA
# first, then run LDA on the resulting components
from sklearn.decomposition import PCA

# Two-component PCA, fitted on and applied to the training features
pca = PCA(n_components=2)
pca_fitted = pca.fit_transform(x_train)

# Wrap the component scores in a labelled dataframe
pca_df = pd.DataFrame(pca_fitted, columns=['PC1', 'PC2'])

# Explained variance per component, expressed as a percentage
explained_pct = pca.explained_variance_ratio_ * 100
print("PC1 explains: ", explained_pct[0], "%")
print("PC2 explains: ", explained_pct[1], "%")

# Rounded percentages kept in their own variables
pc1_axis = round(explained_pct[0], 1)
pc2_axis = round(explained_pct[1], 1)

PC1 explains:  44.3814656602 %
PC2 explains:  20.2515708257 %


In [51]:
# TODO: perform cross-validation on PCA to determine the number of PCs to keep (not implemented yet)


In [53]:
# Perform LDA on PC's - output is discriminant vector with length of number of PC's
# A dedicated estimator is used here: fit() returns the estimator itself, so
# re-fitting the shared `lda` would make `model2` the same object as `model`
# and overwrite the full-feature fit.
lda_pc2 = LinearDiscriminantAnalysis()
model2 = lda_pc2.fit(pca_df, y_train)
y_train_pred2 = model2.predict(pca_df)

# Report the parameters of the 2-component model (not of `model`)
print(model2.priors_)
print(model2.means_)
print(model2.coef_)

[ 0.63076923  0.36923077]
[[ 0.0090503  -0.11205425  0.0278526 ]
 [-0.01546093  0.19142601 -0.04758153]]
[[-0.00184431  0.05004297 -0.02691929]]


In [54]:
# Metric output on PCDA
print('(Training) Accuracy:', metrics.accuracy_score(y_train, y_train_pred2))
print('(Training) Recall:', metrics.recall_score(y_train, y_train_pred2))
print('(Training) Precision:', metrics.precision_score(y_train, y_train_pred2))

(Training) Accuracy: 0.628571428571
(Training) Recall: 0.0
(Training) Precision: 0.0


# PCDA 3 components

In [59]:
# 3 components
# Create new PCA model (Use 3 components)
pca3 = PCA(n_components = 3)

# Fit the scaled data to the model and project it onto the 3 components
pca_fitted3 = pca3.fit_transform(x_train)

# Create a new dataframe to store PC's
pca_df3 = pd.DataFrame(data = pca_fitted3, columns = ['PC1', 'PC2','PC3'])

# Print the explained variance of each component as a percentage
print("PC1 explains: ", pca3.explained_variance_ratio_[0] * 100, "%")
print("PC2 explains: ", pca3.explained_variance_ratio_[1] * 100, "%")
print("PC3 explains: ", pca3.explained_variance_ratio_[2] * 100, "%")

# Rounded percentages, one per component
pc1_axis = round(pca3.explained_variance_ratio_[0] * 100, 1)
pc2_axis = round(pca3.explained_variance_ratio_[1] * 100, 1)
# PC3 lives at index 2; reading index 1 would duplicate PC2's value
pc3_axis = round(pca3.explained_variance_ratio_[2] * 100, 1)

PC1 explains:  44.3814656602 %
PC2 explains:  20.2515708257 %
PC3 explains:  9.35784036115 %


In [56]:
# 3 components:
# Perform LDA on PC's - output is discriminant vector with length of number of PC's
# A dedicated estimator is used here: fit() returns the estimator itself, so
# re-fitting the shared `lda` would overwrite the models fitted in earlier cells.
lda_pc3 = LinearDiscriminantAnalysis()
model3 = lda_pc3.fit(pca_df3, y_train)
y_train_pred3 = model3.predict(pca_df3)

# Report the parameters of the 3-component model
print(model3.priors_)
print(model3.means_)
print(model3.coef_)

# Metric output on PCDA: score the 3-component predictions
# (not y_train_pred2, which belongs to the 2-component model)
print('(Training) Accuracy:', metrics.accuracy_score(y_train, y_train_pred3))
print('(Training) Recall:', metrics.recall_score(y_train, y_train_pred3))
print('(Training) Precision:', metrics.precision_score(y_train, y_train_pred3))

[ 0.63076923  0.36923077]
[[ 0.0090503  -0.11205425  0.0278526 ]
 [-0.01546093  0.19142601 -0.04758153]]
[[-0.00184431  0.05004297 -0.02691929]]
(Training) Accuracy: 0.628571428571
(Training) Recall: 0.0
(Training) Precision: 0.0


# Test set

In [20]:
# Load the test data CSV (pandas assigns a default integer index)
test_csv_path = 'data_test_scaled.csv'
testdata = pd.read_csv(test_csv_path)

In [21]:
# Preview the first five rows of the test frame
testdata.head(n=5)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,1.067251,0.691033,1.021472,1.089621,-1.47444,-0.266832,0.330185,0.226676,-0.142014,-1.343336,...,-0.051043,0.741802,0.834567,-1.998325,-0.303957,0.043295,-0.291618,-0.936538,-1.069808,M
1,0.52594,0.099907,0.682885,0.513707,1.893908,2.522961,2.444457,1.75866,1.939148,2.051476,...,-0.145347,0.488868,0.337698,1.091856,1.82973,2.119162,1.485055,2.199157,1.410614,M
2,-1.2029,-0.323548,-1.186101,-1.197627,0.341866,-0.529751,-0.786869,-0.743795,0.279156,0.152189,...,0.310919,-1.145374,-1.129665,0.422699,-0.750897,-0.834487,-0.709784,-0.013484,-0.364739,B
3,-0.169271,-0.036356,-0.125818,-0.15782,1.338696,0.449511,0.484346,0.595433,1.102045,0.820708,...,0.48785,0.004591,0.022092,2.141353,1.268913,1.300322,1.41981,1.654742,2.203137,M
4,-0.901884,-0.851327,-0.859601,-0.9219,0.341866,0.136061,-0.43586,-0.794269,0.177764,0.91043,...,-1.107341,-0.773737,-0.92654,0.103174,1.09118,0.491224,-0.479768,-0.559169,1.595744,B


In [23]:
# Separate the test target column from the test predictors
test_diagnosis_codes = {'B': 0,'M': 1}  # same integer encoding as the training labels
y_test = testdata['diagnosis'].map(test_diagnosis_codes)

# Predictors: every column except the label
test_non_feature_cols = ['diagnosis']
x_test = testdata.drop(test_non_feature_cols, axis=1)

x_test.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,1.067251,0.691033,1.021472,1.089621,-1.47444,-0.266832,0.330185,0.226676,-0.142014,-1.343336,...,0.780793,-0.051043,0.741802,0.834567,-1.998325,-0.303957,0.043295,-0.291618,-0.936538,-1.069808
1,0.52594,0.099907,0.682885,0.513707,1.893908,2.522961,2.444457,1.75866,1.939148,2.051476,...,0.339148,-0.145347,0.488868,0.337698,1.091856,1.82973,2.119162,1.485055,2.199157,1.410614
2,-1.2029,-0.323548,-1.186101,-1.197627,0.341866,-0.529751,-0.786869,-0.743795,0.279156,0.152189,...,-1.142028,0.310919,-1.145374,-1.129665,0.422699,-0.750897,-0.834487,-0.709784,-0.013484,-0.364739
3,-0.169271,-0.036356,-0.125818,-0.15782,1.338696,0.449511,0.484346,0.595433,1.102045,0.820708,...,-0.007988,0.48785,0.004591,0.022092,2.141353,1.268913,1.300322,1.41981,1.654742,2.203137
4,-0.901884,-0.851327,-0.859601,-0.9219,0.341866,0.136061,-0.43586,-0.794269,0.177764,0.91043,...,-0.918529,-1.107341,-0.773737,-0.92654,0.103174,1.09118,0.491224,-0.479768,-0.559169,1.595744


In [None]:
# Define model
