In [1]:
import numpy as np
import pandas as pd

In [2]:
# read date from csv file
data = pd.read_csv('fatigue_data.csv', index_col='Sl. No.')

# set X as columns C, Ni, Cr and Mo (No.16-19)
Xf = data.drop(data.columns[16:20], axis=1) 
# set y as 'Fatigue' column (No.17)
yf = data['Fatigue']

Xf.shape, yf.shape

((437, 16), (437,))

In [3]:
# read date from xlsx file
data = pd.read_excel('kappa.xlsx')

Xk = data.drop(data.columns[0:3], axis=1).drop(data.columns[-1], axis=1)
yk = data[data.columns[-1]]

Xk.shape, yk.shape

((320, 36), (320,))

In [4]:
# Variance Threshold
from sklearn.feature_selection import VarianceThreshold

In [5]:
thresholder = VarianceThreshold(threshold=100)
features_high_variance = thresholder.fit_transform(Xf)

features_high_variance

array([[885.,  30.,   0., ...,  30.,  30.,   0.],
       [885.,  30.,   0., ...,  30.,  30.,   0.],
       [885.,  30.,   0., ...,  30.,  30.,   0.],
       ...,
       [930.,  30.,   0., ...,  60., 200., 120.],
       [930.,  30.,   0., ...,  60., 200., 120.],
       [930.,  30.,   0., ...,  60., 200., 120.]])

In [6]:
thresholder.fit(Xf).variances_

array([6.85500526e+02, 7.82410122e+04, 1.05105017e+02, 6.13474229e+01,
       7.91977756e+04, 1.60730141e+04, 7.11945768e+04, 2.45928321e+02,
       3.76245359e+02, 2.68678309e+04, 4.60233860e+02, 6.50114102e+01,
       9.26471103e-03, 7.25902728e-01, 1.69165923e-01, 7.74801250e-03])

In [7]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()
Xf_std = scaler.fit_transform(Xf)

selector = VarianceThreshold()

# output 1 means that feature noramlization is scuccessful
selector.fit(Xf_std).variances_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [9]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFECV
from sklearn import linear_model

In [10]:
ols = linear_model.LinearRegression()
rfecv = RFECV(estimator=ols,step=1,scoring="neg_mean_squared_error")

rfecv.fit(Xf, yf)
rfecv.transform(Xf)

array([[0.00e+00, 2.60e-01, 1.00e-02, 2.00e-02, 0.00e+00],
       [0.00e+00, 2.50e-01, 8.00e-02, 1.20e-01, 0.00e+00],
       [0.00e+00, 2.60e-01, 2.00e-02, 3.00e-02, 0.00e+00],
       ...,
       [1.20e+02, 2.10e-01, 6.00e-02, 1.17e+00, 1.70e-01],
       [1.20e+02, 2.10e-01, 2.00e-02, 9.10e-01, 1.50e-01],
       [1.20e+02, 1.80e-01, 7.00e-02, 1.08e+00, 1.50e-01]])

In [11]:
rfecv.n_features_, rfecv.support_, rfecv.ranking_

(5,
 array([False, False, False, False, False, False, False, False, False,
        False,  True, False,  True,  True,  True,  True]),
 array([ 7,  6,  3,  5,  2, 12,  9, 11, 10,  8,  1,  4,  1,  1,  1,  1]))

In [12]:
# Principal Component Analysis
from sklearn.decomposition import PCA

In [13]:
pca = PCA(n_components=0.99, whiten=True)
Xf_pca = pca.fit_transform(Xf)

print("Original number of features:", Xf.shape[1])
print("Reduced number of features:", Xf_pca.shape[1])

Original number of features: 16
Reduced number of features: 3


In [14]:
# PCA Kernel 
from sklearn.decomposition import PCA, KernelPCA

In [15]:
kpca = KernelPCA(kernel="rbf",gamma=15,n_components=1)
Xf_kpca = kpca.fit_transform(Xf)

print("Original number of features:",Xf.shape[1])
print("Reduced number of features:",Xf_kpca.shape[1])

Original number of features: 16
Reduced number of features: 1


In [16]:
# Linear Discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [20]:
lda = LinearDiscriminantAnalysis(n_components=5)
Xf_lda = lda.fit(Xf,yf).transform(Xf)

print("Original number of features:", Xf.shape[1])
print("Reduced number of features:", Xf_lda.shape[1])

Original number of features: 16
Reduced number of features: 5


In [21]:
# Non-negative matrix factorization
from sklearn.decomposition import NMF

In [23]:
nmf = NMF(n_components=10,random_state=1)
Xf_nmf = nmf.fit_transform(Xf)

print("Original number of features:", Xf.shape[1])
print("Reduced number of features:", Xf_nmf.shape[1])

Original number of features: 16
Reduced number of features: 10
