In [1]:
import numpy as np
import pandas as pd

In [4]:
# --- Materials fatigue data (CSV); the 'Sl. No.' column becomes the row index ---
dataf = pd.read_csv('fatigue_data.csv', index_col='Sl. No.')

# Features: every column EXCEPT the ones at positions 16-19, which are dropped.
# NOTE(review): the previous comment described positions 16-19 as "C, Ni, Cr and Mo"
# and 'Fatigue' as column No.17 -- confirm against the CSV header which columns
# are actually removed here and that the target is among them.
fatigue_excluded = dataf.columns[16:20]
Xf = dataf.drop(columns=fatigue_excluded)
# Target: the 'Fatigue' column.
yf = dataf['Fatigue']


# --- Thermal conductance data (Excel) ---
datak = pd.read_excel('kappa.xlsx')

# Features: everything except the first three columns and the last column.
# Target: the last column.
kappa_target = datak.columns[-1]
kappa_excluded = list(datak.columns[0:3]) + [kappa_target]
Xk = datak.drop(columns=kappa_excluded)
yk = datak[kappa_target]


# Report the shapes of each feature matrix / target pair
fatigue_shapes = "Materials Fatigues data has the shape:  {0} and {1}"
kappa_shapes = "Thermal Conductance data has the shape: {0} and {1}"
print(fatigue_shapes.format(Xf.shape, yf.shape))
print(kappa_shapes.format(Xk.shape, yk.shape))

Materials Fatigues data has the shape:  (437, 16) and (437,)
Thermal Conductance data has the shape: (320, 36) and (320,)


In [5]:
# Variance Threshold
from sklearn.feature_selection import VarianceThreshold

In [6]:
# Fatigue data: keep only the features whose variance (in raw, unscaled units)
# is above 100.
print("VT result for Material Fatigue")
thresholder = VarianceThreshold(threshold=100)
thresholder.fit(Xf)
Xf_features_high_variance = thresholder.transform(Xf)
print(Xf_features_high_variance.shape)
print('\n')


# Kappa data: same cut-off, with a freshly constructed selector.
print("VT result for Kappa")
thresholder = VarianceThreshold(threshold=100)
thresholder.fit(Xk)
Xk_features_high_variance = thresholder.transform(Xk)
print(Xk_features_high_variance.shape)

VT result for Material Fatigue
(437, 10)


VT result for Kappa
(320, 12)


In [7]:
# Toy design matrix: five samples, three 0/1 features.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# Cut-off written as p * (1 - p) with p = 0.75, the variance of a 0/1 feature
# that takes the same value in 75% of the samples.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit(features)
features_high_variance = thresholder.transform(features)
features_high_variance

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [8]:
# Refit the selector on the toy matrix and display the per-feature variances it learned.
thresholder.fit(features)
thresholder.variances_

array([0.16, 0.16, 0.24])

In [9]:
# Cut-off written as p * (1 - p) with p = 0.9.
# NOTE(review): p * (1 - p) is the variance of a 0/1 (Bernoulli) feature; Xf
# does not look boolean -- confirm that this cut-off is meaningful for it.
print("VT result for Material Fatigue")
thresholder = VarianceThreshold(threshold=.9 * (1 - .9))
thresholder.fit(Xf)
Xf_features_high_variance = thresholder.transform(Xf)
print(Xf_features_high_variance.shape)
print('\n')

VT result for Material Fatigue
(437, 14)




In [10]:
# Print the per-feature variances of each raw (unscaled) feature matrix.
# The existing `thresholder` is refitted on each dataset in turn, so after this
# cell it holds the fit for the Kappa data.
# NOTE(review): `thresholder` is deliberately reused rather than replaced by a
# default VarianceThreshold(); as far as I recall scikit-learn adjusts
# `variances_` when the threshold is exactly 0 -- verify before changing this.
fatigue_variances = thresholder.fit(Xf).variances_
print("Variance of Fatigue Data")
print(fatigue_variances)
print('\n')

kappa_variances = thresholder.fit(Xk).variances_
print("Variance of Kappa Data")
print(kappa_variances)

Variance of Fatugue Data
[6.85500526e+02 7.82410122e+04 1.05105017e+02 6.13474229e+01
 7.91977756e+04 1.60730141e+04 7.11945768e+04 2.45928321e+02
 3.76245359e+02 2.68678309e+04 4.60233860e+02 6.50114102e+01
 9.26471103e-03 7.25902728e-01 1.69165923e-01 7.74801250e-03]


Variance of Kappa Data
[2.47646484e-01 5.31839844e+00 4.78655859e+01 1.54351715e+03
 8.37815954e-01 8.31401508e-01 1.95799691e+00 1.53808594e+02
 1.53808594e+02 4.74750000e+02 3.54020194e+00 3.18413379e+03
 2.47495301e+02 3.36530771e+00 8.63942685e+01 4.14071787e+02
 1.54032412e+02 1.70491485e+02 1.58746484e-01 1.16105371e-01
 6.09313283e-02 1.15121094e+00 1.05389648e+00 6.35691904e-01
 7.19996094e+00 1.45183496e+01 8.33767374e+00 2.86790430e+03
 8.70593332e+02 1.09367015e+03 6.54301277e-01 2.10370207e-01
 2.16011763e-01 4.19964840e-01 1.22278827e-01 1.33775910e-01]


In [11]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardise the fatigue features with StandardScaler.
scaler = StandardScaler()
scaler.fit(Xf)
Xf_std = scaler.transform(Xf)

# Sanity check: fit a default VarianceThreshold only to read back the
# per-feature variances. All ones means the feature normalization succeeded.
selector = VarianceThreshold()
selector.fit(Xf_std)
selector.variances_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
# Standardise the thermal-conductance features with StandardScaler.
scaler = StandardScaler()
scaler.fit(Xk)
Xk_std = scaler.transform(Xk)

# Sanity check: fit a default VarianceThreshold only to read back the
# per-feature variances. All ones means the feature normalization succeeded.
selector = VarianceThreshold()
selector.fit(Xk_std)
selector.variances_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1.])

In [14]:
# Extremes of the raw Kappa features: one max / min per column, then the
# overall extreme across all columns is printed.
MAX_b = [Xk[column_name].max() for column_name in Xk.columns]
MIN_b = [Xk[column_name].min() for column_name in Xk.columns]

print("Before Scaling")
print("Maxes: {}".format(max(MAX_b)))
print("Mins: {}".format(min(MIN_b)))


# Extremes of the standardised matrix: Xk_std is walked row by row here
# (one max / min per sample); the overall extreme is the same either way.
MAX_a = [max(sample) for sample in Xk_std]
MIN_a = [min(sample) for sample in Xk_std]

print('\n')
print("After Scaling")
print("Maxes: {}".format(max(MAX_a)))
print("Mins: {}".format(min(MIN_a)))

Before Scaling
Maxes: 389.634915603905
Mins: -82.56689644


After Scaling
Maxes: 6.539535537180673
Mins: -5.476052162160264


In [15]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFECV
from sklearn import linear_model

In [16]:
def report_rfecv(X, y, title, estimator):
    """Run cross-validated recursive feature elimination and print a summary.

    Parameters
    ----------
    X : feature matrix to select from (its column count is read via ``X.shape[1]``).
    y : target values passed to ``RFECV.fit``.
    title : heading printed above the summary line.
    estimator : the model handed to ``RFECV`` as its ``estimator``.

    Returns
    -------
    The fitted ``RFECV`` object, so the caller can inspect ``support_``,
    ``ranking_`` or call ``transform`` later.
    """
    selector = RFECV(estimator=estimator, step=1, scoring="neg_mean_squared_error")
    selector.fit(X, y)

    n_chosen = selector.n_features_
    n_eliminated = X.shape[1] - n_chosen
    print(title)
    print("Number of chosen features {0}, eliminated features {1},\nfeature rankings {2} ".format(
        n_chosen, n_eliminated, selector.ranking_))
    return selector


# NOTE(review): the elimination is driven by the fitted linear model's
# coefficients, whose size depends on each feature's units, and Xf / Xk are the
# unscaled matrices -- confirm whether the standardised versions were intended.
ols = linear_model.LinearRegression()

# The previous version called rfecv.transform(...) and threw the result away;
# the fitted selectors are kept instead so the reduced matrices can be
# produced on demand.
rfecv_f = report_rfecv(Xf, yf, "Results for Materials Fatigue", ols)
print('\n')
rfecv_k = report_rfecv(Xk, yk, "Results for Thermal Conductance", ols)

# Keep the original name bound to the last fit (thermal conductance), as before.
rfecv = rfecv_k

Results for Materials Fatigue
Number of chosen features 5, eliminated features 11,
feature rankings [ 7  6  3  5  2 12  9 11 10  8  1  4  1  1  1  1] 


Results for Thermal Conductance
Number of chosen features 29, eliminated features 7,
feature rankings [1 5 4 7 1 1 1 1 1 1 1 8 6 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 1 1] 


In [17]:
# Principal Component Analysis
from sklearn.decomposition import PCA

In [24]:
# One whitened 3-component PCA object, refitted on each dataset in turn.
# After this cell `pca` holds the fit for the thermal-conductance data.
pca = PCA(n_components=3, whiten=True)
Xf_pca = pca.fit_transform(Xf)
Xk_pca = pca.fit_transform(Xk)

# Same three-line report for each dataset, separated by a blank block.
pca_reports = [
    ("Materials Fatigue", Xf, Xf_pca),
    ("Thermal Conductance", Xk, Xk_pca),
]
for position, (heading, original, reduced) in enumerate(pca_reports):
    if position > 0:
        print('\n')
    print(heading)
    print("Original number of features:", original.shape[1])
    print("Number of features after reduction:", reduced.shape[1])

Materials Fatigue
Original number of features: 16
Reduced number of features: 3


Thermal Conductance
Original number of features: 36
Reduced number of features: 3


In [21]:
# Two-component projection of the fatigue features, shown next to the target.
#
# A dedicated estimator is used here. The shared `pca` is configured with
# n_components=3 in the cell above, which does not match the two column names
# below and makes the DataFrame constructor fail on a top-to-bottom run.
# NOTE(review): whiten=True mirrors the shared `pca` configuration -- confirm
# it matches the run that produced the recorded output.
pca_2d = PCA(n_components=2, whiten=True)
principalComponents = pca_2d.fit_transform(Xf)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Attach the target by position. `yf` is indexed by 'Sl. No.' while
# `principalDf` has a default integer index, so the values are passed as a plain
# array instead of re-reading the CSV and overwriting `dataf` just to align.
finalDf = principalDf.assign(Fatigue=yf.to_numpy())
finalDf

Unnamed: 0,principal component 1,principal component 2,Fatigue
0,0.815845,5.997737,232
1,0.815845,5.997736,235
2,0.815845,5.997737,235
3,0.815845,5.997737,241
4,0.815845,5.997737,225
...,...,...,...
432,2.802381,-0.496030,1030
433,2.802381,-0.496031,957
434,2.774571,-0.647919,1104
435,2.774571,-0.647917,1008


In [None]:
# Linear Discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Project the fatigue features onto five linear discriminants.
# NOTE(review): LinearDiscriminantAnalysis is a classifier and treats `yf` as
# class labels; `yf` is the 'Fatigue' column, which looks like a numeric
# regression target -- confirm this is intended (or bin the target first).
lda = LinearDiscriminantAnalysis(n_components=5)
lda.fit(Xf, yf)
Xf_lda = lda.transform(Xf)

print("Original number of features:", Xf.shape[1])
print("Reduced number of features:", Xf_lda.shape[1])