In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from IPython.display import display, HTML
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

#Colab use
#from google.colab import files
#uploaded = files.upload()
#import io

In [2]:
#standardization
def standard_scaler(X_train, X_test):
    """Standardize both splits using statistics learned from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both inputs,
    so no information from ``X_test`` influences the scaling.

    Args:
        X_train: Training features used to fit the scaler.
        X_test: Test features transformed with the fitted scaler.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)`` as produced by
        ``StandardScaler.transform``.
    """
    fitted = StandardScaler().fit(X_train)  # statistics come from train data only
    return fitted.transform(X_train), fitted.transform(X_test)

In [3]:
def evaluate(y_true, y_pred, label='test'):
    """Print the RMSE and R2 of a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        label: Prefix naming the evaluated split in the printed line.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # RMSE is the square root of MSE
    r2 = r2_score(y_true, y_pred)  # coefficient of determination
    report = '{} set RMSE:{}, R2:{}'
    print(report.format(label, rmse, r2))

# Load Data 

In [4]:
# Column groups used throughout the notebook: index, operating settings, sensors.
index_names = ['ID', 'Cycle']
setting_names = ['OpSet1', 'OpSet2', 'OpSet3']
sensor_names = ['SensorMeasure{}'.format(i) for i in range(1, 22)]
col_names = index_names + setting_names + sensor_names

train_data = pd.read_csv("data/train_set.csv")
test_data = pd.read_csv("data/test_set.csv")
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
true_RUL = pd.read_csv("data/RUL_FD001.txt", sep=r'\s+', header=None)

#Colab use
#train_data = pd.read_csv(io.BytesIO(uploaded['train_set.csv']))
#test_data = pd.read_csv(io.BytesIO(uploaded['test_set.csv']))
#true_RUL = pd.read_csv(io.BytesIO(uploaded['RUL_FD001.txt']), sep=r'\s+', header=None)

train_RUL = train_data['RUL'].clip(upper=125)  #clip maximum cycle at 125
test_RUL = test_data['RUL']

# Use the keyword form: pandas 2.0 no longer accepts `axis` as a positional argument.
train_data = train_data.drop(columns=['RUL'])
test_data = test_data.drop(columns=['RUL'])

# Keep only the last record of each engine (computed once; the name keeps
# referring to a DataFrame rather than to an intermediate GroupBy object).
test_data = test_data.groupby(['ID']).tail(1)

#assign to new variable for easy understanding
train = train_data
train_y = train_RUL
test = test_data
test_y = true_RUL

#only sensor value considered
train = train[sensor_names]
test = test[sensor_names]

# Feature Extraction

In [5]:
#Principal Component Analysis
def pca(X_train, X_test, y_train, y_test, n_components=0.95, plot_scree=False):
    """Standardize the features and project them onto their principal components.

    Both the scaler and the PCA model are fitted on the training features only
    and then applied to the test features. The function works for any number
    of feature columns; the scaled arrays are passed to PCA directly instead
    of being re-labelled with a fixed list of 21 sensor names.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training targets, returned unchanged.
        y_test: Test targets, returned unchanged.
        n_components: Forwarded to ``sklearn.decomposition.PCA``. The default
            of 0.95 retains 95% of the variance (information).
        plot_scree: When True, draw the explained-variance ratio of every
            retained component.

    Returns:
        Tuple ``(X_train_pca, X_test_pca, y_train, y_test)`` where the first
        two items are DataFrames holding the component scores.
    """
    #for reproducible result (kept from the original; seeds NumPy's global RNG)
    np.random.seed(2)

    X_train_scaled, X_test_scaled = standard_scaler(X_train, X_test) #standardize data

    #-------------------------------Fitting Model----------------------------
    # Named `reducer` so the local does not shadow this function's own name.
    reducer = PCA(n_components, random_state=1)

    train_components = reducer.fit_transform(X_train_scaled) #PCA based on train data
    test_components = reducer.transform(X_test_scaled) #transform test data
    #------------------------------------------------------------------------

    if plot_scree:
        #Graph to indicate information contained in each component
        fig, ax = plt.subplots(figsize=(8, 5))
        component_idx = np.arange(len(reducer.components_)) + 1
        ax.plot(component_idx, reducer.explained_variance_ratio_, 'ro-', linewidth=2)
        ax.set_title('Scree Plot', fontsize=20)
        ax.set_xlabel('Principal Component', fontsize=20)
        ax.set_ylabel('Explained Variance Ratio', fontsize=20)
        ax.tick_params(labelsize=10)

    #convert to data frame
    X_train_pca = pd.DataFrame(data=train_components)
    X_test_pca = pd.DataFrame(data=test_components)

    return X_train_pca, X_test_pca, y_train, y_test

# Polynomial Regression

In [6]:
def polynomial_regression(X_train, X_test, y_train, y_test, degree=4,
                          filename='model/polynomial_model.sav'):
    """Fit a polynomial regression, print its metrics and pickle the model.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets (Series, DataFrame or array-like).
        y_test: Test targets.
        degree: Degree of the polynomial feature expansion.
        filename: Path the fitted pipeline is pickled to. Missing parent
            directories are created.

    Returns:
        None. Metrics are printed and the fitted pipeline is written to
        ``filename``.
    """
    #for reproducible result (kept from the original; seeds NumPy's global RNG)
    np.random.seed(2)

    #------------------------------Train Model-------------------------------
    #pipeline to perform polynomial expansion then linear regression
    polyreg_scaled = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    # np.asarray(...).ravel() flattens pandas objects and plain arrays alike.
    polyreg_scaled.fit(X_train, np.asarray(y_train).ravel()) #train model
    #------------------------------------------------------------------------

    #------------------------------Predict X---------------------------------
    y_pred_train = polyreg_scaled.predict(X_train) #predict on train data
    y_pred_test = polyreg_scaled.predict(X_test) #predict on test data
    #------------------------------------------------------------------------

    #---------------------------------Accuracy-------------------------------
    # NOTE: for a regressor, `score` returns R2, so the value printed here as
    # "Accuracy" is the same quantity as the test R2 reported by `evaluate`.
    accuracy_score = polyreg_scaled.score(X_test, y_test)
    print('Accuracy of Polynomial Regression on test set: {:.2f}'.format(accuracy_score))
    #------------------------------------------------------------------------

    #--------------------------------RMSE & R2-------------------------------
    evaluate(y_train, y_pred_train, 'Train')
    evaluate(y_test, y_pred_test, 'Test')
    #------------------------------------------------------------------------

    # Create the output directory on demand so saving works on a fresh checkout.
    model_dir = os.path.dirname(filename)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Context manager guarantees the file handle is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(polyreg_scaled, model_file) # save polynomial regression model
    #files.download('polynomial_model.sav')

# Run the Pipeline

In [7]:
# Standardize and PCA-reduce the sensor features; the targets are passed through unchanged.
X_train, X_test, y_train, y_test = pca(train, test, train_y, test_y)

# Show the PCA-transformed test features.
display(X_test)

# Fit the polynomial model, print its metrics and pickle the fitted pipeline.
polynomial_regression(X_train, X_test, y_train, y_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-2.442151,-0.629856,-0.269723,-1.036952,0.065579,0.256651,-0.335745,-0.786083,-0.363914,0.360124
1,-0.606344,-1.305484,-0.128590,-0.155242,-1.032065,-0.881564,0.273691,0.215089,-0.171814,0.083597
2,1.557391,-1.308425,0.016860,-0.463301,0.769709,0.797580,0.500090,-0.234719,1.525558,0.041484
3,1.351635,-0.751108,-0.019050,0.113222,0.769765,0.009556,-0.714867,-0.563234,-0.948301,-0.757999
4,0.194457,-1.186920,-0.085212,-0.011420,0.921937,-0.395308,-1.068561,0.150562,0.854153,0.523109
...,...,...,...,...,...,...,...,...,...,...
95,-3.480975,0.519087,-0.369665,0.987055,-0.419946,0.113710,-0.620419,0.204360,0.019912,-0.380007
96,-0.294138,1.057130,-0.233609,-1.517496,0.770914,-0.284645,-0.763970,-0.572922,-0.860816,0.487616
97,0.671958,0.055285,-0.075948,1.250506,0.522576,0.446222,-0.761574,-0.396403,0.491827,0.439716
98,-3.622111,0.695941,-0.401049,0.150557,-0.273977,-0.647349,0.223849,-0.233041,-0.067538,0.399699


Accuracy of Polynomial Regression on test set: 0.80
Train set RMSE:18.057976066501077, R2:0.8122261550830683
Test set RMSE:18.41727936179455, R2:0.8035774217749948
