# NASA Turbine Engine - Data Cleaning and Wrangling
**By Hanadi Matar**

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import pylab 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as stats

## <font color = red>Setting Columns Names
The data has no column names thus we need to assign column names to the columns in our data.

In [3]:
train = pd.read_csv(('./datasets/train_FD001.txt'), sep='\s+', header=None)

train.columns =['unit_nr', 'time_cycles', 'setting_1', 'setting_2', 'setting_3',
                's1', 's2', 's3','s4','s5','s6','s7','s8','s9','s10',
               's11', 's12', 's13','s14','s15','s16','s17','s18','s19','s20', 's21']

# Inspecting the Data
It is essential to understand the structure of the data at hand. Such as examining the first few and last rows, getting information about the data types, missing values and basic statistics for the numerical columns.

In [6]:
train.head(11) # displaying the first few rows of our dataset

Unnamed: 0,unit_nr,time_cycles,setting_1,setting_2,setting_3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694


In [5]:
train.tail(11) # displaying the last few rows of our dataset

Unnamed: 0,unit_nr,time_cycles,setting_1,setting_2,setting_3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
20620,100,190,-0.0001,0.0002,100.0,518.67,643.12,1594.45,1426.04,14.62,...,519.52,2388.26,8142.28,8.5162,0.03,395,2388,100.0,38.42,23.0603
20621,100,191,-0.0005,-0.0,100.0,518.67,643.69,1610.87,1427.19,14.62,...,519.8,2388.28,8143.56,8.5092,0.03,398,2388,100.0,38.39,23.1218
20622,100,192,-0.0009,0.0001,100.0,518.67,643.53,1601.23,1419.48,14.62,...,520.59,2388.21,8143.46,8.4892,0.03,397,2388,100.0,38.56,23.077
20623,100,193,-0.0001,0.0002,100.0,518.67,643.09,1599.81,1428.93,14.62,...,520.11,2388.19,8142.02,8.5424,0.03,397,2388,100.0,38.47,23.023
20624,100,194,-0.0011,0.0003,100.0,518.67,643.72,1597.29,1427.41,14.62,...,519.55,2388.22,8139.67,8.5215,0.03,394,2388,100.0,38.38,23.1324
20625,100,195,-0.0002,-0.0001,100.0,518.67,643.41,1600.04,1431.9,14.62,...,519.71,2388.28,8142.9,8.5519,0.03,394,2388,100.0,38.14,23.1923
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064


In [7]:
train.info() # to get information about the datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   unit_nr      20631 non-null  int64  
 1   time_cycles  20631 non-null  int64  
 2   setting_1    20631 non-null  float64
 3   setting_2    20631 non-null  float64
 4   setting_3    20631 non-null  float64
 5   s1           20631 non-null  float64
 6   s2           20631 non-null  float64
 7   s3           20631 non-null  float64
 8   s4           20631 non-null  float64
 9   s5           20631 non-null  float64
 10  s6           20631 non-null  float64
 11  s7           20631 non-null  float64
 12  s8           20631 non-null  float64
 13  s9           20631 non-null  float64
 14  s10          20631 non-null  float64
 15  s11          20631 non-null  float64
 16  s12          20631 non-null  float64
 17  s13          20631 non-null  float64
 18  s14          20631 non-null  float64
 19  s15 

In [14]:
train.describe().T # to get the basic statistics for the numerical columns in the dataset

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_nr,20631.0,0.510167,0.295229,0.0,0.252525,0.515152,0.767677,1.0
time_cycles,20631.0,0.298637,0.190806,0.0,0.141274,0.285319,0.429363,1.0
setting_1,20631.0,0.49949,0.125708,0.0,0.413793,0.5,0.586207,1.0
setting_2,20631.0,0.501959,0.244218,0.0,0.333333,0.5,0.75,1.0
setting_3,20631.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s1,20631.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s2,20631.0,0.443052,0.150618,0.0,0.335843,0.430723,0.539157,1.0
s3,20631.0,0.424746,0.133664,0.0,0.331807,0.415522,0.508829,1.0
s4,20631.0,0.450435,0.151935,0.0,0.339467,0.435348,0.545324,1.0
s5,20631.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Through data exploration, we were able to notice that each row has a unit_nr acting as the id of that row representing the engine ID. Thus the need to create a subdataset which groups the data by engine ID is present.

## <font color = red> Creating sub datasets for **engines**
Grouping by the engines (ID)

In [9]:
engine ={str(i): grp for i , grp in train.groupby('unit_nr')}

engine['1']

Unnamed: 0,unit_nr,time_cycles,setting_1,setting_2,setting_3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,1,188,-0.0067,0.0003,100.0,518.67,643.75,1602.38,1422.78,14.62,...,519.79,2388.23,8117.69,8.5207,0.03,396,2388,100.0,38.51,22.9588
188,1,189,-0.0006,0.0002,100.0,518.67,644.18,1596.17,1428.01,14.62,...,519.58,2388.33,8117.51,8.5183,0.03,395,2388,100.0,38.48,23.1127
189,1,190,-0.0027,0.0001,100.0,518.67,643.64,1599.22,1425.95,14.62,...,520.04,2388.35,8112.58,8.5223,0.03,398,2388,100.0,38.49,23.0675
190,1,191,-0.0000,-0.0004,100.0,518.67,643.34,1602.36,1425.77,14.62,...,519.57,2388.30,8114.61,8.5174,0.03,394,2388,100.0,38.45,23.1295


# Handling Missing Values
Before proceeding to use the data we need to check for missing values to prevent any discrepencies. We can either fill the missing values with a value, for example the mean, median, etc. Or we can drop them.

In [10]:
# Checking for missing values
missing_values = train.isnull().sum()
print(missing_values)

unit_nr        0
time_cycles    0
setting_1      0
setting_2      0
setting_3      0
s1             0
s2             0
s3             0
s4             0
s5             0
s6             0
s7             0
s8             0
s9             0
s10            0
s11            0
s12            0
s13            0
s14            0
s15            0
s16            0
s17            0
s18            0
s19            0
s20            0
s21            0
dtype: int64


The dataset has no missing values and we can proceed with further data wrangling.

# Remove Duplicates
Removing duplicates to reduce discrepencies.

In [13]:
train.drop_duplicates(inplace = True)

# Normalize/Standardize Data
If the data will be used in machine learning algorithms, it is important to normalize it. Normalizing the data ensures consistency accross features, since features in a dataset can have different scales and units when we normalize the data we create a common range making them comparable.

In [11]:
# Importing necessary function for normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_columns = train.select_dtypes(include = ['float64','int64']).columns
train[numerical_columns] = scaler.fit_transform(train[numerical_columns])

In [12]:
train.head()

Unnamed: 0,unit_nr,time_cycles,setting_1,setting_2,setting_3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,0.0,0.0,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,0.633262,0.205882,0.199608,0.363986,0.0,0.333333,0.0,0.0,0.713178,0.724662
1,0.0,0.00277,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,0.765458,0.279412,0.162813,0.411312,0.0,0.333333,0.0,0.0,0.666667,0.731014
2,0.0,0.00554,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,0.795309,0.220588,0.171793,0.357445,0.0,0.166667,0.0,0.0,0.627907,0.621375
3,0.0,0.00831,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,0.889126,0.294118,0.174889,0.166603,0.0,0.333333,0.0,0.0,0.573643,0.662386
4,0.0,0.01108,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.746269,0.235294,0.174734,0.402078,0.0,0.416667,0.0,0.0,0.589147,0.704502
