### The dataset consists of data collected from heavy Scania trucks in everyday usage. The system in focus is the Air Pressure system (APS) which generates pressurized air that is utilized in various functions in a truck, such as breaking and gear changes. The datasets' positive class consists of component failures for a specific component of the APS system. The negative class consists of trucks with failures for components not related to the APS. 
### The training set contains 60000 examples in total in which 59000 belong to the negative class and 1000 positive class. The test set contains 16000 examples. There are 171 attributes per record.



In [1]:
#Import required libraries
import numpy as np 
import pandas as pd
import scipy.stats as s
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
import warnings
warnings.filterwarnings('ignore')


In [3]:
df_train=pd.read_csv("aps_failure_training_set.csv",header='infer',skiprows=20)
df_test=pd.read_csv("aps_failure_test_set.csv",header='infer',skiprows=20)

In [4]:
df_train.shape

(60000, 171)

In [5]:
df_test.shape

(16000, 171)

In [6]:
df_train.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [7]:
df_test.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,neg,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0
2,neg,66002,2,212,112,0,0,0,0,0,...,495076,380368,440134,269556,1315022,153680,516,0,0,0
3,neg,59816,na,1010,936,0,0,0,0,0,...,540820,243270,483302,485332,431376,210074,281662,3232,0,0
4,neg,1814,na,156,140,0,0,0,0,0,...,7646,4144,18466,49782,3176,482,76,0,0,0


In [8]:
data=pd.concat([df_train,df_test])
data.head()



Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [9]:
data.replace('na',np.nan,inplace=True)


In [10]:

# replace 'neg' with 0 and 'pos' with 1 in 'Class column'
data['class'].replace( {'neg': 0, 'pos': 1}, inplace=True)
data.head()


Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,,2130706438,280.0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,0,33058,,0,,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,0,41040,,228,100.0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,0,12,0.0,70,66.0,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,0,60874,,1368,458.0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [11]:
# Calculating the percentage of values missing in each column of our dataset
percent_missing= (data.isnull().sum() * 100) / len(data)
idx= np.argsort(-percent_missing)     #sorting the percent values in descending order
columns= data.columns
sorted(percent_missing)[:158:-1]

[82.09605263157894,
 81.18815789473685,
 79.55394736842105,
 77.24868421052632,
 77.22631578947369,
 77.22631578947369,
 73.31842105263158,
 65.91447368421052,
 45.39868421052632,
 38.32631578947368,
 24.792105263157893,
 24.792105263157893]

In [12]:
#drop columns which has more than 80% nan value
columns_to_drop = list(percent_missing[percent_missing >= 30].index)
data_new = data.drop(columns_to_drop, axis=1)


In [13]:
columns_to_drop

['ab_000',
 'bk_000',
 'bl_000',
 'bm_000',
 'bn_000',
 'bo_000',
 'bp_000',
 'bq_000',
 'br_000',
 'cr_000']

In [14]:
data_new

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,2130706438,280,0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,0,33058,0,,0,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,0,41040,228,100,0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,0,12,70,66,0,10,0,0,0,318,...,240,46,58,44,10,0,0,0,4,32
4,0,60874,1368,458,0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0,81852,2130706432,892,0,0,0,0,0,0,...,632658,273242,510354,373918,349840,317840,960024,25566,0,0
15996,0,18,52,46,8,26,0,0,0,0,...,266,44,46,14,2,0,0,0,0,0
15997,0,79636,1670,1518,0,0,0,0,0,0,...,806832,449962,778826,581558,375498,222866,358934,19548,0,0
15998,0,110,36,32,0,0,0,0,0,0,...,588,210,180,544,1004,1338,74,0,0,0


In [15]:
columns=data.columns

In [16]:
data = data.fillna(0)

In [17]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 

In [18]:
imputer = imputer.fit(data)                                   #fitting the imputer on the train data
data= imputer.transform(data)                        #transforming on the train data and test data

In [19]:
df = pd.DataFrame(data, columns=columns)     #preparing new dataframe with imputations and undropped attributes

In [20]:
X=df.drop('class',axis=1)
y=df['class']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.70, random_state=42)


## classification using Naive Bayes 

In [22]:
from sklearn.naive_bayes import GaussianNB 
model = GaussianNB()

In [23]:
model.fit(X_train,y_train)

GaussianNB()

In [25]:
y_pred = model.predict(X_test)

In [26]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98     50777
         1.0       0.89      0.35      0.51      2423

    accuracy                           0.97     53200
   macro avg       0.93      0.68      0.75     53200
weighted avg       0.97      0.97      0.96     53200



## classification using Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [28]:
model.fit(X_train, y_train)

LogisticRegression()

In [29]:
y_pred = model.predict(X_test) #Predict the response for test dataset

In [31]:
Y_pred = y_pred.reshape(-1)

In [32]:
print(classification_report(Y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     52317
         1.0       0.57      0.63      0.60       883

    accuracy                           0.99     53200
   macro avg       0.78      0.81      0.79     53200
weighted avg       0.99      0.99      0.99     53200



## classification using svm

In [33]:
from sklearn.svm import SVC
model =SVC()

In [34]:
model.fit(X_train,y_train)

SVC()

In [35]:
y_pred = model.predict(X_test)

In [36]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     53187
         1.0       0.01      0.69      0.02        13

    accuracy                           0.98     53200
   macro avg       0.50      0.84      0.50     53200
weighted avg       1.00      0.98      0.99     53200

