In [1]:
import numpy as np
import pandas as pd

truck = pd.read_csv('fleet_truck.csv', header = 0)

### 1. Select columns with more than one distinct observations (0:26)

In [2]:
truck = truck.iloc[:,0:26]
truck.shape

(8307, 26)

### 2. Drop one of the collinear pairs

In [3]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(truck, 10))

Top Absolute Correlations
Vibration        Speed_OBD        1.000000
Speed_sensor     Vibration        1.000000
                 Speed_OBD        1.000000
Intake_Pressure  Turbo_Boost      0.999954
Litres_Per_km    CO2              0.998375
Trip_Distance    Trip_Time        0.994790
Speed_OBD        Speed_GPS        0.985869
Speed_sensor     Speed_GPS        0.985869
Vibration        Speed_GPS        0.985869
GPS_Altitude     Trip_Distance    0.868121
dtype: float64


In [4]:
# threshold 0.9
truck = truck.drop(['Vibration', 'Speed_OBD', 'Turbo_Boost', 'CO2', \
                   'Speed_GPS', 'Trip_Time'], axis = 1)
truck.shape

(8307, 20)

### 3. Look at coefficient of variation

If the interval features are likely to be constant, drop it

In [5]:
result = pd.DataFrame()
for thisVar in truck.iloc[:,1:26].columns:
    minimum = min(truck[thisVar])
    maximum = max(truck[thisVar])
    mean = np.mean(truck[thisVar])
    std = np.std(truck[thisVar], ddof = 1)
    coefVar = std / max(1.0, abs(mean))
    if mean < 0:
        coefVar = -coefVar
  
    result = result.append([[thisVar, minimum, maximum, mean, std, \
                            coefVar]],ignore_index = True)

result = result.rename(columns= {0:'Feature Field', 1: 'minimum', 
                                 2: 'maximum', 3: 'mean',
                                 4: 'std', 5 :'Coefficient of Variation'})
result
result.sort_values(by='Coefficient of Variation')

Unnamed: 0,Feature Field,minimum,maximum,mean,std,Coefficient of Variation
18,Accel_Ssor_Total,-0.735121,0.46005,-0.003159,0.063671,-0.063671
13,GPS_Latitude,47.022808,49.81354,48.547051,0.571127,0.011764
8,Voltage,9.94,14.46,14.171672,0.219311,0.015475
2,Coolant_Temp,79.0,94.0,87.736006,2.20052,0.025081
11,Engine_Oil_Temp,74.0,92.0,84.526905,2.318508,0.027429
12,GPS_Longitude,6.537594,10.216362,8.56992,0.640588,0.074748
7,Throttle_Pos,9.803922,89.01961,75.258476,9.414729,0.125099
9,Ambient,5.0,9.0,7.309739,0.915003,0.125176
3,Intake_Pressure,99.0,255.0,119.41134,20.573149,0.172288
5,Intake_Air,6.0,18.0,10.664139,2.500781,0.234504


In [6]:
# threshold 0.1
truck = truck.drop(['GPS_Latitude', 'Voltage', 'Coolant_Temp', 'Engine_Oil_Temp',\
                    'Accel_Ssor_Total', 'GPS_Longitude'], axis = 1)
truck.shape

(8307, 14)

### 4. Look at variety among catagories

In [7]:
result = pd.DataFrame()
for thisVar in truck.iloc[:,1:26].columns:
    thisDType = truck[thisVar].dtypes
    
    
    nRow = truck.shape[0]
    nNaN = truck[thisVar].isna().sum()
    percentNaN = 100.0 * (nNaN / nRow)
    
    entropy = np.NaN
    persentEntropy = np.NaN
    mean = np.NaN
    coefVar = np.NaN
    
    uniqueValue = truck[thisVar].value_counts()
    nValid = np.sum(uniqueValue)
    uniqueProp = uniqueValue / nValid
    nUnique = uniqueValue.size
    
    entropy = - np.sum(uniqueProp * np.log2(uniqueProp))
    if nUnique > 1:
        e0 = nValid - nUnique + 1
        e0 = np.log2(nValid) - (e0 / nValid) * np.log2(e0)
        e1 = np.log2(nUnique)
        if e1 > e0:
            percentEntropy = 100.0 * ((entropy - e0) / (e1 - e0))
    
    
    result = result.append([[thisVar, thisDType, nValid, nNaN, percentNaN, \
                             nUnique, entropy, percentEntropy]],ignore_index = True)

result = result.rename(columns= {0:'Feature Field', 1: 'DType', 
                                 2: 'Number of Valids', 3: 'Number of NaNs',
                                 4: 'Percent of NaNs', 5: 'Number of Unique Values',
                                 6: 'Entropy', 7: 'Percent of Entropy'})
result
result.sort_values(by='Percent of Entropy')

Unnamed: 0,Feature Field,DType,Number of Valids,Number of NaNs,Percent of NaNs,Number of Unique Values,Entropy,Percent of Entropy
12,Litres_Per_km,float64,8149,158,1.90201,5399,10.091617,29.308033
8,Accel,float64,8307,0,0.0,114,3.354689,47.588151
6,Throttle_Pos,float64,8307,0,0.0,127,3.539117,49.042356
2,Intake_Pressure,int64,8307,0,0.0,148,5.327404,72.934743
7,Ambient,int64,8307,0,0.0,5,1.905363,82.005557
10,GPS_Altitude,int64,8307,0,0.0,628,8.157356,86.143091
4,Intake_Air,int64,8307,0,0.0,13,3.212051,86.726943
11,Trip_Distance,float64,8307,0,0.0,7171,12.685931,89.634428
1,Engine_Load,float64,8307,0,0.0,256,7.217843,89.649525
5,Flow_Rate,float64,8307,0,0.0,1488,9.866022,91.559266


### Drop missing values

In [8]:
truck = truck.dropna(axis = 0)
truck.shape

(8149, 14)

In [9]:
truck.head()

Unnamed: 0,Maintenance_flag,Speed_sensor,Engine_Load,Intake_Pressure,Engine_RPM,Intake_Air,Flow_Rate,Throttle_Pos,Ambient,Accel,GPS_Bearing,GPS_Altitude,Trip_Distance,Litres_Per_km
0,0,35,21.568628,116,1115.5,10,18.33,80.0,7,27.843138,75.2,164,310.262,2.351502
1,0,142,20.392157,135,1782.5,16,35.41,80.0,8,34.509804,274.4,436,161.02463,1.244649
2,0,128,43.52941,109,1588.0,9,27.08,80.0,8,14.901961,257.1,508,158.23788,2.1241
3,0,117,99.60784,128,1899.5,13,36.99,80.0,7,43.92157,330.7,618,148.82233,0.0
4,0,98,73.333336,123,1230.5,9,21.63,80.0,7,14.901961,3.0,154,296.00378,4.031921


In [36]:
import random
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

In [37]:
features = truck.iloc[:,1:]
targets = truck.iloc[:,:1]

In [38]:
kmeans = KMeans(n_clusters=2).fit(features)

In [39]:
pred = pd.DataFrame(kmeans.labels_,columns=["Group"])

In [40]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(targets, pred).ravel()
print(confusion_matrix(targets, pred))
print(tn,fp,fn,tp)
print("accuracy rate:", (tn+tp)/(tn+fp+fn+tp))

[[2175 4107]
 [1206  661]]
2175 4107 1206 661
accuracy rate: 0.3480181617376365


In [41]:
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=2,random_state=9).fit(features)
pred = pd.DataFrame(kmeans.labels_,columns=["Group"])

tn, fp, fn, tp = confusion_matrix(targets, pred).ravel()
print(confusion_matrix(targets, pred))
print(tn,fp,fn,tp)
print("accuracy rate:", (tn+tp)/(tn+fp+fn+tp))

[[3317 2965]
 [ 445 1422]]
3317 2965 445 1422
accuracy rate: 0.5815437476991042


In [42]:
from sklearn.cluster import Birch
kmeans =Birch(n_clusters=2).fit(features)
pred = pd.DataFrame(kmeans.labels_,columns=["Group"])

tn, fp, fn, tp = confusion_matrix(targets, pred).ravel()
print(confusion_matrix(targets, pred))
print(tn,fp,fn,tp)
print("accuracy rate:", (tn+tp)/(tn+fp+fn+tp))

[[4140 2142]
 [ 875  992]]
4140 2142 875 992
accuracy rate: 0.6297705239906737


In [17]:
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split as ts


X_train,X_test,y_train,y_test = ts(features,targets,test_size=0.3)

In [20]:
import warnings
from sklearn import metrics
warnings.filterwarnings('ignore')
# kernel = 'rbf'
clf_rbf = svm.SVC(kernel='rbf')
clf_rbf.fit(X_train,y_train)
y_pred = clf_rbf.predict(X_test)

score_rbf = clf_rbf.score(X_test,y_test)
print("The score of rbf is : %f"%score_rbf)
print("Accuracy:",metrics.accuracy_score(y_test,y_pred))

The score of rbf is : 0.759918
Accuracy: 0.7599182004089979


In [19]:
# kernel = 'poly'
clf_poly = svm.SVC(kernel='poly')
clf_poly.fit(X_train,y_train)
score_poly = clf_poly.score(X_test,y_test)
print("The score of poly is : %f"%score_poly)

The score of poly is : 0.769734
