In [55]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTENC

Read data

In [56]:
# Load the worksheet at position 1 (sheet_name is a zero-based position when an int).
df = pd.read_excel('data.xlsx', sheet_name=1)
print('Original data shape:', df.shape)

# Abbreviate the three aggregate columns so later cells can refer to them briefly.
short_names = {
    'Total aggregate': 'TA',
    'Coarse aggregate': 'CA',
    'Fine aggregate': 'FA',
}
df = df.rename(columns=short_names)

# Report how many rows are exact duplicates of an earlier row.
print('Duplicate rows:', df.duplicated().sum())

Original data shape: (843, 24)
Duplicate rows: 0


Missing values

In [57]:
# Number of missing entries in every column.
df.isna().sum()

w/b                       0
Water                     0
Cement type              11
Cement                    0
Slag                      0
Fly ash                   0
Silica fume               0
Lime filler               0
FA                      275
CA                      276
TA                      112
Plasticizer             152
Superplasticizer        338
Air entraining          263
Comp. str. test age     302
Compressive strength    304
Air content             525
Spreed                  565
Slump                   499
Fresh density           447
Dry  density            679
Migration test age        0
Dnssm                    35
Migration resistance     35
dtype: int64

In [58]:
# Pair each column name with its position; later cells slice columns by position.
list(zip(range(len(df.columns)), df.columns))

[(0, 'w/b'),
 (1, 'Water'),
 (2, 'Cement type'),
 (3, 'Cement'),
 (4, 'Slag'),
 (5, 'Fly ash'),
 (6, 'Silica fume'),
 (7, 'Lime filler'),
 (8, 'FA'),
 (9, 'CA'),
 (10, 'TA'),
 (11, 'Plasticizer'),
 (12, 'Superplasticizer'),
 (13, 'Air entraining'),
 (14, 'Comp. str. test age'),
 (15, 'Compressive strength'),
 (16, 'Air content'),
 (17, 'Spreed'),
 (18, 'Slump'),
 (19, 'Fresh density'),
 (20, 'Dry  density'),
 (21, 'Migration test age'),
 (22, 'Dnssm'),
 (23, 'Migration resistance')]

Preprocessing

In [59]:
# Mean-impute the columns at positions 11-20 ('Plasticizer' through 'Dry  density');
# SimpleImputer's default strategy is the column mean.
df.iloc[:, 11:21] = SimpleImputer(missing_values=np.nan).fit_transform(df.iloc[:, 11:21])

# Drop rows that lack an aggregate content or the measured Dnssm value.
# The explicit copy makes the filtered frame independent of the original, so the
# column assignment further down cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['FA', 'CA', 'TA', 'Dnssm']).copy()
print('After removing missing data:', df.shape)
print('Missing values:')
print([(col, len(df) - df[col].count()) for col in df.columns if len(df) > df[col].count()])

# Encode the ordinal target as integers 0 (Low) .. 4 (Extremely high).
resistance_levels = {'Low': 0, 'Moderate': 1, 'High': 2, 'Very high': 3, 'Extremely high': 4}
resistance_codes = df['Migration resistance'].map(resistance_levels)

# Series.map turns every label that is not a dictionary key into NaN. Stop here with a
# clear message instead of letting a NaN target surface as an obscure error later on.
unmapped_labels = df.loc[resistance_codes.isna(), 'Migration resistance'].unique().tolist()
if unmapped_labels:
    raise ValueError(f'Unexpected migration resistance labels: {unmapped_labels}')

df['Migration resistance'] = resistance_codes
# TA was only used in the row filter above; it is not kept as a feature.
df = df.drop('TA', axis=1)

print('Final shape:', df.shape)

After removing missing data: (551, 24)
Missing values:
[('Cement type', 6)]
Final shape: (551, 23)


Impute cement type

In [60]:
# Label-encode the observed cement types only. Missing entries stay NaN so the imputer
# below can fill them. Encoding just the non-null rows avoids asking LabelEncoder to
# encode NaN and then look its code up again, which raises when the column has no gaps.
cement_pos = df.columns.get_loc('Cement type')
cement_raw = df.iloc[:, cement_pos]
has_cement = cement_raw.notna()

X_le = LabelEncoder()
# Give the codes the same index as df: rows were dropped earlier, so df's index has
# gaps and a fresh 0..n-1 index would not line up with it under label alignment.
cement_codes = pd.Series(np.nan, index=df.index, dtype=float)
cement_codes.loc[has_cement] = X_le.fit_transform(cement_raw[has_cement])
df.iloc[:, cement_pos] = cement_codes

# Fill each missing cement type with the code of its single nearest neighbour, so the
# imputed value is always an existing category code.
# NOTE(review): df still contains 'Dnssm' and 'Migration resistance' at this point, so
# both target columns take part in the neighbour search - confirm this is intended.
df.iloc[:, cement_pos] = KNNImputer(n_neighbors=1).fit_transform(df)[:, cement_pos]
print(df['Migration resistance'].value_counts())

Migration resistance
2    232
0     93
1     91
3     84
4     51
Name: count, dtype: int64


Discrete Features

In [61]:
# A column counts as discrete when it has fewer than 15 distinct values
# (NaN, if present, is counted as a value).
distinct_counts = df.nunique(dropna=False)
discrete_cols = distinct_counts[distinct_counts < 15].index.tolist()
discrete_cols

['Cement type',
 'Lime filler',
 'Plasticizer',
 'Air entraining',
 'Comp. str. test age',
 'Migration test age',
 'Migration resistance']

Drop target vector (Dnssm)

In [62]:
# Remove the measured Dnssm column, leaving 'Migration resistance' as the only target.
df = df.drop(columns='Dnssm')

# Every remaining column that was not flagged as discrete is treated as continuous.
discrete_lookup = set(discrete_cols)
continuous_cols = [name for name in df.columns if name not in discrete_lookup]

Apply SMOTE-NC oversampling to balance the migration-resistance classes

In [63]:
# Oversample the minority classes. The target is the last column of df; 'Cement type'
# is the only column declared categorical, all others are treated as continuous.
# NOTE(review): resampling is applied to the whole frame before train_test_split is
# called in the modelling cells, so synthetic rows may derive from rows that end up in
# the test set - confirm this is acceptable for the reported accuracy.
sampler = SMOTENC(categorical_features=['Cement type'], random_state=0)
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X, y = sampler.fit_resample(features, target)

# Re-attach the target and keep only the first occurrence of identical rows.
df = X
df['Migration resistance'] = y
df = df.drop_duplicates()
print(df.shape)

# Class balance after resampling.
df['Migration resistance'].value_counts()

(1159, 22)


Migration resistance
3    232
2    232
1    232
0    232
4    231
Name: count, dtype: int64

Category encoding

In [64]:
# One-hot encode the cement type, dropping the first level as the reference category.
# The column is named explicitly because it holds numeric label codes: without
# `columns=`, get_dummies converts only object/string/category columns, so the encoding
# would be skipped silently if this column ever carried a numeric dtype.
df_encoded = pd.get_dummies(df, columns=['Cement type'], drop_first=True, dtype=int)

Model training using RF embedded approach to get feature importances

In [65]:
# Predictors: the one-hot encoded frame without the target. Labels: the encoded class.
X = df_encoded.drop(columns='Migration resistance')
y = df['Migration resistance']

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardise using statistics learned from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit a 100-tree random forest and report hold-out accuracy.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

# Impurity-based feature importances, most important first.
importances = pd.Series(rfc.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)

0.8706896551724138


w/b                     0.098371
Migration test age      0.096882
Cement                  0.081170
Water                   0.070776
Superplasticizer        0.070382
Slump                   0.070160
CA                      0.069363
FA                      0.067478
Compressive strength    0.064707
Silica fume             0.045835
Fly ash                 0.040657
Slag                    0.037032
Air content             0.036888
Fresh density           0.034402
Dry  density            0.027327
Comp. str. test age     0.026241
Spreed                  0.017074
Air entraining          0.009187
Cement type_8.0         0.008463
Cement type_7.0         0.005167
Lime filler             0.004535
Plasticizer             0.004527
Cement type_1.0         0.003453
Cement type_6.0         0.003220
Cement type_3.0         0.002172
Cement type_10.0        0.001701
Cement type_9.0         0.001573
Cement type_4.0         0.000823
Cement type_5.0         0.000312
Cement type_2.0         0.000122
dtype: float64

Feature importance by removing cement type

In [67]:
# Same experiment as the previous cell, but built from df (not df_encoded) with the
# cement type column removed altogether.
excluded = ['Migration resistance', 'Cement type']
X = df.drop(columns=excluded)
y = df['Migration resistance']

# Identical split settings to the previous cell: 20% hold-out, seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Scale with training-set statistics, then apply the same transform to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 100-tree random forest; print hold-out accuracy.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

# Rank the remaining features by impurity-based importance.
ranked = pd.Series(rfc.feature_importances_, index=X.columns)
ranked.sort_values(ascending=False)

0.8706896551724138


w/b                     0.107135
Migration test age      0.101919
Cement                  0.085830
FA                      0.073865
Water                   0.071294
Superplasticizer        0.070927
Compressive strength    0.069711
CA                      0.069042
Slump                   0.058615
Silica fume             0.045170
Fly ash                 0.042661
Slag                    0.040662
Air content             0.038947
Fresh density           0.036266
Dry  density            0.028725
Comp. str. test age     0.025040
Spreed                  0.014569
Air entraining          0.010710
Lime filler             0.004687
Plasticizer             0.004226
dtype: float64

Save processed data

In [None]:
# Write the resampled, de-duplicated frame (cement type kept as label codes, not
# one-hot encoded) to disk without the index column.
df.to_csv(path_or_buf='processed_outliers.csv', index=False)