In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import IsolationForest,RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTENC

Read data

In [None]:
# Load the worksheet at position 1 of the workbook and shorten the three
# aggregate column names that are referenced later in the notebook.
short_names = {'Total aggregate': 'TA', 'Coarse aggregate': 'CA', 'Fine aggregate': 'FA'}
df = pd.read_excel('data.xlsx', sheet_name=1)
print('Original data shape:', df.shape)
df = df.rename(columns=short_names)

# duplicated() flags every repeat of an earlier, fully identical row.
n_duplicates = df.duplicated().sum()
print('Duplicate rows:', n_duplicates)

Missing values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Pair every column name with its integer position (used for the .iloc
# slices in the cells below).
list(zip(range(len(df.columns)), df.columns))

Preprocessing

In [None]:
# Mean-impute the columns at positions 11-20 (SimpleImputer is used with its
# default strategy).
mean_imputer = SimpleImputer(missing_values=np.nan)
df.iloc[:, 11:21] = mean_imputer.fit_transform(df.iloc[:, 11:21])

# Rows missing any of these measurements are dropped instead of imputed.
required_cols = ['FA', 'CA', 'TA', 'Dnssm']
df = df.dropna(subset=required_cols)
print('After removing missing data:', df.shape)

# Report (column, missing count) for every column that still has gaps.
print('Missing values:')
n_rows = len(df)
still_missing = []
for col in df.columns:
    n_present = df[col].count()
    if n_rows > n_present:
        still_missing.append((col, n_rows - n_present))
print(still_missing)

# Encode the target classes as integers 0-4; TA is not used after this point.
resistance_codes = {'Low': 0, 'Moderate': 1, 'High': 2, 'Very high': 3, 'Extremely high': 4}
df['Migration resistance'] = df['Migration resistance'].map(resistance_codes)
df = df.drop(columns='TA')

print('Final shape:', df.shape)

Impute cement type

In [None]:
# Label-encode the column at position 2 (the cement type, per this section's
# heading) while keeping its missing entries as NaN so they can be imputed.
cement_raw = df.iloc[:, 2]
cement_missing = cement_raw.isna().to_numpy()

X_le = LabelEncoder()
# Work on a plain float array and mask by position. A Series built from the
# encoder output would carry a fresh 0..n-1 index, while df's index has gaps
# after the earlier dropna; depending on the pandas version an .iloc
# assignment of such a Series can align on labels and scramble the codes.
# Masking by position also avoids X_le.transform([np.nan]), which raises when
# the column has no missing values.
cement_codes = X_le.fit_transform(cement_raw).astype(float)
cement_codes[cement_missing] = np.nan
df.iloc[:, 2] = cement_codes

# Fill each missing code from the single nearest neighbour; the imputer is
# fitted on every column of df and only column 2 of its output is kept.
df.iloc[:, 2] = KNNImputer(n_neighbors=1).fit_transform(df)[:, 2]
print(df['Migration resistance'].value_counts())

Descriptive Stats

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between all columns, drawn on
# a large canvas so the cell annotations stay legible.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(correlations, cmap='coolwarm', linewidths=2, annot=True, ax=ax)

Discrete Features

In [None]:
# Split the columns by cardinality: fewer than 15 distinct values counts as
# discrete, everything else as continuous.
discrete_cols = []
continuous_cols = []
for col in df.columns:
    if len(df[col].unique()) < 15:
        discrete_cols.append(col)
    else:
        continuous_cols.append(col)
discrete_cols

Pair plots

In [None]:
# Scatter-matrix of the continuous features; corner=True draws only the lower
# triangle.
continuous_df = df[continuous_cols]
sns.pairplot(data=continuous_df, corner=True)

Data Distribution Plots

In [None]:
# One histogram with a KDE overlay per continuous feature.
for col in continuous_cols:
    column_values = df[col]
    sns.displot(column_values, kde=True)

Box plots

In [None]:
# One box plot per continuous feature, titled with the column name.
for col in continuous_cols:
    ax = sns.boxplot(df[col])
    ax.set_title(col)
    plt.show()

Drop target vector (Dnssm)

In [None]:
# Remove Dnssm and rebuild the continuous-column list so it no longer
# references the dropped column.
df = df.drop(columns='Dnssm')
discrete_lookup = set(discrete_cols)
continuous_cols = [col for col in df.columns if col not in discrete_lookup]

Log transformation

In [None]:
# Preview a natural-log transform of each continuous feature: histogram on the
# left, normal probability plot (with R^2) on the right.
for col in continuous_cols:
    # log is only defined for strictly positive values; skip any column that
    # contains zeros or negatives instead of plotting -inf/NaN.
    if (df[col] <= 0).any():
        continue
    series = np.log(df[col])

    # Create both panels explicitly and title the figure. Calling plt.title()
    # before plt.subplot() puts the title on an implicit full-figure axes that
    # the subplots then overlap.
    fig, (ax_hist, ax_prob) = plt.subplots(1, 2)
    fig.suptitle(col)
    series.hist(ax=ax_hist)
    stat.probplot(series, plot=ax_prob, rvalue=True)
    plt.show()

Square (power of 2) transformation

In [None]:
# Preview a square (x**2) transform of each continuous feature: histogram on
# the left, normal probability plot (with R^2) on the right.
for col in continuous_cols:
    series = df[col] ** 2

    # Create both panels explicitly and title the figure. Calling plt.title()
    # before plt.subplot() puts the title on an implicit full-figure axes that
    # the subplots then overlap.
    fig, (ax_hist, ax_prob) = plt.subplots(1, 2)
    fig.suptitle(col)
    series.hist(ax=ax_hist)
    stat.probplot(series, plot=ax_prob, rvalue=True)
    plt.show()

Inverse transformation

In [None]:
# Preview a reciprocal (1/x) transform of each continuous feature: histogram
# on the left, normal probability plot (with R^2) on the right.
for col in continuous_cols:
    # 1/0 is undefined, so columns containing a zero are skipped.
    if (df[col] == 0).any():
        continue
    series = 1 / df[col]

    # Create both panels explicitly and title the figure. Calling plt.title()
    # before plt.subplot() puts the title on an implicit full-figure axes that
    # the subplots then overlap.
    fig, (ax_hist, ax_prob) = plt.subplots(1, 2)
    fig.suptitle(col)
    series.hist(ax=ax_hist)
    stat.probplot(series, plot=ax_prob, rvalue=True)
    plt.show()

Square root transformation

In [None]:
# Preview a square-root transform of each continuous feature: histogram on the
# left, normal probability plot (with R^2) on the right.
for col in continuous_cols:
    # The real square root of a negative number is NaN; skip such columns
    # rather than feeding NaNs to the plots.
    if (df[col] < 0).any():
        continue
    series = df[col] ** .5

    # Create both panels explicitly and title the figure. Calling plt.title()
    # before plt.subplot() puts the title on an implicit full-figure axes that
    # the subplots then overlap.
    fig, (ax_hist, ax_prob) = plt.subplots(1, 2)
    fig.suptitle(col)
    series.hist(ax=ax_hist)
    stat.probplot(series, plot=ax_prob, rvalue=True)
    plt.show()

Outlier detection

In [None]:
# Reference: https://youtu.be/O9VvmWj-JAk?si=21pVYN76owRisS1b
# Score every row with an isolation forest on all columns except the cement
# type; fit_predict labels anomalies as -1.
not_cement = df.columns != 'Cement type'
detector = IsolationForest(contamination=0.1, random_state=0)
anomalies = detector.fit_predict(df.loc[:, not_cement])

is_inlier = anomalies != -1
print('Outliers:', (~is_inlier).sum())
df = df[is_inlier]
print('Final shape:', df.shape)
# Class balance of the target after the flagged rows are removed.
df['Migration resistance'].value_counts()

Apply SMOTENC (SMOTE for mixed numeric and categorical features)

In [None]:
# Oversample with SMOTENC, treating 'Cement type' as categorical. The last
# column of df is the target; everything before it is a feature.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
sampler = SMOTENC(categorical_features=['Cement type'], random_state=0)
X, y = sampler.fit_resample(features, target)

# Re-attach the target as the last column, then drop exact duplicate rows.
X['Migration resistance'] = y
df = X.drop_duplicates()
print(df.shape)
# Class balance of the target after resampling.
df['Migration resistance'].value_counts()

Category encoding

In [None]:
# One-hot encode the cement type. The column list must be passed as `cols=`:
# the first positional parameter of category_encoders.OneHotEncoder is
# `verbose`, so a bare positional list leaves `cols` unset and the encoder
# falls back to auto-detecting columns by dtype.
df_encoded = OneHotEncoder(cols=['Cement type']).fit_transform(df)

Model training using RF embedded approach to get feature importances

In [None]:
# Features: every encoded column except the target (last column) and the
# 'Cement type_1' dummy (presumably dropped as the redundant reference level).
X = df_encoded.iloc[:, :-1].drop(columns='Cement type_1')
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Small forest (15 trees); report hold-out accuracy, then rank the features.
rfc = RandomForestClassifier(n_estimators=15, random_state=0).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))
importances = pd.Series(rfc.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)

Feature importance after removing cement type

In [None]:
# Features: every column except the target (last column) and 'Cement type'.
X = df.iloc[:, :-1].drop(columns='Cement type')
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 100-tree forest; report hold-out accuracy, then rank the features.
rfc = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))
importances = pd.Series(rfc.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)

Save processed data

In [None]:
# Write the processed frame to disk without the row index.
output_path = 'processed.csv'
df.to_csv(output_path, index=False)