In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTENC

Read data

In [None]:
# Load the worksheet at position 1 of the workbook and report its size.
df=pd.read_excel('data.xlsx',sheet_name=1)
print('Original data shape:',df.shape)
# Abbreviate the three aggregate column names; later cells refer to TA/CA/FA.
short_names={'Total aggregate':'TA','Coarse aggregate':'CA','Fine aggregate':'FA'}
df=df.rename(columns=short_names)
# Count rows that are exact repeats of an earlier row.
print('Duplicate rows:',df.duplicated().sum())

In [None]:
# Show each column name next to its positional index (handy for the iloc slices used below).
[(position,name) for position,name in enumerate(df.columns)]

Preprocessing

In [None]:
# Mean-impute the columns at positions 11..20 (SimpleImputer's default strategy is 'mean').
mean_filled=SimpleImputer(missing_values=np.nan).fit_transform(df.iloc[:,11:21])
df.iloc[:,11:21]=mean_filled

# Discard rows lacking Dnssm or any of the aggregate quantities.
df=df.dropna(subset=['Dnssm','FA','CA','TA'])
print('After removing missing data:',df.shape)

# Report every column that still has gaps, together with its number of missing cells.
print('Missing values:')
total_rows=len(df)
still_missing=[]
for column_name in df.columns:
    present=df[column_name].count()
    if total_rows>present:
        still_missing.append((column_name,total_rows-present))
print(still_missing)

# Encode the target as ordinal integers; labels not listed here become NaN.
resistance_codes={'Low':0,'Moderate':1,'High':2,'Very high':3,'Extremely high':4}
df['Migration resistance']=df['Migration resistance'].map(resistance_codes)

# TA and Dnssm are not kept in the processed data set.
df=df.drop(columns=['TA','Dnssm'])

print('Final shape:',df.shape)

Pair plots

In [None]:
# Disabled: pair plot of all columns of df (corner=True omits the upper triangle).
# sns.pairplot(df,corner=True)

Impute cement type

In [None]:
# Encode the cement-type labels as integer codes, then fill the missing codes with
# the value of the single nearest neighbour (KNNImputer, n_neighbors=1).

# nunique() ignores NaN, so the count is right whether or not values are missing
# (len(unique())-1 under-counts by one when the column has no NaN).
cement_types=df['Cement type'].nunique()

# The column to encode is addressed by position, as before.
# NOTE(review): position 2 is presumably 'Cement type' -- confirm against the column listing.
cement_col=df.columns[2]

# label encode cement types
# Fit on the observed labels only: NaN never becomes a class, so nothing has to be
# mapped back to NaN afterwards (that step raised when no value was missing).
# Observed labels still receive codes 0..k-1 in sorted order.
X_le=LabelEncoder()
observed=df[cement_col].notna().to_numpy()
codes=np.full(len(df),np.nan)
codes[observed]=X_le.fit_transform(df.loc[observed,cement_col])
# Assign a plain array so values are placed by position. df's index has gaps after
# the earlier dropna, and a Series carrying a fresh RangeIndex could be aligned on
# labels instead, scrambling the codes.
df[cement_col]=codes

# impute cement types
# Neighbour distances are computed over every column of df, including the encoded target.
df[cement_col]=KNNImputer(n_neighbors=1).fit_transform(df)[:,2]
print(df['Migration resistance'].value_counts())

Descriptive Stats

In [None]:
# Summary statistics with one row per column (transposed for readability).
df.describe().transpose()

Correlation Heatmap

In [None]:
# Large canvas so the per-cell annotations stay legible.
plt.figure(figsize=(25,25))
# Pairwise correlations between the columns of df, drawn as an annotated heatmap.
correlations=df.corr()
sns.heatmap(correlations,annot=True,linewidths=2,cmap='coolwarm')

Discrete Features

In [None]:
# A column counts as discrete when it holds fewer than 15 distinct values.
discrete_cols=[name for name in df.columns if len(df[name].unique())<15]
discrete_cols

Data Distribution Plots

In [None]:
# Disabled: distribution plot with a KDE overlay for every column that is not in
# discrete_cols. Uncomment to render one figure per such column.
# for col in df.columns:
#     if col in discrete_cols: continue
#     sns.displot(df[col],kde=True)

Box plots

In [None]:
# Disabled: one titled box plot per column that is not in discrete_cols.
# Uncomment to render the figures.
# for col in df.columns:
#     if col in discrete_cols: continue
#     plt.title(col)
#     sns.boxplot(df[col])
#     plt.show()

Outlier detection

In [None]:
# https://youtu.be/O9VvmWj-JAk?si=21pVYN76owRisS1b
# Fit an isolation forest on every column except 'Cement type'; fit_predict labels
# outliers as -1. contamination=0.1 sets the expected outlier share to 10%.
detector=IsolationForest(contamination=0.1,random_state=0)
detector_inputs=df.loc[:,~df.columns.isin(['Cement type'])]
anomalies=detector.fit_predict(detector_inputs)
is_outlier=anomalies==-1
print('Outliers:',is_outlier.sum())
# Keep only the rows that were not flagged.
df=df[~is_outlier]
print('Final shape:',df.shape)
# Class counts of the target after outlier removal.
df['Migration resistance'].value_counts()

Apply SMOTENC (SMOTE for mixed numeric and categorical features)

In [None]:
# Oversample with SMOTENC, declaring 'Cement type' as the categorical feature.
# The last column of df is passed as the target, all preceding columns as features.
sampler=SMOTENC(categorical_features=['Cement type'],random_state=0)
X,y=sampler.fit_resample(df.iloc[:,:-1],df.iloc[:,-1])
print(X.shape)
print(y.shape)
# Re-attach the target to the resampled features (df and X refer to the same frame).
df=X
df['Migration resistance']=y
# Remove exact duplicate rows, then save the processed table.
df=df.drop_duplicates()
df.to_csv('processed.csv',index=False)
# Class counts of the target in the saved data.
df['Migration resistance'].value_counts()