In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from sklearn.ensemble import IsolationForest,RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Read data

In [None]:
# Load the raw dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel(io='data.xlsx')
# df

Random sample of rows (transposed for readability)

In [None]:
# Peek at five randomly chosen rows, transposed so every column reads as a row.
df.sample(n=5).transpose()

In [None]:
# Report the size of the frame, count fully duplicated rows, then keep only the
# first occurrence of each row.
print('Original data shape:', df.shape)
print('Duplicate rows:', df.duplicated().sum())
df = df.drop_duplicates()
print('Shape after removing duplicates:', df.shape)

Missing values

In [None]:
# Number of missing cells per column.
df.isna().sum()

In [None]:
# Pair every column name with its positional index.
[(position, name) for position, name in enumerate(df.columns)]

Discrete Features

In [None]:
# A column counts as discrete when it has fewer than 15 distinct values
# (a missing value is counted as one distinct value); everything else is continuous.
discrete_cols = [
    col for col in df.columns
    if df[col].nunique(dropna=False) < 15
]
continuous_cols = [
    col for col in df.columns
    if col not in discrete_cols
]
discrete_cols

Descriptive Stats

In [None]:
# Summary statistics produced by pandas' default describe() on the whole frame.
df.describe()

In [None]:
# Summary statistics restricted to the continuous columns, one row per column.
df.loc[:, continuous_cols].describe().transpose()

Correlation Heatmap

In [None]:
# Pairwise correlations of the columns, drawn as an annotated heatmap on a
# high-resolution figure.
corr_matrix = df.corr()
plt.figure(dpi=200)
sns.heatmap(corr_matrix, annot=True, linewidths=2, cmap='coolwarm')

Pair plots

In [None]:
# Lower-triangle scatter matrix of every column pair (semi-transparent points).
#
# sns.pairplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(dpi=200) only emits an empty extra figure and the dpi never
# reaches the pair grid. Apply the resolution through rcParams while the grid's
# figure is being created instead.
with plt.rc_context({'figure.dpi': 200}):
    pair_grid = sns.pairplot(df, corner=True, plot_kws={'alpha': 0.3})

Data Distribution Plots

In [None]:
# One histogram with a KDE overlay per column.
for column_name in df.columns:
    sns.displot(df[column_name], kde=True)

Box plots

In [None]:
# One box plot per column, each on its own figure titled with the column name.
for column_name in df.columns:
    fig, ax = plt.subplots()
    ax.set_title(column_name)
    sns.boxplot(df[column_name], ax=ax)
    plt.show()

Log transformation

In [None]:
# For every strictly positive numeric column, show the histogram of log(values)
# next to a normal probability plot, to judge how Gaussian the log-transformed
# data looks.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue
    values = df[col].dropna()
    # log is only defined for positive numbers: skip columns containing zeros *or*
    # negatives (negatives would turn into NaN and corrupt the probability plot).
    if values.empty or (values <= 0).any():
        continue
    series = np.log(values)

    # Explicit figure/axes: the column name becomes a figure-level title instead of
    # being set on a throw-away axes that the subplots would cover or replace.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle(col)
    series.hist(ax=ax1)
    stat.probplot(series, plot=ax2, rvalue=True)
    plt.show()

Square (power) transformation

In [None]:
# Disabled experiment, kept for reference: squares every continuous column
# (df[col]**2, i.e. a power transform rather than np.exp) and would draw its
# histogram beside a probability plot. Uncomment the lines below to run it.
# for col in continuous_cols:
#     series=pd.Series(df[col]**2)
#     plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
#     plt.title(col)
#     plt.subplot(1,2,1)
#     series.hist()

#     plt.subplot(1,2,2)
#     stat.probplot(series,plot=plt,rvalue=True)
#     plt.show()

Inverse transformation

In [None]:
# Disabled experiment, kept for reference: takes the reciprocal (1/df[col]) of
# every continuous column that contains no zero and would draw its histogram
# beside a probability plot. Uncomment the lines below to run it.
# for col in continuous_cols:
#     if 0 in df[col].unique(): continue
#     series=pd.Series(1/df[col])
#     plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
#     plt.title(col)
#     plt.subplot(1,2,1)
#     series.hist()

#     plt.subplot(1,2,2)
#     stat.probplot(series,plot=plt,rvalue=True)
#     plt.show()

Square root transformation

In [None]:
# Disabled experiment, kept for reference: takes the square root (df[col]**.5)
# of every continuous column and would draw its histogram beside a probability
# plot. Uncomment the lines below to run it.
# for col in continuous_cols:
#     series=pd.Series(df[col]**.5)
#     plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
#     plt.title(col)
#     plt.subplot(1,2,1)
#     series.hist()

#     plt.subplot(1,2,2)
#     stat.probplot(series,plot=plt,rvalue=True)
#     plt.show()

Save processed data

In [None]:
# Write the current frame to CSV without the index column.
df.to_csv(path_or_buf='processed_outliers.csv', index=False)

Train a random forest (embedded method) to obtain feature importances

In [None]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 10% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit the scaler on the training split only, then reuse it on the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 100-tree random forest regressor, scored with R^2 on the held-out rows.
rfc = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

Plot feature importances as a horizontal bar chart

In [None]:
# plt.figure(figsize=(10,10),dpi=200)
# Label each importance with its feature name and draw them in ascending order.
importances = pd.Series(rfc.feature_importances_, index=X.columns)
ax = importances.sort_values().plot.barh()
ax.set_xlabel('Feature importance measure')
Outlier detection

In [None]:
# https://youtu.be/O9VvmWj-JAk?si=21pVYN76owRisS1b
# Run an isolation forest over all columns; rows labelled -1 are counted as
# outliers and dropped from the frame.
detector = IsolationForest(contamination=0.1, random_state=0)
anomalies = detector.fit_predict(df)
outlier_mask = anomalies == -1
print('Outliers:', outlier_mask.sum())
df = df[~outlier_mask]
print('Final shape:', df.shape)
# df.to_csv('processed.csv',index=False)