In [13]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from numpy import asarray
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

In [2]:
# Columns to load from the CSV; they are split into features and labels in
# the next cell.
columns = [
    'CLOC', 'LC',
    'MLOC', 'LM',
    'LMC', 'LMCS',
    'LPLPAR', 'LPL',
    'NOC', 'LLFPAR',
    'NOO', 'LLF',
]
df = pd.read_csv('code_smell_dataset.csv', usecols=columns)
df

Unnamed: 0,CLOC,LC,MLOC,LM,LMC,LMCS,LPLPAR,LPL,NOC,LLFPAR,NOO,LLF
0,38.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44.0,1.0,34.0,0.0,2.0,0.0,13.0,1.0,0.0,0.0,0.0,0.0
3,44.0,1.0,34.0,0.0,2.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0
4,44.0,1.0,34.0,0.0,2.0,0.0,13.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
375525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,1.0,7.0,0.0
375526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,2.0,0.0
375527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,2.0,5.0,0.0
375528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,4.0,0.0


In [3]:
# Separating data (X) and labels (Y)
label_columns = ['LC', 'LM', 'LMCS', 'LPL', 'LLF']
feature_columns = ['CLOC', 'MLOC', 'LMC', 'LPLPAR', 'NOC', 'LLFPAR', 'NOO']

# `drop` returns a new frame, so `df` itself is left unchanged.
X = df.drop(columns=label_columns)
Y = df.drop(columns=feature_columns)

print(X.head())
print(Y.head())

   CLOC  MLOC  LMC  LPLPAR  NOC  LLFPAR  NOO
0  38.0   0.0  0.0     0.0  0.0     0.0  0.0
1  42.0   0.0  0.0     0.0  0.0     0.0  0.0
2  44.0  34.0  2.0    13.0  0.0     0.0  0.0
3  44.0  34.0  2.0     6.0  0.0     0.0  0.0
4  44.0  34.0  2.0    13.0  0.0     0.0  0.0
    LC   LM  LMCS  LPL  LLF
0  1.0  0.0   0.0  0.0  0.0
1  1.0  0.0   0.0  0.0  0.0
2  1.0  0.0   0.0  1.0  0.0
3  1.0  0.0   0.0  1.0  0.0
4  1.0  0.0   0.0  1.0  0.0


In [4]:
# Handling Outliers
def handle_outliers(df, IQR, variable):
    """Cap the outliers of one column in place using the 1.5 * IQR rule.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence, where Q1 and Q3 are
    the 0.25 and 0.75 quantiles of ``df[variable]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column ``variable`` is overwritten in place.
    IQR : float
        Interquartile range used to build the fences.
    variable : str
        Name of the column to cap.

    Returns
    -------
    None

    Notes
    -----
    When ``IQR`` is not strictly positive (zero, negative or NaN) the column
    is left untouched. With a zero IQR both fences coincide with the
    quartiles, so capping would overwrite every value with a single constant
    and erase the feature. That situation is typical for zero-inflated
    columns, where the middle half of the values is identical.
    """
    # `not IQR > 0` also catches NaN, which `IQR <= 0` would let through.
    if not IQR > 0:
        return

    lower_bridge = df[variable].quantile(0.25) - (IQR * 1.5)
    upper_bridge = df[variable].quantile(0.75) + (IQR * 1.5)

    # Both fences are computed before any write, so capping one side cannot
    # shift the quantiles used for the other side.
    df[variable] = df[variable].clip(lower=lower_bridge, upper=upper_bridge)

def _column_iqr(name):
    """Return the interquartile range (Q3 - Q1) of column ``name`` in ``df``."""
    series = df[name]
    return series.quantile(0.75) - series.quantile(0.25)


# One IQR per feature column, measured on the full dataset `df`.
IQR_CLOC = _column_iqr('CLOC')
IQR_MLOC = _column_iqr('MLOC')
IQR_LMC = _column_iqr('LMC')
IQR_LPLPAR = _column_iqr('LPLPAR')
IQR_NOC = _column_iqr('NOC')
IQR_LLFPAR = _column_iqr('LLFPAR')
IQR_NOO = _column_iqr('NOO')

# Cap every feature column of X in place, in the same order as before.
for column_name, column_iqr in (
    ('CLOC', IQR_CLOC),
    ('MLOC', IQR_MLOC),
    ('LMC', IQR_LMC),
    ('LPLPAR', IQR_LPLPAR),
    ('NOC', IQR_NOC),
    ('LLFPAR', IQR_LLFPAR),
    ('NOO', IQR_NOO),
):
    handle_outliers(X, column_iqr, column_name)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Visualize the effect of scaling
# Create both panels in one call so no stray full-figure Axes is left behind
# underneath the two subplots.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(15, 5))

sns.boxplot(data=X_train, ax=ax_before)
ax_before.set_title('X_train Before Scaling')

# Re-attach the training column names to the scaled values so both panels
# share the same x-axis labels.
sns.boxplot(data=pd.DataFrame(X_train_scaled, columns=X_train.columns), ax=ax_after)
ax_after.set_title('X_train After Scaling')

# Save only after both titles are set; saving earlier wrote a PNG that was
# missing the second title.
fig.savefig("scaling.png")

In [None]:
# Create and train model
n_features = X_train_scaled.shape[1]
n_labels = Y_train.shape[1]

# Two small ReLU hidden layers followed by one sigmoid output unit per label.
model = Sequential([
    Dense(6, input_dim=n_features, activation='relu'),
    Dense(3, activation='relu'),
    Dense(n_labels, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data during training.
model.fit(
    X_train_scaled,
    Y_train,
    validation_data=(X_test_scaled, Y_test),
    epochs=10,
    batch_size=16,
)

loss, accuracy = model.evaluate(X_test_scaled, Y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

In [None]:
# Make predictions on new data
# One row of feature values, in the same column order as X_train.
new_data = np.array([[100, 100, 6, 7, 148, 7, 19]])

# The scaler was fitted on a DataFrame, so pass the new row with the same
# column names; a bare array triggers scikit-learn's feature-name warning and
# silently relies on positional order.
new_data_frame = pd.DataFrame(new_data, columns=X_train.columns)
# NOTE(review): this row does not go through the outlier capping applied to
# the training features — confirm whether it should.
new_data_scaled = scaler.transform(new_data_frame)

# The sigmoid outputs are per-label probabilities, not labels.
predictions = model.predict(new_data_scaled)
# Threshold at 0.5 to obtain the 0/1 decision for each label.
predicted_labels = (predictions >= 0.5).astype(int)

print(f'Predicted Probabilities: {predictions}')
print(f'Predicted Labels: {predicted_labels}')