In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

df = pd.read_csv('labeled_data.csv')

df['Anomaly'] = df['Anomaly'].astype(float)

X = df.drop(['Anomaly'], axis=1)
y =df['Anomaly']

#calculating gini index
class_counts = df['Anomaly'].value_counts()
total_samples = len(df)
probabilities = class_counts / total_samples
gini = 1.0
for prob in probabilities:
    gini -= prob ** 2
print("gini index: ", gini)

X_train_ib, X_test, y_train_ib, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#applying smote to balance data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_ib, y_train_ib)

X_train = X_train_balanced
y_train = y_train_balanced

#did pruning by decreasing the tree depth from 29 to 18
model = DecisionTreeClassifier (max_depth=18, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report Decision Tree:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

#logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Repor Logistic Regression:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

#random forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report Random Forest:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

#KNN
knn_classifier = KNeighborsClassifier(n_neighbors=4)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)

print("Classification Report KNN:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

#naive bayes
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)
y_pred = naive_bayes_classifier.predict(X_test)

print("Classification Report Naive Bayes:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

gini index:  0.4451103879180728
Classification Report Decision Tree:
              precision    recall  f1-score   support

         0.0       0.77      0.86      0.81       293
         1.0       0.92      0.87      0.90       572

    accuracy                           0.87       865
   macro avg       0.85      0.86      0.85       865
weighted avg       0.87      0.87      0.87       865

Accuracy Score: 0.8658959537572254
Classification Repor Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.71      0.97      0.82       293
         1.0       0.98      0.80      0.88       572

    accuracy                           0.86       865
   macro avg       0.85      0.88      0.85       865
weighted avg       0.89      0.86      0.86       865

Accuracy Score: 0.8566473988439306


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report Random Forest:
              precision    recall  f1-score   support

         0.0       0.73      0.85      0.79       293
         1.0       0.92      0.84      0.88       572

    accuracy                           0.85       865
   macro avg       0.83      0.85      0.83       865
weighted avg       0.85      0.85      0.85       865

Accuracy Score: 0.8450867052023121
Classification Report KNN:
              precision    recall  f1-score   support

         0.0       0.62      0.91      0.74       293
         1.0       0.94      0.72      0.81       572

    accuracy                           0.78       865
   macro avg       0.78      0.82      0.78       865
weighted avg       0.83      0.78      0.79       865

Accuracy Score: 0.7838150289017342
Classification Report Naive Bayes:
              precision    recall  f1-score   support

         0.0       0.52      0.73      0.61       293
         1.0       0.82      0.66      0.73       572

    accuracy 