In [38]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [39]:
#Load the Data
data, meta = arff.loadarff(r'D:\ML\1year.arff')
df = pd.DataFrame(data)   # Convert the data into a pandas DataFrame

In [40]:
# Inspect Dataset
print("Column names in the dataset:")
print(df.columns)

print("Distribution of the target column:")
print(df['Attr64'].value_counts())

# Check for missing values
missing_data = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_data)

Column names in the dataset:
Index(['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6', 'Attr7', 'Attr8',
       'Attr9', 'Attr10', 'Attr11', 'Attr12', 'Attr13', 'Attr14', 'Attr15',
       'Attr16', 'Attr17', 'Attr18', 'Attr19', 'Attr20', 'Attr21', 'Attr22',
       'Attr23', 'Attr24', 'Attr25', 'Attr26', 'Attr27', 'Attr28', 'Attr29',
       'Attr30', 'Attr31', 'Attr32', 'Attr33', 'Attr34', 'Attr35', 'Attr36',
       'Attr37', 'Attr38', 'Attr39', 'Attr40', 'Attr41', 'Attr42', 'Attr43',
       'Attr44', 'Attr45', 'Attr46', 'Attr47', 'Attr48', 'Attr49', 'Attr50',
       'Attr51', 'Attr52', 'Attr53', 'Attr54', 'Attr55', 'Attr56', 'Attr57',
       'Attr58', 'Attr59', 'Attr60', 'Attr61', 'Attr62', 'Attr63', 'Attr64',
       'class'],
      dtype='object')
Distribution of the target column:
Attr64
5.35460      3
2.13970      3
11.72300     3
1.56960      3
1.92890      3
            ..
3.62500      1
1.75720      1
62.00100     1
0.51005      1
351.85000    1
Name: count, Length: 6639, dtyp

In [41]:

# Impute missing values 
df.fillna(df.mean(), inplace=True)

# One-hot encode categorical columns 
df = pd.get_dummies(df)

# Encode labels 
label_encoder = LabelEncoder()
df['Attr1'] = label_encoder.fit_transform(df['Attr1'])  

# Standardize numerical features
scaler = StandardScaler()
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


In [42]:
#Prepare the Data for Model Training
X = df.drop('Attr64', axis=1)  # Drop the target column 
y = df['Attr64']  # Target column

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'Support Vector Machine (SVM)': SVC(probability=True, class_weight='balanced', random_state=42)
}

In [44]:

# Evaluate the Models
# Ensure the target is binary
threshold = 0.5  
y_train = (y_train > threshold).astype(int)
y_test = (y_test > threshold).astype(int)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    
    # Classification report 
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")
    
    # Confusion Matrix
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    
    # ROC-AUC score 
    print(f"{name} ROC-AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]):.4f}")
    print("-" * 50)
    

Random Forest Accuracy: 0.9993
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1402
           1       1.00      0.75      0.86         4

    accuracy                           1.00      1406
   macro avg       1.00      0.88      0.93      1406
weighted avg       1.00      1.00      1.00      1406

Random Forest Confusion Matrix:
[[1402    0]
 [   1    3]]
Random Forest ROC-AUC: 0.9993
--------------------------------------------------
Logistic Regression Accuracy: 0.9922
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1402
           1       0.27      1.00      0.42         4

    accuracy                           0.99      1406
   macro avg       0.63      1.00      0.71      1406
weighted avg       1.00      0.99      0.99      1406

Logistic Regression Confusion Matrix:
[[1391   11]
 [   0 

In [45]:
# Hyperparameter grids for tuning
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],  
        'solver': ['liblinear', 'saga']
    },
    'Support Vector Machine (SVM)': {  
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

In [46]:
# GridSearch for each model
for name, model in models.items():
    print(f"\nTuning {name}...")
    grid_search = GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    
    # Evaluate the best model from GridSearchCV
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # Accuracy and other evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best {name} Accuracy: {accuracy:.4f}")
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"{name} ROC-AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]):.4f}")
    print("-" * 50)


Tuning Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Random Forest Accuracy: 0.9993
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1402
           1       1.00      0.75      0.86         4

    accuracy                           1.00      1406
   macro avg       1.00      0.88      0.93      1406
weighted avg       1.00      1.00      1.00      1406

Random Forest Confusion Matrix:
[[1402    0]
 [   1    3]]
Random Forest ROC-AUC: 0.9996
--------------------------------------------------

Tuning Logistic Regression...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Best Logistic Regression Accuracy: 0.9922
Logistic Regression Classification Report:
              precision 