<a href="https://colab.research.google.com/github/KeerthanaSistla/ML/blob/main/MLAssignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install packages if not already installed
# !pip install imbalanced-learn scikit-learn pandas numpy

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.datasets import make_classification

# ================================
# Function to calculate performance metrics
# ================================
def evaluate_model(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()

    acc_plus = recall_score(y_true, y_pred)  # minority class recall
    acc_minus = TN / (TN + FP)               # majority class accuracy
    precision = precision_score(y_true, y_pred)
    f_measure = f1_score(y_true, y_pred)
    g_mean = np.sqrt(acc_plus * acc_minus)
    wt_accuracy = (acc_plus + acc_minus) / 2

    return {
        "Acc+": round(acc_plus*100,2),
        "Acc-": round(acc_minus*100,2),
        "Precision": round(precision*100,2),
        "F-measure": round(f_measure*100,2),
        "G-mean": round(g_mean*100,2),
        "Wt. Accuracy": round(wt_accuracy*100,2)
    }

# ================================
# Dataset configuration
# ================================
datasets_info = [
    {"name": "Oil", "samples": 937, "features": 50, "minority_ratio": 41/937},
    {"name": "Mammography", "samples": 11183, "features": 6, "minority_ratio": 260/11183},
    {"name": "Satimage", "samples": 6435, "features": 36, "minority_ratio": 0.097},
    {"name": "Hypothyroid", "samples": 2520, "features": 24, "minority_ratio": 120/2520},
    {"name": "Euthyroid", "samples": 2640, "features": 24, "minority_ratio": 240/2640},
    {"name": "KDD Thrombin", "samples": 2543, "features": 100, "minority_ratio": 190/2543},
]

# ================================
# Loop over each dataset
# ================================
for info in datasets_info:
    print(f"\n=== Dataset: {info['name']} ===")

    # Generate synthetic imbalanced dataset for demonstration
    X, y = make_classification(n_samples=info["samples"],
                               n_features=info["features"],
                               n_informative=int(info["features"]/3),
                               n_redundant=int(info["features"]/6),
                               n_classes=2,
                               weights=[1-info["minority_ratio"], info["minority_ratio"]],
                               flip_y=0,
                               random_state=42)

    # Apply oversampling
    smote = SMOTE(sampling_strategy=0.5, random_state=42)  # generate minority to 50% of majority
    X_res, y_res = smote.fit_resample(X, y)

    # Weighted Random Forest
    majority_count = sum(y_res==0)
    minority_count = sum(y_res==1)
    class_weights = {0:1, 1:majority_count/minority_count}

    wrf = RandomForestClassifier(n_estimators=200, class_weight=class_weights, random_state=42)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, stratify=y_res, test_size=0.3, random_state=42)
    wrf.fit(X_train, y_train)

    # Predict probabilities
    y_prob = wrf.predict_proba(X_test)[:,1]

    # Threshold tuning to maximize G-mean
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_gmean = 0
    best_threshold = 0.5
    best_pred = None

    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        metrics = evaluate_model(y_test, y_pred)
        if metrics["G-mean"] > best_gmean:
            best_gmean = metrics["G-mean"]
            best_threshold = t
            best_pred = y_pred

    # Final metrics
    final_metrics = evaluate_model(y_test, best_pred)
    print(f"Best Threshold: {best_threshold}")
    print("Performance Metrics:")
    for k,v in final_metrics.items():
        print(f"{k}: {v}")


=== Dataset: Oil ===
Best Threshold: 0.3500000000000001
Performance Metrics:
Acc+: 100.0
Acc-: 99.63
Precision: 99.26
F-measure: 99.63
G-mean: 99.81
Wt. Accuracy: 99.81

=== Dataset: Mammography ===
Best Threshold: 0.40000000000000013
Performance Metrics:
Acc+: 97.93
Acc-: 97.22
Precision: 94.63
F-measure: 96.25
G-mean: 97.57
Wt. Accuracy: 97.57

=== Dataset: Satimage ===
Best Threshold: 0.45000000000000007
Performance Metrics:
Acc+: 95.87
Acc-: 98.51
Precision: 96.98
F-measure: 96.42
G-mean: 97.18
Wt. Accuracy: 97.19

=== Dataset: Hypothyroid ===
Best Threshold: 0.40000000000000013
Performance Metrics:
Acc+: 100.0
Acc-: 98.89
Precision: 97.83
F-measure: 98.9
G-mean: 99.44
Wt. Accuracy: 99.44

=== Dataset: Euthyroid ===
Best Threshold: 0.40000000000000013
Performance Metrics:
Acc+: 95.28
Acc-: 97.22
Precision: 94.49
F-measure: 94.88
G-mean: 96.25
Wt. Accuracy: 96.25

=== Dataset: KDD Thrombin ===
Best Threshold: 0.40000000000000013
Performance Metrics:
Acc+: 98.87
Acc-: 99.43
Precisio

In [11]:
# Install packages if not already installed
# !pip install imbalanced-learn scikit-learn pandas numpy

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# ================================
# Function to calculate performance metrics
# ================================
def evaluate_model(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()

    acc_plus = recall_score(y_true, y_pred)  # minority class recall
    acc_minus = TN / (TN + FP)               # majority class accuracy
    precision = precision_score(y_true, y_pred)
    f_measure = f1_score(y_true, y_pred)
    g_mean = np.sqrt(acc_plus * acc_minus)
    wt_accuracy = (acc_plus + acc_minus) / 2

    return {
        "Acc+": round(acc_plus*100,2),
        "Acc-": round(acc_minus*100,2),
        "Precision": round(precision*100,2),
        "F-measure": round(f_measure*100,2),
        "G-mean": round(g_mean*100,2),
        "Wt. Accuracy": round(wt_accuracy*100,2)
    }

# ================================
# Load Mammographic Masses dataset
# ================================
# Column names from the .names file
cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

df = pd.read_csv('/content/mammographic_masses.data', names=cols, na_values='?')

# Drop rows with missing values
df = df.dropna()

# Features and target
X = df[['Age', 'Shape', 'Margin', 'Density']]
y = df['Severity']

# Ensure target is binary int
y = y.astype(int)

# Check class distribution first
print("Original class counts:")
print(y.value_counts())

# ================================
# Apply SMOTE for balancing
# ================================
# Generate minority samples until classes are balanced
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print("Resampled class counts:")
print(pd.Series(y_res).value_counts())


# ================================
# Weighted Random Forest
# ================================
majority_count = sum(y_res==0)
minority_count = sum(y_res==1)
class_weights = {0:1, 1:majority_count/minority_count}

wrf = RandomForestClassifier(n_estimators=200, class_weight=class_weights, random_state=42)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, stratify=y_res, test_size=0.3, random_state=42
)
wrf.fit(X_train, y_train)

# Predict probabilities
y_prob = wrf.predict_proba(X_test)[:,1]

# Threshold tuning to maximize G-mean
thresholds = np.arange(0.1, 0.9, 0.05)
best_gmean = 0
best_threshold = 0.5
best_pred = None

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    metrics = evaluate_model(y_test, y_pred)
    if metrics["G-mean"] > best_gmean:
        best_gmean = metrics["G-mean"]
        best_threshold = t
        best_pred = y_pred

# Final metrics
final_metrics = evaluate_model(y_test, best_pred)
print(f"Best Threshold: {best_threshold}")
print("Performance Metrics:")
for k,v in final_metrics.items():
    print(f"{k}: {v}")

Original class counts:
Severity
0    427
1    403
Name: count, dtype: int64
Resampled class counts:
Severity
1    427
0    427
Name: count, dtype: int64
Best Threshold: 0.5000000000000001
Performance Metrics:
Acc+: 83.59
Acc-: 79.07
Precision: 79.85
F-measure: 81.68
G-mean: 81.3
Wt. Accuracy: 81.33


#Try

In [12]:
import pandas as pd
import numpy as np

# Load data
data_path = '/content/mammographic_masses.data'
columns = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
df = pd.read_csv(data_path, names=columns, na_values='?')

# Check the first few rows
print(df.head())

# Check missing values
print(df.isna().sum())


   BI-RADS   Age  Shape  Margin  Density  Severity
0      5.0  67.0    3.0     5.0      3.0         1
1      4.0  43.0    1.0     1.0      NaN         1
2      5.0  58.0    4.0     5.0      3.0         1
3      4.0  28.0    1.0     1.0      3.0         0
4      5.0  74.0    1.0     5.0      NaN         1
BI-RADS      2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64


In [13]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# Encode categorical features
cat_cols = ['BI-RADS', 'Shape', 'Margin', 'Density']
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop('Severity', axis=1)
y = df['Severity']
print(X.head())
print(y.value_counts())


   BI-RADS   Age  Shape  Margin  Density
0        4  67.0      2       4        2
1        3  43.0      0       0        2
2        4  58.0      3       4        2
3        3  28.0      0       0        2
4        4  74.0      0       4        2
Severity
0    516
1    445
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [21]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)  # balance to equal classes
X_res, y_res = smote.fit_resample(X, y)

# Check new counts
print(pd.Series(y_res).value_counts())


Severity
1    516
0    516
Name: count, dtype: int64


In [22]:
# Compute current ratio
minority_count = sum(y==1)
majority_count = sum(y==0)
current_ratio = minority_count / majority_count
print("Current minority/majority ratio:", current_ratio)

# Set slightly higher ratio
smote = SMOTE(sampling_strategy=max(1.0, current_ratio), random_state=42)
X_res, y_res = smote.fit_resample(X, y)


Current minority/majority ratio: 0.8624031007751938


In [17]:
print(y.value_counts())


Severity
0    516
1    445
Name: count, dtype: int64


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, stratify=y_res, test_size=0.3, random_state=42
)


In [24]:
from sklearn.ensemble import RandomForestClassifier

majority_count = sum(y_res==0)
minority_count = sum(y_res==1)
class_weights = {0:1, 1:majority_count/minority_count}

wrf = RandomForestClassifier(
    n_estimators=300,
    class_weight=class_weights,
    random_state=42,
    max_depth=6
)
wrf.fit(X_train, y_train)


In [25]:
from sklearn.metrics import confusion_matrix, recall_score, f1_score, precision_score

def evaluate_model(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()
    acc_plus = recall_score(y_true, y_pred)
    acc_minus = TN / (TN + FP)
    precision = precision_score(y_true, y_pred)
    f_measure = f1_score(y_true, y_pred)
    g_mean = np.sqrt(acc_plus * acc_minus)
    wt_accuracy = (acc_plus + acc_minus) / 2
    return {
        "Acc+": round(acc_plus*100,2),
        "Acc-": round(acc_minus*100,2),
        "Precision": round(precision*100,2),
        "F-measure": round(f_measure*100,2),
        "G-mean": round(g_mean*100,2),
        "Wt. Accuracy": round(wt_accuracy*100,2)
    }

y_prob = wrf.predict_proba(X_test)[:,1]
thresholds = np.arange(0.1, 0.9, 0.01)

best_gmean = 0
best_threshold = 0.5
best_pred = None

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    metrics = evaluate_model(y_test, y_pred)
    if metrics["G-mean"] > best_gmean:
        best_gmean = metrics["G-mean"]
        best_threshold = t
        best_pred = y_pred

final_metrics = evaluate_model(y_test, best_pred)
print(f"Best Threshold: {best_threshold}")
print("Performance Metrics:")
for k,v in final_metrics.items():
    print(f"{k}: {v}")


Best Threshold: 0.5099999999999998
Performance Metrics:
Acc+: 86.45
Acc-: 77.42
Precision: 79.29
F-measure: 82.72
G-mean: 81.81
Wt. Accuracy: 81.94
