In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

: 

In [None]:
# Implementing Logistic Regression from scratch
# Load the rice CSV straight from its hosted location into a DataFrame.
RICE_DATASET_URL = "https://download.mlcc.google.com/mledu-datasets/Rice_Cammeo_Osmancik.csv"
rice_dataset_raw = pd.read_csv(RICE_DATASET_URL)

In [None]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly; the preview stays as the cell's displayed value.
print(rice_dataset_raw.shape)
rice_dataset_raw.head(10)


In [None]:
# Logistic Regression
# Minimising log loss (binary cross-entropy)
# Feature matrix: the first seven columns of the raw table.
X = rice_dataset_raw.iloc[:, [0, 1, 2, 3, 4, 5, 6]].to_numpy()

# Encode the class label as 1 (Cammeo) / 0 (Osmancik).
# The guard keeps this cell idempotent: mapping an already-encoded column a
# second time would turn every label into NaN, because 0 and 1 are not keys
# of the mapping.
if not pd.api.types.is_numeric_dtype(rice_dataset_raw["Class"]):
    rice_dataset_raw["Class"] = rice_dataset_raw["Class"].map({"Cammeo": 1, "Osmancik": 0})

# Labels are taken from column 7 and kept 2-D, i.e. shape (n_samples, 1).
Y = rice_dataset_raw.iloc[:, [7]].to_numpy()

# Show the encoded label values (a bare expression in the middle of a cell is
# never displayed), then leave the correlation matrix as the cell output.
print(rice_dataset_raw["Class"].unique())
rice_dataset_raw.corr()

In [None]:
# Take the histogram titles from the DataFrame itself, so that plot i is
# always labelled with the real name of the column stored in X[:, i]
# instead of a hand-typed list that can drift from the data.
features = list(rice_dataset_raw.columns[:X.shape[1]])

# Row masks per class; Y is a column vector, so compare its only column.
cammeo_rows = Y[:, 0] == 1
osmancik_rows = Y[:, 0] == 0

# One figure per feature with the two class distributions overlaid.
for i, feature_name in enumerate(features):
    plt.figure()
    plt.hist(X[:, i][cammeo_rows], alpha=0.5, label="Cammeo")
    plt.hist(X[:, i][osmancik_rows], alpha=0.5, label="Osmancik")
    plt.title(feature_name)
    plt.xlabel(feature_name)
    plt.ylabel("Count")
    plt.legend()
    plt.show()

In [None]:
# Five pairwise feature scatter plots (one figure per pair), coloured by class.
feature_pairs = [
    ("Area", "Eccentricity"),
    ("Convex_Area", "Perimeter"),
    ("Major_Axis_Length", "Minor_Axis_Length"),
    ("Perimeter", "Extent"),
    ("Eccentricity", "Major_Axis_Length"),
]

# Boolean row selectors for the two encoded classes.
class_is_cammeo = rice_dataset_raw["Class"] == 1
class_is_osmancik = rice_dataset_raw["Class"] == 0

for x_axis_data, y_axis_data in feature_pairs:
    plt.figure()
    plt.xlabel(x_axis_data)
    plt.ylabel(y_axis_data)
    # Cammeo is drawn first and Osmancik second, so colours/legend order match.
    for class_rows, class_name in (
        (class_is_cammeo, "Cammeo"),
        (class_is_osmancik, "Osmancik"),
    ):
        plt.scatter(
            rice_dataset_raw.loc[class_rows, x_axis_data],
            rice_dataset_raw.loc[class_rows, y_axis_data],
            label=class_name,
            alpha=0.5,
        )
    plt.legend()
    plt.show()

In [None]:
# 3D scatter of three candidate features (Eccentricity, Area,
# Major_Axis_Length), one series per class, to judge how well they separate
# the two classes together.
# The import is unused by name; older Matplotlib releases needed it to
# register the "3d" projection, so it is kept for compatibility.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")

# Label each series so the legend identifies which cloud is which class.
for class_value, class_name in ((1, "Cammeo"), (0, "Osmancik")):
    class_rows = rice_dataset_raw["Class"] == class_value
    ax.scatter(
        rice_dataset_raw["Eccentricity"][class_rows],
        rice_dataset_raw["Area"][class_rows],
        rice_dataset_raw["Major_Axis_Length"][class_rows],
        label=class_name,
        alpha=0.5,
    )

ax.set_xlabel('Eccentricity')
ax.set_ylabel('Area')
ax.set_zlabel('Major Axis Length')
ax.legend()
plt.show()


In [None]:
# Z-score normalisation (no sklearn) of the three selected features, taken
# from raw columns 0, 2 and 4.
# NOTE(review): the mean/std below are computed over every row, including the
# rows that are later held out for validation and testing.
X = rice_dataset_raw.iloc[:, [0, 2, 4]].to_numpy()
column_mean = X.mean(axis=0)
column_std = X.std(axis=0)
X = (X - column_mean) / column_std

# Labels from column 7, kept as a column vector.
Y = rice_dataset_raw.iloc[:, [7]].to_numpy()
print(X.shape, Y.shape)

In [None]:
# Shuffle the data and split into training, validation, and test sets
# (70% / 15% / remainder). The seed makes the permutation reproducible.
np.random.seed(42)
shuffled_indices = np.random.permutation(len(X))
X_shuffled = X[shuffled_indices]
Y_shuffled = Y[shuffled_indices]

train_size = int(0.7 * len(X))
val_size = int(0.15 * len(X))
val_end = train_size + val_size

X_train, Y_train = X_shuffled[:train_size], Y_shuffled[:train_size]
X_val, Y_val = X_shuffled[train_size:val_end], Y_shuffled[train_size:val_end]
X_test, Y_test = X_shuffled[val_end:], Y_shuffled[val_end:]

# Standardise with statistics of the TRAINING rows only, so that no
# information from the validation/test rows leaks into the features the
# model is trained on. Z-scoring is an affine map, so this yields exactly the
# same values as standardising the raw features with training statistics,
# regardless of any scaling that was applied to X before the split.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
X_train = (X_train - train_mean) / train_std
X_val = (X_val - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Names of the model's input features, listed in the same order as the
# columns selected for X (raw columns 0, 2 and 4), so that entry i names the
# feature that weight w[i] multiplies.
input_features = [
    "Area",
    "Major_Axis_Length",
    "Eccentricity",
]

In [None]:
# Sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Values in [0, 1] with the same shape as ``z``.

    Notes
    -----
    Evaluated as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp`` so that a
    large negative ``z`` does not overflow ``exp(-z)``; the result simply
    underflows to 0.0 instead of emitting an overflow warning.
    """
    z = np.asarray(z)
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed stably.
    return np.exp(-np.logaddexp(0.0, -z))


# Prediction
def predict(X, w, b):
    """Return sigmoid(X·w + b), the model's predicted probabilities.

    X is the feature matrix, w the weight vector/column and b the bias; the
    linear scores are passed through ``sigmoid`` and returned as-is.
    """
    linear_scores = np.dot(X, w) + b
    probabilities = sigmoid(linear_scores)
    return probabilities

In [None]:
def train(X, y, lr=0.05, epochs=2000, log_every=200):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray with n_samples entries
        Target labels; reshaped internally to a column vector.
    lr : float, default 0.05
        Gradient-descent step size.
    epochs : int, default 2000
        Number of passes over the whole data set.
    log_every : int or None, default 200
        Print the log loss every ``log_every`` epochs (starting at epoch 0).
        Pass ``None`` or ``0`` to train silently.

    Returns
    -------
    w : ndarray of shape (n_features, 1)
        Learned weights.
    b : float
        Learned bias.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one label per row of ``X``.
    """
    n_samples, n_features = X.shape
    w = np.zeros((n_features, 1))
    b = 0.0

    y = y.reshape(-1, 1)
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X has {n_samples} rows but y has {y.shape[0]} labels"
        )

    for epoch in range(epochs):
        y_pred = sigmoid(np.dot(X, w) + b)

        # Gradient of the mean log loss w.r.t. the weights and the bias.
        error = y_pred - y
        dw = (1 / n_samples) * np.dot(X.T, error)
        db = (1 / n_samples) * np.sum(error)

        w -= lr * dw
        b -= lr * db

        if log_every and epoch % log_every == 0:
            # Loss of the predictions made *before* this epoch's update;
            # the 1e-8 keeps log() away from zero.
            loss = -(1 / n_samples) * np.sum(
                y * np.log(y_pred + 1e-8) + (1 - y) * np.log(1 - y_pred + 1e-8)
            )
            print(f"Epoch {epoch}: Loss = {loss:.4f}")

    return w, b

In [None]:
# Fit the model on the training split, then report the learned parameters.
w, b = train(X_train, Y_train, lr=0.05, epochs=2000)
print("Weights:", w.reshape(-1))
print("Bias:", b)

In [None]:
# Validation using the validation data
# Accuracy, precision, recall and F1 score at an adjustable decision threshold.
threshold = 0.35
Y_val_pred_prob = predict(X_val, w, b)
Y_val_pred = (Y_val_pred_prob >= threshold).astype(int)

# Confusion-matrix counts from element-wise masks.
val_actual_pos, val_actual_neg = Y_val == 1, Y_val == 0
val_called_pos, val_called_neg = Y_val_pred == 1, Y_val_pred == 0
TP = np.sum(val_actual_pos & val_called_pos)
TN = np.sum(val_actual_neg & val_called_neg)
FP = np.sum(val_actual_neg & val_called_pos)
FN = np.sum(val_actual_pos & val_called_neg)

accuracy = (TP + TN) / len(Y_val)

# Each ratio falls back to 0 when its denominator would be zero.
if (TP + FP) > 0:
    precision = TP / (TP + FP)
else:
    precision = 0
if (TP + FN) > 0:
    recall = TP / (TP + FN)
else:
    recall = 0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1 Score: {f1_score:.4f}")


In [None]:
# Evaluate the model on the test set, using the same decision threshold
# that is already held in `threshold`.
Y_test_pred_prob = predict(X_test, w, b)
Y_test_pred = (Y_test_pred_prob >= threshold).astype(int)

# Confusion-matrix counts.
test_truth_pos = Y_test == 1
test_truth_neg = Y_test == 0
test_guess_pos = Y_test_pred == 1
test_guess_neg = Y_test_pred == 0
TP = np.sum(test_truth_pos & test_guess_pos)
TN = np.sum(test_truth_neg & test_guess_neg)
FP = np.sum(test_truth_neg & test_guess_pos)
FN = np.sum(test_truth_pos & test_guess_neg)

accuracy = (TP + TN) / len(Y_test)

# Guard every ratio against an empty denominator (result is 0 in that case).
precision = 0
if (TP + FP) > 0:
    precision = TP / (TP + FP)
recall = 0
if (TP + FN) > 0:
    recall = TP / (TP + FN)
f1_score = 0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1_score:.4f}")


In [None]:
# Calculating ROC and AUC without sklearn

def compute_roc_auc(y_true, y_scores, n_thresholds=100):
    """Compute ROC curve points and the area under the curve (AUC).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 count as positives and
        entries equal to 0 as negatives. Any shape holding one value per
        sample is accepted (it is flattened).
    y_scores : array-like
        Predicted scores, one per sample (flattened as well). A sample is
        predicted positive when its score is >= the threshold.
    n_thresholds : int, default 100
        Number of evenly spaced thresholds in [0, 1] at which the curve is
        evaluated.

    Returns
    -------
    fpr_array : ndarray
        False-positive rates, sorted in non-decreasing order.
    tpr_array : ndarray
        True-positive rates aligned with ``fpr_array``.
    auc : float
        Trapezoidal area under the (FPR, TPR) curve.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_scores`` do not hold the same number of samples.
    """
    # Flatten both inputs: mixing an (n, 1) array with an (n,) array would
    # otherwise broadcast every comparison to an (n, n) matrix.
    y_true = np.asarray(y_true).ravel()
    y_scores = np.asarray(y_scores).ravel()
    if y_true.shape != y_scores.shape:
        raise ValueError("y_true and y_scores must contain the same number of samples")

    thresholds = np.linspace(0, 1, n_thresholds)
    tpr_list = []
    fpr_list = []

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    P = np.sum(actual_pos)
    N = np.sum(actual_neg)

    for threshold in thresholds:
        predicted_pos = y_scores >= threshold
        TP = np.sum(actual_pos & predicted_pos)
        FP = np.sum(actual_neg & predicted_pos)

        # Rates default to 0 when a class is absent from y_true.
        tpr_list.append(TP / P if P > 0 else 0)
        fpr_list.append(FP / N if N > 0 else 0)

    tpr_array = np.array(tpr_list)
    fpr_array = np.array(fpr_list)

    # Ensure FPR is increasing. Ties in FPR are ordered by TPR so that the
    # vertical segments of the curve stay monotone; sorting on FPR alone with
    # an unstable sort can scramble tied points and distort the area.
    sorted_indices = np.lexsort((tpr_array, fpr_array))
    fpr_array = fpr_array[sorted_indices]
    tpr_array = tpr_array[sorted_indices]

    # np.trapezoid is the NumPy >= 2.0 name; older releases only have np.trapz.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    auc = integrate(tpr_array, fpr_array)

    return fpr_array, tpr_array, auc
# ROC points and AUC for the test-set predictions.
fpr, tpr, auc = compute_roc_auc(Y_test, Y_test_pred_prob)
print(f"AUC: {auc:.4f}")

# Plotting ROC curve
plt.figure()
# The replacement field is kept on a single line: a line break inside
# `{...}` of a single-quoted f-string is a SyntaxError before Python 3.12.
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {auc:.4f})")
# Diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()


In [None]:
# TODO: plot the decision boundary of the trained model

