In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
from plots import plot_data, plot_decision_boundary
from metrics import evaluate_classification

%matplotlib inline


KeyboardInterrupt: 

In [14]:
def load_dataset(filename, data_columns, target_column):
    """Load dataset from CSV file.

    Args:
        filename (str): Path to CSV file.
        data_columns (list): List of column names for data.
        target_column (str): Name of target column.

    Returns:
        tuple: Tuple containing data and target.
    """
    
    df = pd.read_csv(filename)
    X, y = df[data_columns], df[target_column]
    return X.values, y.values

In [15]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iter=1000, tol=1e-4, verbose=False):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.theta = None
        self.mean = None
        self.std = None

    def normalize(self, X):
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        return (X - self.mean) / self.std

    def add_intercept(self, X):
        return np.column_stack((np.ones(X.shape[0]), X))

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def hypothesis(self, X, theta):
        return self.sigmoid(np.dot(X, theta))

    def cost_function(self, X, y, theta):
        m = len(y)
        h = self.hypothesis(X, theta)
        return (-1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

    def gradient(self, X, y, theta):
        m = len(y)
        return (1/m) * np.dot(X.T, (self.hypothesis(X, theta) - y))

    def gradient_descent(self, X, y):
        m, n = X.shape
        theta = np.zeros(n)
        costs = []

        for i in range(self.max_iter):
            theta -= self.learning_rate * self.gradient(X, y, theta)
            cost = self.cost_function(X, y, theta)
            costs.append(cost)
            if self.verbose and i % 100 == 0:
                print(f"Iteration {i}: Cost {cost}")
            if i > 0 and abs(costs[-1] - costs[-2]) < self.tol:
                break

        return theta, costs

    def fit(self, X, y):
        X = self.normalize(X)
        X = self.add_intercept(X)
        self.theta, self.costs = self.gradient_descent(X, y)

    def predict_proba(self, X):
        X = (X - self.mean) / self.std
        X = self.add_intercept(X)
        return self.hypothesis(X, self.theta)

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)


In [16]:
X, y = make_classification(n_samples = 200, n_classes = 2, n_features = 2, 
                           n_informative=2, n_redundant=0, random_state = 42,
                           flip_y=0.02, class_sep=0.8)
plot_data(X, y)

NameError: name 'plot_data' is not defined

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
lr = LogisticRegression(standardize=True, learning_rate=0.01, max_iter=1000, tol=1e-4, verbose=True)

TypeError: LogisticRegression.__init__() got an unexpected keyword argument 'standardize'

In [19]:
lr.fit(X_train, y_train)

Iteration 0: Cost 0.6921363335378605
Iteration 100: Cost 0.6130846701491595
Iteration 200: Cost 0.5643761426137045
Iteration 300: Cost 0.5328717909721297
Iteration 400: Cost 0.5114832641163253
Iteration 500: Cost 0.49634877623337864


In [20]:
y_test_pred = lr.predict(X_test)

In [21]:
accuracy, report, confusion = evaluate_classification(y_test, y_test_pred)
print(f"Accuracy: \n{accuracy}")
print(f"Report: \n{report}")
print(f"Confusion: \n{confusion}")
plot_decision_boundary(lr, X_test, y_test)
plt.show()

NameError: name 'evaluate_classification' is not defined

In [22]:
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
plot_data(X, y)

NameError: name 'plot_data' is not defined

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LogisticRegression(standardize=True, learning_rate=0.01, max_iter=1000, tol=1e-4, verbose=True)
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)

accuracy, report, confusion = evaluate_classification(y_test, y_test_pred)
print(f"Accuracy: \n{accuracy}")
print(f"Report: \n{report}")
print(f"Confusion: \n{confusion}")
plot_decision_boundary(lr, X_test, y_test)
plt.show()

NameError: name 'train_test_split' is not defined

Evaluate the above LogisticRegression class on datasets sats.csv and tests.csv. Consider using polynomial features when applicable.

In [None]:
data_columns = ["exam1", "exam2"]
target_column = "submitted"
X, y = load_dataset('../../Data/Classification/sats.csv', data_columns, target_column)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
lr = LogisticRegression(standardize=True, learning_rate=0.01, max_iter=1000, tol=1e-4, verbose=True)
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)

accuracy, report, confusion = evaluate_classification(y_test, y_test_pred)
print(f"Accuracy: \n{accuracy}")
print(f"Report: \n{report}")
print(f"Confusion: \n{confusion}")
plot_decision_boundary(lr, X_test, y_test)
plt.show()