In [1]:
from model import LogisticRegression
from dataloader import Dataloader
import numpy as np

In [2]:
expl_columns = ['Best Hand', 'Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts',
				'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 
				'Potions', 'Care of Magical Creatures', 'Charms', 'Flying']
target_col = 'Hogwarts House'

filename = "../datasets/dataset_train.csv"

datas = Dataloader(filename=filename, 
			expl_columns=expl_columns, 
			target_col=target_col,
			isIndex=True)


In [3]:
print(len(datas._dataframe.columns), len(datas._classes))
model = LogisticRegression(len(datas._dataframe.columns), len(datas._classes))

14 4


In [4]:
print(model.weights.shape)
print(model.bias.shape)

(4, 14)
(4,)


In [5]:
print(model.bias.shape)
print(model.weights.shape)

(4,)
(4, 14)


In [6]:
X = np.array(datas._dataframe)

In [7]:
X.shape

(1600, 14)

In [8]:
def _sigmoid(x):
		return np.array([_sigmoid_function(value) for value in x])

def _sigmoid_function(x):
	def exp_trick(x):
		if x >= 0:
			z = np.exp(-x)
			return 1 / (1 + z)
		else:
			z = np.exp(x)
			return z / (1 + z)
	if isinstance(x, np.ndarray):
		return [exp_trick(val) for val in x]
	else:
		return exp_trick(x)

In [9]:
res = np.matmul(X, model.weights.T) + model.bias
res.shape


(1600, 4)

In [10]:
activated = _sigmoid(res)
print(activated.shape)

(1600, 4)


In [11]:
target_matrix = np.array(datas._target_matrix)

In [12]:
target_matrix.shape

(1600, 4)

In [13]:
def compute_loss(y_true, y_pred):
	# binary cross entropy
	y_zero_loss = y_true * np.log(y_pred + 1e-9)
	y_one_loss = (1-y_true) * np.log(1 - y_pred + 1e-9)
	print('DEBUG:', y_one_loss.shape, y_zero_loss.shape)
	return -np.mean(y_zero_loss + y_one_loss, axis=0)

In [14]:
loss = compute_loss(y_true=target_matrix, y_pred=activated)

DEBUG: (1600, 4) (1600, 4)


In [15]:
difference = (target_matrix - activated)
np.mean(difference, axis=0)

array([-0.223125, -0.311875, -0.295625, -0.169375])

In [16]:
difference.shape

(1600, 4)

In [17]:
def compute_gradients(x, y_true, y_pred):
	# derivative of binary cross entropy
	difference =  y_pred - y_true
	gradient_b = np.mean(difference, axis=0)
	gradients_w = np.matmul(x.T, difference)
	print('DEBUG:', gradients_w.shape)
	gradients_w = np.array([grad for grad in gradients_w])

	return gradients_w, gradient_b

In [18]:
y_true = target_matrix[:, 0]
y_true = y_true.reshape(y_true.shape[0], 1)
y_true.shape

y_pred = activated[:, 0]
y_pred = y_pred.reshape(y_pred.shape[0], 1)
y_pred.shape

grad_w, grad_b = compute_gradients(X, y_true, y_pred)

DEBUG: (14, 1)


In [19]:
grad_w, grad_b = compute_gradients(X, target_matrix, activated)

print(grad_w.shape, grad_b.shape)


DEBUG: (14, 4)
(14, 4) (4,)


In [20]:
from sklearn.metrics import accuracy_score

def sanitize_pred(x):
	if x > 0.5: 
		return 1
	return 0

train_accuracies = []
pred_to_class = np.vectorize(sanitize_pred)(activated)
pred_to_class.shape

(1600, 4)

In [21]:
def z_normalize(data):
	mean = np.mean(data, axis=0)
	std = np.std(data, axis=0, ddof=1)  # Use ddof=1 for sample standard deviation
	normalized_data = (data - mean) / std
	
	return normalized_data, mean, std

In [22]:
from dataloader import Dataloader
from model import LogisticRegression
import numpy as np

expl_columns = ['Best Hand', 'Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts',
				'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 
				'Potions', 'Care of Magical Creatures', 'Charms', 'Flying']
target_col = 'Hogwarts House'

filename = "../datasets/dataset_train.csv"

datas = Dataloader(filename=filename, 
			expl_columns=expl_columns, 
			target_col=target_col,
			isIndex=True)

model = LogisticRegression(len(datas._dataframe.columns), len(datas._classes))

In [23]:
X = np.array(datas._dataframe)
print(X.shape)
X, mean, std = z_normalize(X)
print(X.shape)
y = np.array(datas._target_matrix)
class_name = datas._classes
print(mean.shape, std.shape)
norm_param = np.vstack((mean, std))
print(norm_param.shape)

(1600, 14)
(1600, 14)
(14,) (14,)
(2, 14)


In [24]:
model.fit(X, y, 200, class_name)


LOG: epoch 20/200
Losses:
Ravenclaw : 0.2506286590507576
Slytherin : 0.3002657574710931
Gryffindor : 0.25350403473521516
Hufflepuff : 0.2894594465078668
LOG: epoch 40/200
Losses:
Ravenclaw : 0.1663644448288321
Slytherin : 0.19592480099367124
Gryffindor : 0.16591294692364467
Hufflepuff : 0.18608489264505554
LOG: epoch 60/200
Losses:
Ravenclaw : 0.13323780779183775
Slytherin : 0.1501432333728898
Gryffindor : 0.12879812261020157
Hufflepuff : 0.1438004664320321
LOG: epoch 80/200
Losses:
Ravenclaw : 0.11592916752157531
Slytherin : 0.12473233137737699
Gryffindor : 0.10848420715061735
Hufflepuff : 0.12130069386950465
LOG: epoch 100/200
Losses:
Ravenclaw : 0.10542906235043142
Slytherin : 0.10868757431754038
Gryffindor : 0.0957518903716725
Hufflepuff : 0.10749101593084086
LOG: epoch 120/200
Losses:
Ravenclaw : 0.09843730064428356
Slytherin : 0.09769302714420551
Gryffindor : 0.08706040044909281
Hufflepuff : 0.09822364095991648
LOG: epoch 140/200
Losses:
Ravenclaw : 0.09347345705342802
Slytherin 

In [25]:
import plotly.graph_objects as go

def plot_losses(loss_dicts):
	'''
	loss_dicts is of type:
		[{
		"Ravenclaw" : float,
		"Slytherin" : float,
		"Gryffindor" : float,
		"Hufflepuff" : float
		}, ...]
	'''
	epochs = list(range(1, len(loss_dicts) + 1))  # X-axis (number of epochs)
	loss_keys = loss_dicts[0].keys()  # Extract loss names from the first dictionary
	
	# Initialize figure
	fig = go.Figure()
	
	# Add each loss as a separate line
	for key in loss_keys:
		loss_values = [d[key] for d in loss_dicts]  # Extract loss values for each epoch
		fig.add_trace(go.Scatter(x=epochs, y=loss_values, mode='lines', name=key))
	
	# Customize layout
	fig.update_layout(
		title='Loss Values Over Epochs',
		xaxis_title='Epochs',
		yaxis_title='Loss Value',
		template='plotly_dark',
		legend_title='Loss Types'
	)
	return fig

def plot_accuracy(acc_dicts):
	"""
	acc_dict is of type:
		'''
		[{
		"Ravenclaw" : float,
		"Slytherin" : float,
		"Gryffindor" : float,
		"Hufflepuff" : float
		}, ...]
	'''
	"""
	epochs = list(range(1, len(acc_dicts) + 1))  # X-axis (number of epochs)
	acc_keys = acc_dicts[0].keys()  # Extract accuracy keys

	# Initialize figure
	fig = go.Figure()

	# Add each accuracy score as a separate line
	for key in acc_keys:
		acc_values = [d[key] for d in acc_dicts]  # Extract accuracy values for each epoch
		fig.add_trace(go.Scatter(x=epochs, y=acc_values, mode='lines+markers', name=key))

	# Customize layout
	fig.update_layout(
		title='Accuracy Score Over Epochs',
		xaxis_title='Epochs',
		yaxis_title='Accuracy Score',
		template='plotly_dark',
		legend_title='Accuracy Metrics'
	)
	return fig

In [26]:
ret = plot_losses(model.losses)
ret

In [27]:
ret.write_html("../viz/losses_per_houses.html")

In [28]:
ret = plot_accuracy(model.train_accuracies)
ret

In [29]:
ret.write_html("../viz/losses_per_houses.html")


In [30]:
X = X[0:5,:]

print(X.shape)

(5, 14)


In [31]:
ret = model.predict(X)

In [32]:
(ret.shape)

(5, 4)

In [33]:


# import libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
 
# load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target
 
# split the data into training and testing sets
X_train, X_test,\
y_train, y_test = train_test_split(X, y,
                                   test_size=0.2,
                                   random_state=42)

In [34]:
print(iris.feature_names)
print(iris.target_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [35]:
X
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [36]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [37]:
from dataloader import Dataloader
import pandas as pd


def z_normalize(data):
	mean = np.mean(data, axis=0)
	std = np.std(data, axis=0, ddof=1)  # Use ddof=1 for sample standard deviation
	normalized_data = (data - mean) / std
	
	return normalized_data, mean, std

expl_columns = ['Best Hand', 'Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts',
				'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 
				'Potions', 'Care of Magical Creatures', 'Charms', 'Flying']
target_col = 'Hogwarts House'

datas = Dataloader(filename="../datasets/dataset_train.csv", 
			expl_columns=expl_columns, 
			target_col=target_col,
			isIndex=True)

datas_test = Dataloader(filename="../datasets/dataset_test.csv", 
			expl_columns=expl_columns, 
			target_col=None,
			isIndex=True)


In [38]:
X = np.array(datas._dataframe)
X, means, stds = z_normalize(X)

X_test = np.array(datas_test._dataframe)
X_test = (X_test - means) / stds

In [39]:
from sklearn.preprocessing import LabelEncoder
test = pd.DataFrame({"house": datas.classes_original})
y = test["house"]
print(y)
encoder = LabelEncoder()
y = encoder.fit_transform(y)  

Index
0        Ravenclaw
1        Slytherin
2        Ravenclaw
3       Gryffindor
4       Gryffindor
           ...    
1595    Gryffindor
1596     Slytherin
1597    Gryffindor
1598    Hufflepuff
1599    Hufflepuff
Name: house, Length: 1600, dtype: object


In [40]:
test["lalala"] = list(y)

In [41]:
test

Unnamed: 0_level_0,house,lalala
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Ravenclaw,2
1,Slytherin,3
2,Ravenclaw,2
3,Gryffindor,0
4,Gryffindor,0
...,...,...
1595,Gryffindor,0
1596,Slytherin,3
1597,Gryffindor,0
1598,Hufflepuff,1


In [42]:
ovr_logreg = LogisticRegression(multi_class='ovr',
								solver='liblinear')

class_name = {0:"Gryffindor", 1:"Hufflepuff", 2:"Ravenclaw", 3:"Slytherin"}

In [43]:
ovr_logreg.fit(X, y)
y_pred_ovr = ovr_logreg.predict(X_test)

In [44]:
y_pred_ovr.shape

(400,)

In [45]:

def predictions_to_dataframe(predictions, class_dict):
	df = pd.DataFrame({'label': predictions})  
	df['name'] = df['label'].map(class_dict) 
	return df

predictions_to_dataframe(y_pred_ovr, class_name)

Unnamed: 0,label,name
0,1,Hufflepuff
1,2,Ravenclaw
2,0,Gryffindor
3,1,Hufflepuff
4,1,Hufflepuff
...,...,...
395,3,Slytherin
396,1,Hufflepuff
397,1,Hufflepuff
398,2,Ravenclaw
