<a href="https://colab.research.google.com/github/MIKBRUNO/TPNS/blob/main/lab2/custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://education.yandex.ru/handbook/ml/article/pervoe-znakomstvo-s-polnosvyaznymi-nejrosetyami
# https://education.yandex.ru/handbook/ml/article/metriki-klassifikacii-i-regressii
# https://archive.ics.uci.edu/dataset/73/mushroom

In [None]:
import pandas as pd

In [None]:
from math import log2
def entropy(feature: pd.Series) -> float:
  """Return the Shannon entropy (in bits) of the value distribution of `feature`.

  Args:
    feature: Series whose distinct values are treated as classes.

  Returns:
    -sum(p * log2(p)) over every class that occurs at least once, where p is
    the class count divided by len(feature). An empty series yields 0.0.
  """
  total = len(feature)
  if total == 0:
    return 0.0
  # value_counts() on a categorical Series also reports categories that never
  # occur (count 0); they are skipped because log2(0) is undefined. Every class
  # that does occur contributes to the entropy, however rare it is.
  probabilities = [n / total for n in feature.value_counts() if n > 0]
  return sum(-p * log2(p) for p in probabilities)

In [None]:
def information_gain(data: pd.DataFrame, target: str, feature: str) -> float:
  """Information gain on `target` obtained by splitting `data` on `feature`.

  Starts from the entropy of the whole target column and subtracts, for each
  distinct value of `feature`, the entropy of the matching target rows
  weighted by the share of rows that value covers.
  """
  labels = data[target]
  total_rows = len(labels)
  remaining = entropy(labels)
  for value in data[feature].unique():
    matching = data[data[feature] == value][target]
    remaining -= entropy(matching) * (len(matching) / total_rows)
  return remaining

In [None]:
from math import log2
def gain_ratio(data: pd.DataFrame, target: str, feature: str) -> float:
  """Gain ratio of `feature` with respect to `target`.

  The gain ratio is information_gain(data, target, feature) divided by the
  intrinsic (split) information of `feature`, i.e. the entropy of the
  feature's own value distribution.

  Args:
    data: frame holding both columns.
    target: name of the class column.
    feature: name of the column being scored.

  Returns:
    The gain ratio, or 0.0 when the split information is zero (the feature
    has at most one distinct value, so it cannot split the data).
  """
  total = len(data[target])
  intr_info = 0.0
  for unique in data[feature].unique():
    weight = (data[feature] == unique).sum() / total
    # unique() can return NaN, and `== NaN` matches no row: skip such empty
    # partitions instead of evaluating log2(0).
    if weight > 0:
      intr_info -= weight * log2(weight)
  if intr_info == 0:
    # A constant feature carries no split information; avoid dividing by zero.
    return 0.0
  return information_gain(data, target, feature) / intr_info

## MyModel

In [None]:
import numpy as np

In [None]:
mytype = np.float32

In [None]:
def reluf(x):
  """Scalar ReLU: the larger of 0 and `x`, returned as a NumPy scalar."""
  return np.array([0, x]).max()

def relu(x):
  """Element-wise ReLU of a NumPy array; the result keeps x's shape and dtype."""
  floor = np.zeros(x.shape, dtype=x.dtype)
  return np.maximum(floor, x)

def linear(x: mytype) -> mytype:
  """Identity activation: hands the input back untouched."""
  output = x
  return output

def sigmoid(x: mytype) -> mytype:
  """Logistic sigmoid 1 / (1 + exp(-x)), evaluated through NumPy's exp."""
  decay = np.exp(-x)
  return 1 / (1 + decay)

def mse(y_pred, y):
  """Mean squared error: the mean over all elements of (y - y_pred) squared."""
  residual = y - y_pred
  squared = np.power(residual, 2)
  return np.mean(squared)

In [None]:
class Differentiable:
  """Bundles a function with its derivative so the two travel together."""

  def __init__(self, f, df):
    # f: the function itself; df: a callable that evaluates its derivative.
    self.__f__, self.__df__ = f, df

  def f(self, *arg):
    """Evaluate the wrapped function on the given positional arguments."""
    function = self.__f__
    return function(*arg)

  def df(self, *arg):
    """Evaluate the wrapped derivative on the given positional arguments."""
    derivative = self.__df__
    return derivative(*arg)

def _relu_derivative(x):
  """Derivative of ReLU: 1 where x >= 0 and 0 elsewhere, for input of any shape."""
  return np.where(x >= 0, 1, 0)

def _sigmoid_derivative(x):
  """Derivative of the sigmoid, s * (1 - s), evaluating the sigmoid only once."""
  s = sigmoid(x)
  return s * (1 - s)

def _mse_gradient(y_pred, y):
  """Gradient of mse(y_pred, y) with respect to y_pred.

  mse averages the squared error over all elements, so every partial
  derivative carries a 1/n factor; for a single output this equals
  2 * (y_pred - y).
  """
  diff = y_pred - y
  return 2 * diff / np.size(diff)

# Activation functions paired with their derivatives.
drelu = Differentiable(relu, _relu_derivative)
dsigmoid = Differentiable(sigmoid, _sigmoid_derivative)
dlinear = Differentiable(linear, lambda x: np.ones(x.shape))

# Loss function paired with its gradient with respect to the prediction.
dmse = Differentiable(mse, _mse_gradient)

In [None]:
class MyNeuron:
  """A single neuron: a weight vector, a scalar bias and an activation function."""

  def __init__(self, input_dim: int, activation=reluf):
    self.activation = activation
    # Weights are drawn before the bias, both uniformly from [0, 1).
    self.weights = np.random.rand(input_dim)
    bias_draw = np.random.rand(1)
    self.bias = bias_draw[0]

  def get_weights(self) -> np.array:
    """Return the neuron's weight vector."""
    return self.weights

  def get_bias(self) -> np.array:
    """Return the neuron's bias."""
    return self.bias

  def feed(self, vec: np.array) -> mytype:
    """Apply the activation to dot(vec, weights) + bias."""
    pre_activation = np.dot(vec, self.weights) + self.bias
    return self.activation(pre_activation)


In [None]:
class MyLayer:
  """Fully connected layer made of `k` MyNeuron objects sharing one activation.

  `activation` must expose f (the function) and df (its derivative).
  """

  def __init__(self, input_dim: int, k: int, activation=drelu):
    self.activation = activation
    self.neurons = [MyNeuron(input_dim) for _ in range(k)]

  def get_weights(self) -> np.array:
    """Stack the neurons' weight vectors into one array, one row per neuron."""
    rows = []
    for neuron in self.neurons:
      rows.append(neuron.get_weights())
    return np.array(rows)

  def get_bias(self) -> np.array:
    """Collect the neurons' biases into one array, one entry per neuron."""
    values = []
    for neuron in self.neurons:
      values.append(neuron.get_bias())
    return np.array(values)

  def set_weights(self, nw: np.array):
    """Assign the i-th entry of `nw` as the weights of the i-th neuron."""
    for neuron, new_weights in zip(self.neurons, nw):
      neuron.weights = new_weights

  def set_bias(self, nb: np.array):
    """Assign the i-th entry of `nb` as the bias of the i-th neuron."""
    for neuron, new_bias in zip(self.neurons, nb):
      neuron.bias = new_bias

  def feed(self, vec: np.array, return_z=False):
    """Compute the layer output for input `vec`.

    With return_z=True the result is the pair (activation.f(z), z), where
    z = weights @ vec + bias is the pre-activation; otherwise only the
    activated output is returned.
    """
    z = np.matmul(self.get_weights(), vec) + self.get_bias()
    activated = self.activation.f(z)
    return (activated, z) if return_z else activated


In [None]:
class MyModel:
  """Feed-forward network: an ordered list of layers trained by mini-batch gradient descent.

  Every layer is expected to provide feed(vec, return_z=False), get_weights(),
  get_bias(), set_weights(), set_bias() and an `activation` exposing f and df,
  as MyLayer does.
  """

  def __init__(self, layers: list):
    self.layers = layers

  def feed(self, vec: np.array) -> np.array:
    """Pass `vec` through every layer in order and return the last layer's output."""
    for l in self.layers:
      vec = l.feed(vec)
    return vec

  def eval(self, x, y, loss=dmse) -> dict:
    """Average `loss.f` over the paired samples of `x` and `y`.

    Returns:
      {'loss': summed per-sample loss divided by len(x)}.
    """
    total = 0
    for sample, expected in zip(x, y):
      total += loss.f(self.feed(sample), expected)
    return {'loss': total / len(x)}

  def backprop(self, x: np.array, y: np.array, loss=dmse) -> dict:
    """Compute the gradients of `loss` for a single sample by backpropagation.

    Args:
      x: input vector of the sample.
      y: expected output of the sample.
      loss: object whose df(y_pred, y) gives the loss gradient at the output.

    Returns:
      dict with 'bias_costs' and 'weights_costs'; each is a list holding one
      gradient array per layer, in layer order.
    """
    # Forward pass, remembering each layer's input and pre-activation z.
    layer_inputs = []
    z_values = []
    for l in self.layers:
      layer_inputs.append(x)
      x, z = l.feed(x, return_z=True)
      z_values.append(z)

    # After the loop x holds the network output.
    layer_cost = loss.df(x, y)

    # Backward pass: walk the layers last to first, then restore layer order.
    bias_costs = []
    weights_costs = []
    for l, layer_input, z in zip(reversed(self.layers), reversed(layer_inputs), reversed(z_values)):
      z_cost = l.activation.df(z) * layer_cost
      bias_costs.append(z_cost)
      weights_costs.append(np.outer(z_cost, layer_input))
      # Cost with respect to this layer's input, handed to the previous layer.
      layer_cost = np.matmul(np.transpose(l.get_weights()), z_cost)
    bias_costs.reverse()
    weights_costs.reverse()

    return {'bias_costs': bias_costs, 'weights_costs': weights_costs}

  def update_batch(self, batch, learning_rate, loss=dmse):
    """Apply one gradient-descent step using the mean gradient over `batch`.

    Args:
      batch: sequence of (x, y) sample pairs.
      learning_rate: step size applied to the batch-averaged gradient.
      loss: loss whose gradient drives the update (defaults to dmse).
    """
    nabla_b = [np.zeros(l.get_bias().shape) for l in self.layers]
    nabla_w = [np.zeros(l.get_weights().shape) for l in self.layers]
    for x, y in batch:
      history = self.backprop(x, y, loss=loss)
      nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, history['bias_costs'])]
      nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, history['weights_costs'])]

    step = learning_rate / len(batch)
    for l, nw, nb in zip(self.layers, nabla_w, nabla_b):
      l.set_weights(l.get_weights() - step * nw)
      l.set_bias(l.get_bias() - step * nb)

  def fit(self, x, y, epochs, batch_size, learning_rate=0.001, loss=dmse):
    """Train with mini-batch gradient descent, printing the loss after each epoch.

    Args:
      x, y: paired training inputs and expected outputs.
      epochs: number of passes over the training data.
      batch_size: number of samples per gradient step.
      learning_rate: step size passed to update_batch.
      loss: loss used both for the weight updates and for the reported value.
    """
    train_data = list(zip(x, y))
    n = len(train_data)
    for j in range(epochs):
      # Reshuffle every epoch so the batches differ between epochs.
      np.random.shuffle(train_data)
      batches = [
          train_data[k : k + batch_size]
          for k in range(0, n, batch_size)
      ]
      for batch in batches:
        self.update_batch(batch, learning_rate, loss=loss)
      print(f"Epoch {j} complete! Loss: {self.eval(x, y, loss=loss)}")


In [None]:
def accuracy(Y_pred, Y):
  """Share of samples whose prediction lies on the same side of 0.5 as the label.

  A pair counts as correct when both values are > 0.5 (true positive) or both
  are < 0.5 (true negative); a value of exactly 0.5 never counts as correct.

  Args:
    Y_pred: predicted scores, one per sample.
    Y: true labels, one per sample.

  Returns:
    (TP + TN) / len(Y), or 0.0 when Y is empty.
  """
  if len(Y) == 0:
    return 0.0
  correct = 0
  for y_pred, y in zip(Y_pred, Y):
    if (y_pred > .5 and y > .5) or (y_pred < .5 and y < .5):
      correct += 1
  return correct / len(Y)

def precision(Y_pred, Y):
  """Fraction of predicted positives (score > 0.5) whose label is also > 0.5.

  Args:
    Y_pred: predicted scores, one per sample.
    Y: true labels, one per sample.

  Returns:
    TP / (number of predictions > 0.5), or 0.0 when nothing is predicted
    positive (the ratio is undefined in that case).
  """
  predicted_positive = sum(1 for y_pred in Y_pred if y_pred > .5)
  if predicted_positive == 0:
    return 0.0
  TP = sum(1 for y_pred, y in zip(Y_pred, Y) if y_pred > .5 and y > .5)
  return TP / predicted_positive

def recall(Y_pred, Y):
  """Fraction of positive labels (> 0.5) that are also predicted > 0.5.

  Args:
    Y_pred: predicted scores, one per sample.
    Y: true labels, one per sample.

  Returns:
    TP / (number of labels > 0.5), or 0.0 when there are no positive labels
    (the ratio is undefined in that case).
  """
  actual_positive = sum(1 for y in Y if y > .5)
  if actual_positive == 0:
    return 0.0
  TP = sum(1 for y_pred, y in zip(Y_pred, Y) if y_pred > .5 and y > .5)
  return TP / actual_positive

# Mushrooms

## Dataset

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import pandas as pd

In [None]:
from ucimlrepo import fetch_ucirepo

# UCI Mushroom dataset (id 73).
mushroom = fetch_ucirepo(id=73)

# Work on independent copies: editing the frames handed out by the loader
# in place raises pandas' SettingWithCopyWarning further down the notebook.
features = mushroom.data.features.copy()
target = mushroom.data.targets.copy()

In [None]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   cap-shape                 8124 non-null   object
 1   cap-surface               8124 non-null   object
 2   cap-color                 8124 non-null   object
 3   bruises                   8124 non-null   object
 4   odor                      8124 non-null   object
 5   gill-attachment           8124 non-null   object
 6   gill-spacing              8124 non-null   object
 7   gill-size                 8124 non-null   object
 8   gill-color                8124 non-null   object
 9   stalk-shape               8124 non-null   object
 10  stalk-root                5644 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

In [None]:
target.describe()

Unnamed: 0,poisonous
count,8124
unique,2
top,e
freq,4208


In [None]:
features.describe()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,6,4,10,2,9,2,2,2,12,2,...,4,9,9,1,4,3,5,9,6,7
top,x,y,n,f,n,f,c,b,b,t,...,s,w,w,p,w,o,p,w,v,d
freq,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [None]:
features['veil-type'].describe()

count     8124
unique       1
top          p
freq      8124
Name: veil-type, dtype: object

In [None]:
# veil-type has a single value for every row, so it carries no information.
# Rebinding to a new frame avoids the in-place edit of a slice
# (SettingWithCopyWarning); errors='ignore' keeps the cell re-runnable.
features = features.drop(columns=['veil-type'], errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.drop('veil-type', axis=1, inplace=True)


In [None]:
features.isnull().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [None]:
'''
bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,
missing=?
'''
features['stalk-root'].unique()

array(['e', 'c', 'b', 'r', nan], dtype=object)

In [None]:
# Encode a missing stalk-root as its own category 'm'. The column is replaced
# on a new frame: an in-place fillna on features['stalk-root'] acts on a
# temporary object and leaves `features` unchanged under pandas Copy-on-Write.
features = features.assign(**{'stalk-root': features['stalk-root'].fillna('m')})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['stalk-root'].fillna('m', inplace=True)


In [None]:
full_df = features.copy()
full_df['poisonous'] = target['poisonous']

In [None]:
mushrooms_gain_ratio = pd.DataFrame(index=full_df.columns.drop('poisonous'))
mushrooms_gain_ratio['gain_ratio'] = mushrooms_gain_ratio.index.map(lambda col: gain_ratio(full_df, 'poisonous', col))

In [None]:
mushrooms_gain_ratio.abs().style.background_gradient(cmap='coolwarm')

Unnamed: 0,gain_ratio
cap-shape,0.029522
cap-surface,0.018147
cap-color,0.014361
bruises,0.19644
odor,0.390648
gill-attachment,0.081818
gill-spacing,0.158154
gill-size,0.257946
gill-color,0.137597
stalk-shape,0.007616


In [None]:
gain_ratio_threshold = 0.2

In [None]:
good_cols = [col for col in features.columns if mushrooms_gain_ratio['gain_ratio'].loc[col] > gain_ratio_threshold]
good_cols

['odor',
 'gill-size',
 'stalk-surface-above-ring',
 'ring-type',
 'spore-print-color']

In [None]:
features = features.drop(columns=[col for col in features.columns if col not in good_cols])
features.columns

Index(['odor', 'gill-size', 'stalk-surface-above-ring', 'ring-type',
       'spore-print-color'],
      dtype='object')

In [None]:
binary_columns = [col for col in features.columns if features[col].nunique() == 2]
binary_columns

['gill-size']

In [None]:
if 'ring-number' in features.columns:
  features['ring-number'] = features['ring-number'].map({"n": 0, "o": 1, "t": 2})

In [None]:
features = pd.get_dummies(features, columns=[col for col in features.columns if col not in binary_columns + ['ring-number']], drop_first=False)

In [None]:
features = pd.get_dummies(features, columns=binary_columns, drop_first=True)

In [None]:
target = pd.get_dummies(target, columns=['poisonous'], drop_first=True)

In [None]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   poisonous_p  8124 non-null   bool 
dtypes: bool(1)
memory usage: 8.1 KB


In [None]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   odor_a                      8124 non-null   bool 
 1   odor_c                      8124 non-null   bool 
 2   odor_f                      8124 non-null   bool 
 3   odor_l                      8124 non-null   bool 
 4   odor_m                      8124 non-null   bool 
 5   odor_n                      8124 non-null   bool 
 6   odor_p                      8124 non-null   bool 
 7   odor_s                      8124 non-null   bool 
 8   odor_y                      8124 non-null   bool 
 9   stalk-surface-above-ring_f  8124 non-null   bool 
 10  stalk-surface-above-ring_k  8124 non-null   bool 
 11  stalk-surface-above-ring_s  8124 non-null   bool 
 12  stalk-surface-above-ring_y  8124 non-null   bool 
 13  ring-type_e                 8124 non-null   bool 
 14  ring-typ

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

## Custom

In [None]:
model = MyModel([MyLayer(X_train.shape[1], 5, activation=dsigmoid), MyLayer(5, 1, activation=dsigmoid)])

In [None]:
model.feed(X_test.astype('float32').values[0])

array([0.9273488])

In [None]:
model.eval(X_test.astype('float32').values, y_test.astype('float32').values)

{'loss': 0.45191227905050907}

In [None]:
model.fit(X_train.astype('float32').values, y_train.astype('float32').values, 100, 1, learning_rate=0.001)

Epoch 0 complete! Loss: {'loss': 0.2500732341144309}
Epoch 1 complete! Loss: {'loss': 0.2469317811787409}
Epoch 2 complete! Loss: {'loss': 0.24501244090235694}
Epoch 3 complete! Loss: {'loss': 0.2424857862290137}
Epoch 4 complete! Loss: {'loss': 0.23894353523330372}
Epoch 5 complete! Loss: {'loss': 0.2332275496416586}
Epoch 6 complete! Loss: {'loss': 0.22402781101839297}
Epoch 7 complete! Loss: {'loss': 0.20949914356388627}
Epoch 8 complete! Loss: {'loss': 0.1892625272748008}
Epoch 9 complete! Loss: {'loss': 0.16638768340466645}
Epoch 10 complete! Loss: {'loss': 0.14501975525092142}
Epoch 11 complete! Loss: {'loss': 0.127032581263524}
Epoch 12 complete! Loss: {'loss': 0.1124416582463571}
Epoch 13 complete! Loss: {'loss': 0.10056433348413676}
Epoch 14 complete! Loss: {'loss': 0.09075518685026991}
Epoch 15 complete! Loss: {'loss': 0.08246565803689193}
Epoch 16 complete! Loss: {'loss': 0.07530694943917716}
Epoch 17 complete! Loss: {'loss': 0.06911516263131412}
Epoch 18 complete! Loss: {'l

In [None]:
accuracy([model.feed(x) for x in X_test.astype('float32').values], y_test.astype('float32').values)

0.9913846153846154

In [None]:
precision([model.feed(x) for x in X_test.astype('float32').values], y_test.astype('float32').values)

1.0

In [None]:
recall([model.feed(x) for x in X_test.astype('float32').values], y_test.astype('float32').values)

0.981888745148771

# Laptops

## Dataset

In [None]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [None]:
!kaggle datasets download -d mrsimple07/laptoppriceprediction

Dataset URL: https://www.kaggle.com/datasets/mrsimple07/laptoppriceprediction
License(s): Apache 2.0
Downloading laptoppriceprediction.zip to /content
  0% 0.00/39.1k [00:00<?, ?B/s]
100% 39.1k/39.1k [00:00<00:00, 48.6MB/s]


In [None]:
!unzip laptoppriceprediction.zip

Archive:  laptoppriceprediction.zip
  inflating: Laptop_price.csv        


In [None]:
import pandas as pd

df = pd.read_csv("Laptop_price.csv")
df.head()

Unnamed: 0,Brand,Processor_Speed,RAM_Size,Storage_Capacity,Screen_Size,Weight,Price
0,Asus,3.830296,16,512,11.185147,2.641094,17395.093065
1,Acer,2.912833,4,1000,11.311372,3.260012,31607.605919
2,Lenovo,3.241627,4,256,11.853023,2.029061,9291.023542
3,Acer,3.806248,16,512,12.28036,4.573865,17436.728334
4,Acer,3.268097,32,1000,14.990877,4.193472,32917.990718


In [None]:
# Price range of the raw data, kept so that scaled prices can be mapped back.
price_min, price_max = df['Price'].min(), df['Price'].max()

def unnorm_price(price):
  """Map a min-max scaled price back onto the [price_min, price_max] range."""
  return price * (price_max - price_min) + price_min

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
discdf=df.copy()

In [None]:
discdf["Price"] = pd.qcut(discdf['Price'], q=5, labels=range(5))
# discdf["Price"] = pd.cut(discdf['Price'], bins=[0, 15000, 25000, 100000], labels=['cheap', 'medium', 'expensive'])

In [None]:
categorical = ['RAM_Size', 'Storage_Capacity', 'Brand']

In [None]:
for cat in categorical:
  discdf[cat] = discdf[cat].astype('category')

In [None]:
for col in df.columns:
  if col not in categorical + ['Price']:
    discdf[col] = pd.qcut(df[col], q=5, labels=range(5))

In [None]:
laptops_gain_ratio = pd.DataFrame(index=discdf.columns.drop('Price'))
laptops_gain_ratio['gain_ratio'] = laptops_gain_ratio.index.map(lambda col: gain_ratio(discdf, 'Price', col))

In [None]:
laptops_gain_ratio.abs().style.background_gradient(cmap='coolwarm')

Unnamed: 0,gain_ratio
Brand,0.006558
Processor_Speed,0.008412
RAM_Size,0.145987
Storage_Capacity,0.790761
Screen_Size,0.005757
Weight,0.003082


In [None]:
good_cols = [col for col in laptops_gain_ratio.index if laptops_gain_ratio['gain_ratio'].loc[col] > 0.1]
good_cols

['RAM_Size', 'Storage_Capacity']

In [None]:
df = df.drop(columns=[col for col in df.columns if col not in good_cols + ['Price']])
df.columns

Index(['RAM_Size', 'Storage_Capacity', 'Price'], dtype='object')

In [None]:
for col in [c for c in df.columns if c not in ["Brand"]]:
    df[col] = ((df[col] - df[col].min()) / (df[col].max() - df[col].min()))

In [None]:
if 'Brand' in df.columns:
  df = pd.get_dummies(df, columns=["Brand"], drop_first=False)
  df

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   RAM_Size          1000 non-null   float64
 1   Storage_Capacity  1000 non-null   float64
 2   Price             1000 non-null   float64
dtypes: float64(3)
memory usage: 23.6 KB


In [None]:
df.head()

Unnamed: 0,RAM_Size,Storage_Capacity,Price
0,0.428571,0.344086,0.353939
1,0.0,1.0,0.923946
2,0.0,0.0,0.028917
3,0.428571,0.344086,0.355609
4,1.0,1.0,0.9765


In [None]:
df.describe()

Unnamed: 0,RAM_Size,Storage_Capacity,Price
count,1000.0,1000.0,1000.0
mean,0.410714,0.441634,0.442537
std,0.392452,0.421288,0.37724
min,0.0,0.0,0.0
25%,0.142857,0.0,0.061924
50%,0.428571,0.344086,0.349613
75%,1.0,1.0,0.922286
max,1.0,1.0,1.0


In [None]:
X = df.drop("Price", axis=1)
y = df["Price"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Custom

In [None]:
model = MyModel([MyLayer(X_train.shape[1], 10), MyLayer(10, 5, activation=dsigmoid), MyLayer(5, 1, activation=dsigmoid)])

In [None]:
unnorm_price(y_test.astype('float64').values[0])

31571.913188565733

In [None]:
unnorm_price(model.feed(X_test.astype('float64').values[0])[0])

31618.94613696061

In [None]:
model.eval(X_test.astype('float64').values, y_test.astype('float64').values)

{'loss': 0.0005421661561624253}

In [None]:
model.fit(X_train.astype('float64').values, y_train.astype('float64').values, 100, 100, learning_rate=1)

Epoch 0 complete! Loss: {'loss': 0.1468341918025962}
Epoch 1 complete! Loss: {'loss': 0.1456198253236923}
Epoch 2 complete! Loss: {'loss': 0.1444708643828606}
Epoch 3 complete! Loss: {'loss': 0.14401276645233163}
Epoch 4 complete! Loss: {'loss': 0.14384585331188743}
Epoch 5 complete! Loss: {'loss': 0.1429461093357166}
Epoch 6 complete! Loss: {'loss': 0.14456021273810954}
Epoch 7 complete! Loss: {'loss': 0.14189963348933662}
Epoch 8 complete! Loss: {'loss': 0.1414006648616828}
Epoch 9 complete! Loss: {'loss': 0.14427161347140874}
Epoch 10 complete! Loss: {'loss': 0.1406000139226665}
Epoch 11 complete! Loss: {'loss': 0.13925698524206007}
Epoch 12 complete! Loss: {'loss': 0.13809437494795548}
Epoch 13 complete! Loss: {'loss': 0.13651994265825915}
Epoch 14 complete! Loss: {'loss': 0.13406278587595286}
Epoch 15 complete! Loss: {'loss': 0.13048111252354228}
Epoch 16 complete! Loss: {'loss': 0.1249891031996207}
Epoch 17 complete! Loss: {'loss': 0.116737508765671}
Epoch 18 complete! Loss: {'lo