# **Лабораторная работа 2**

Для начала подключаем необходимые для дальнейшей работы модули. И их много.

In [12]:
import pandas
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_random_state
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
from collections import Counter

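Заготавливаем функцию, которая проводит стратифицированную k-блочную кросс-валидацию: на каждом разбиении обучает модель, делает предсказания и считает precision, recall и accuracy (на обучающей и на тестовой части), а затем выводит значения, усреднённые по всем разбиениям.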

In [2]:
def DisplayMetrics(Method, X, Y, folds = 5, average = 'macro'):
  """Cross-validate a classifier and print its fold-averaged metrics.

  The data is split with a shuffled StratifiedKFold (fixed random_state = 128).
  For every fold the model is re-fitted on the training part and scored on
  both parts.

  Parameters
  ----------
  Method : object
      Classifier exposing fit(X, y) and predict(X).
  X : pandas.DataFrame or array-like of shape (n_samples, n_features)
      Feature matrix.
  Y : pandas.Series or array-like of shape (n_samples,)
      Class labels.
  folds : int
      Number of cross-validation folds.
  average : str
      Averaging mode forwarded to precision_score and recall_score.

  Returns
  -------
  None
      The mean precision, recall, train accuracy and test accuracy are printed.
  """
  # StratifiedKFold yields *positional* indices. Converting to plain arrays up
  # front keeps the lookup positional regardless of the DataFrame index
  # (label-based .loc only happened to work for a default RangeIndex) and lets
  # callers pass NumPy arrays as well.
  XArr = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
  YArr = Y.to_numpy() if hasattr(Y, "to_numpy") else np.asarray(Y)

  kf = StratifiedKFold(n_splits = folds, random_state = 128, shuffle = True)
  precision = np.zeros(folds)
  recall = np.zeros(folds)
  testAc = np.zeros(folds)
  trainAc = np.zeros(folds)
  for step, (trainI, valI) in enumerate(kf.split(XArr, YArr)):
    TrX, TrY = XArr[trainI], YArr[trainI]
    ValX, ValY = XArr[valI], YArr[valI]
    Method.fit(TrX, TrY)
    # Some of the hand-written models return column vectors of shape (n, 1);
    # flatten so the metrics always receive 1-D label arrays.
    PredY = np.ravel(Method.predict(ValX))
    PredTrY = np.ravel(Method.predict(TrX))
    precision[step] = precision_score(ValY, PredY, average = average)
    recall[step] = recall_score(ValY, PredY, average = average)
    trainAc[step] = accuracy_score(TrY, PredTrY)
    testAc[step] = accuracy_score(ValY, PredY)
  print("precision:", precision.mean())
  print("recall:", recall.mean())
  print("train_accuracy:", trainAc.mean())
  print("test_accuracy:", testAc.mean())

## **Реализации из грязи и палок**

Логистическая регрессия

In [3]:
class LogReg():
  """Binary logistic regression trained with full-batch gradient descent.

  An intercept column of ones is prepended to the features, so the first
  entry of self.wei is the bias term. Features are used as given (no scaling
  is applied here).

  Parameters
  ----------
  accuracy : float
      Step size of each gradient update (it is used as the learning rate).
  iters : int
      Number of gradient steps performed by fit.
  """
  def __init__(self, accuracy = 0.01, iters = 1000):
    self.it = iters
    self.ac = accuracy

  def fit(self, X, y):
    """Learn the weights from features X and 0/1 labels y."""
    X = self.AddIntercept(np.asarray(X, dtype = float))
    y = np.asarray(y, dtype = float).ravel()
    self.wei = np.zeros(X.shape[1])
    for _ in range(self.it):
      h = self.Sigmoid(np.dot(X, self.wei))
      # Gradient of the mean log-loss with respect to the weights.
      grad = np.dot(X.T, (h - y)) / y.size
      self.wei -= self.ac * grad

  def PredictProba(self, X):
    """Return the estimated probability of class 1 for every row of X."""
    X = self.AddIntercept(np.asarray(X, dtype = float))
    return self.Sigmoid(np.dot(X, self.wei))

  def predict(self, X, threshold = 0.5):
    """Return hard 0/1 class labels for every row of X.

    Classification metrics need discrete labels, so the probabilities are
    thresholded here; use PredictProba to get the raw probabilities.
    """
    return (self.PredictProba(X) >= threshold).astype(int)

  def Sigmoid(self, x):
    """Logistic function 1 / (1 + exp(-x)).

    Written through tanh, which is mathematically identical but cannot
    overflow for large |x| (np.exp(-x) does for very negative x).
    """
    return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x, dtype = float)))

  def AddIntercept(self, X):
    """Prepend a column of ones to X (the bias feature)."""
    return np.concatenate((np.ones((X.shape[0], 1)), X), axis = 1)

KNN

In [4]:
class KNN():
  """Binary k-nearest-neighbours classifier with Euclidean distance.

  Labels are expected to be 0/1: a sample is assigned class 1 when strictly
  more than half of its nearest neighbours carry label 1, otherwise class 0.

  Parameters
  ----------
  neighbors : int
      Number of nearest training samples that vote (must be >= 1).
  """
  def __init__(self, neighbors = 5):
    if neighbors < 1:
      raise ValueError("neighbors must be a positive integer")
    self.nei = neighbors

  def fit(self, X, y):
    """Memorise the training set.

    X and y may be arrays, lists or pandas objects. Features are stored as
    floats so that squaring differences of large integer features cannot
    overflow an integer dtype.
    """
    self.X = np.asarray(X, dtype = float)
    y = np.asarray(y)
    if y.shape[0] == 0:
      raise ValueError("cannot fit KNN on an empty training set")
    self.y = y.reshape((y.shape[0], 1))

  def predict(self, X):
    """Return an (n_samples, 1) float array of 0.0 / 1.0 predictions."""
    X = np.asarray(X, dtype = float)
    n = X.shape[0]
    labels = self.y.ravel()
    # Never ask for more neighbours than there are training samples; the
    # majority is then taken over the neighbours that actually exist.
    k = min(self.nei, labels.shape[0])
    prediction = np.zeros((n, 1))
    for i in range(n):
      d = self.Dist(X[i])
      # Only the k smallest distances matter, so a partial selection is
      # enough; a full sort of all training distances is not needed.
      nearest = np.argpartition(d, k - 1)[:k]
      if labels[nearest].sum() > k / 2:
        prediction[i] = 1.0
    return prediction

  def Dist(self, p):
    """Euclidean distance from point p to every stored training sample."""
    t = self.X - p
    return np.sqrt((t ** 2).sum(1))

Дерево решений

In [22]:
class Node():
  """One node of the decision tree.

  A node is a leaf while left is None; otherwise samples whose feature
  iFeature is below border go to left and all others to right.
  """
  def __init__(self, predType):
    self.predType = predType  # label predicted when traversal stops at this node
    self.iFeature = 0         # column index tested at this node
    self.border = 0           # split threshold for that column
    self.left = None
    self.right = None


class DecisionTree():
  """Classification tree that chooses splits by weighted Gini impurity.

  Parameters
  ----------
  mDepth : int
      Maximum depth of the tree (0 gives a single leaf).
  rf : bool
      Random-forest mode. When True, fit trains on a bootstrap sample of the
      rows and restricts the tree to a random subset of the columns. The
      subset is drawn once per tree, not once per split.
  """
  def __init__(self, mDepth = 1, rf = False):
    self.mDepth = mDepth
    self.rf = rf

  def fit(self, X, y, maxFeatures = None):
    """Build the tree.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Class labels. Any label values are accepted; they are encoded to
        0..K-1 internally and decoded again for the predictions.
    maxFeatures : int or None
        Only used when rf is True: number of columns the tree may use.
        None means floor(sqrt(n_features)).
    """
    X = np.asarray(X)
    # Encode labels as consecutive integers so they can index the per-class
    # counters; self.classes_ maps the codes back to the original labels.
    self.classes_, y = np.unique(np.asarray(y).ravel(), return_inverse = True)
    self.types = len(self.classes_)
    if not self.rf:
      Features = X.shape[1]
    else:
      # Bootstrap: draw n_samples rows with replacement.
      ind = np.random.choice(X.shape[0], X.shape[0])
      X, y = X[ind], y[ind]
      if maxFeatures is None:
        Features = int(np.sqrt(X.shape[1]))
      else:
        # Clamp so that sampling without replacement below cannot fail.
        Features = max(1, min(int(maxFeatures), X.shape[1]))
    self.features = np.sort(np.random.choice(X.shape[1], Features, replace = False))
    self.tree = self.UpdateTree(X, y)

  def predict(self, X):
    """Return a list with the predicted label for every row of X."""
    X = np.asarray(X)
    predictions = []
    for inputs in X:
      node = self.tree
      while node.left is not None:
        if inputs[node.iFeature] < node.border:
          node = node.left
        else:
          node = node.right
      predictions.append(node.predType)
    return predictions

  def Split(self, X, y):
    """Find the split with the lowest weighted Gini impurity.

    y must hold class codes in 0..self.types-1. Returns (feature index,
    threshold), or (None, None) when no split improves on the impurity of
    the unsplit node.
    """
    y = np.asarray(y, dtype = np.intp)
    m = y.size
    if m <= 1:
      return None, None
    parent = np.bincount(y, minlength = self.types)
    bGini = 1.0 - np.sum((parent / m) ** 2)
    bIdx, bThr = None, None
    nLeft = np.arange(1, m)   # left-part size for a cut after position i - 1
    nRight = m - nLeft
    for idx in self.features:
      order = np.argsort(X[:, idx], kind = "stable")
      borders, types = X[order, idx], y[order]
      # A cut is only possible between two different feature values.
      canSplit = borders[1:] != borders[:-1]
      if not canSplit.any():
        continue
      # Running per-class counts: row i - 1 holds the counts of the first i
      # sorted samples, i.e. the left part of the i-th candidate cut.
      onehot = np.zeros((m, self.types))
      onehot[np.arange(m), types] = 1.0
      left = np.cumsum(onehot, axis = 0)[:-1]
      right = parent - left
      giniLeft = 1.0 - np.sum((left / nLeft[:, None]) ** 2, axis = 1)
      giniRight = 1.0 - np.sum((right / nRight[:, None]) ** 2, axis = 1)
      gini = (nLeft * giniLeft + nRight * giniRight) / m
      gini[~canSplit] = np.inf
      best = np.argmin(gini)
      if gini[best] < bGini:
        bGini = gini[best]
        bIdx = idx
        lo, hi = borders[best], borders[best + 1]
        bThr = (lo + hi) / 2
        # For neighbouring floats the midpoint can round down onto lo; the
        # "< threshold" test would then no longer separate lo from hi.
        if not bThr > lo:
          bThr = hi
    return bIdx, bThr

  def UpdateTree(self, X, y, depth = 0):
    """Recursively build the subtree for samples X with class codes y."""
    sPerClass = np.bincount(y, minlength = self.types)
    # Every node stores the majority label so that it can act as a leaf.
    node = Node(predType = self.classes_[np.argmax(sPerClass)])
    if depth < self.mDepth:
      idx, thr = self.Split(X, y)
      if idx is not None:
        Lidx = X[:, idx] < thr
        node.iFeature = idx
        node.border = thr
        node.left = self.UpdateTree(X[Lidx], y[Lidx], depth + 1)
        node.right = self.UpdateTree(X[~Lidx], y[~Lidx], depth + 1)
    return node

Случайный лес

In [6]:
class RandomForest():
    """Bagging ensemble of DecisionTree models combined by majority vote.

    Parameters
    ----------
    mDepth : int
        Maximum depth of every tree.
    nEst : int
        Number of trees in the forest.
    mFeatures : int or None
        Number of features each tree may use; forwarded to
        DecisionTree.fit as maxFeatures (None lets the tree pick its default).
    """
    def __init__(self, mDepth=5, nEst=100, mFeatures=None):
        self.mDepth = mDepth
        self.mFeatures = mFeatures
        self.nEst = nEst
        self.forest = [None] * nEst

    def fit(self, X, Y):
        """Train nEst trees; each tree is created in random-forest mode."""
        X, Y = np.asarray(X), np.asarray(Y)
        for i in range(self.nEst):
            self.forest[i] = DecisionTree(self.mDepth, rf=True)
            # The tree methods are lower-case fit / predict; mFeatures was
            # previously stored but never handed to the trees.
            self.forest[i].fit(X, Y, maxFeatures=self.mFeatures)

    def predict(self, X):
        """Return the most frequent tree prediction per row as an int array."""
        X = np.asarray(X)
        mFreq = np.zeros(X.shape[0])
        preds = np.zeros((self.nEst, X.shape[0]))
        for i in range(self.nEst):
            preds[i] = self.forest[i].predict(X)
        for i in range(len(mFreq)):
            # Column i holds the votes of all trees for sample i.
            mFreq[i] = Counter(preds[:, i]).most_common(1)[0][0]
        return mFreq.astype(int)

# **Первый датасет - RGB**

Подготавливаем данные для последующей работы

In [24]:
# Load the first dataset; the relative path means the CSV is looked up in the
# kernel's working directory.
RGB = pandas.read_csv("dataset1.csv")
# Columns used as features.
needed = ["R", "G", "B"]
# Target column. The mapping keeps 0 and 1 unchanged; Series.map turns any
# value that is not a key of the dict into NaN.
y = RGB["Type"].map({0 : 0, 1 : 1})
# get_dummies only expands object/categorical columns -- presumably R, G and B
# are numeric, in which case this is a pass-through; TODO confirm.
X = pandas.get_dummies(RGB[needed])

## **Реализации на sklearn**

Логистическая регрессия

In [25]:
%%time
DisplayMetrics(LogisticRegression(), X, y)

precision: 0.8637705964884009
recall: 0.822665881213337
train_accuracy: 0.9328375
test_accuracy: 0.93285
Wall time: 345 ms


KNN

In [9]:
%%time
DisplayMetrics(KNeighborsClassifier(n_neighbors=5), X, y) 

precision: 0.9827451699169805
recall: 0.9833814651078961
train_accuracy: 0.9960625000000001
test_accuracy: 0.9924
Wall time: 3.28 s


Дерево решений

In [21]:
%%time
DisplayMetrics(DecisionTreeClassifier(max_depth=5), X, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
Wall time: 91.3 ms


Случайный лес

In [13]:
%%time 
DisplayMetrics(RandomForestClassifier(n_estimators=50, max_depth=2), X, y) 

precision: 0.435675
recall: 0.5
train_accuracy: 0.87135
test_accuracy: 0.87135
Wall time: 1.32 s


## **Реализации из грязи и палок**

Логистическая регрессия

In [34]:
%%time
DisplayMetrics(LogReg(), X, y)

precision: 0.535696792948237 
recall: 0.5001945525291829 

train_accuracy: 0.8713750000000001 
test_accuracy: 0.8714000000000001
CPU times: user 4.07 s, sys: 2.88 s, total: 6.95 s
Wall time: 3.54 s


KNN

In [9]:
%%time
DisplayMetrics(KNN(neighbors = 3), X, y)

precision: 0.9821150044296946 
recall: 0.9815726516822272 

train_accuracy: 0.9966000000000002 
test_accuracy: 0.99185
CPU times: user 2min 52s, sys: 1.33 s, total: 2min 54s
Wall time: 2min 54s


Дерево решений

In [37]:
%%time
DisplayMetrics(DecisionTree(mDepth = 4), X, y)

precision: 1.0 
recall: 1.0 

train_accuracy: 1.0 
test_accuracy: 1.0
CPU times: user 5.18 s, sys: 7.03 ms, total: 5.19 s
Wall time: 5.21 s


Случайный лес

In [42]:
%%time
DisplayMetrics(RandomForest(), X, y)

precision: 0.435675 
recall: 0.5 

train_accuracy: 0.87135 
test_accuracy: 0.87135
CPU times: user 4min 39s, sys: 78.7 ms, total: 4min 39s
Wall time: 4min 40s


# **Второй датасет - YouTube**

Подготавливаем данные для последующей работы

In [15]:
# Load the second dataset; the relative path means the CSV is looked up in the
# kernel's working directory.
youtube = pandas.read_csv("dataset2.csv")
# Columns used as features. NOTE(review): this rebinds the notebook-global
# `needed` that the RGB cell also assigns.
needed = ["views", "likes", "dislikes", "comment_count"]
# Target column. The mapping keeps 0 and 1 unchanged; Series.map turns any
# value that is not a key of the dict into NaN.
yy = youtube["type"].map({0 : 0, 1 : 1})
# get_dummies only expands object/categorical columns -- presumably all four
# columns are numeric counts, in which case this is a pass-through; TODO confirm.
XX = pandas.get_dummies(youtube[needed])

## **Реализации на sklearn**

Логистическая регрессия

In [16]:
%%time
DisplayMetrics(LogisticRegression(), XX, yy)

precision: 0.4500862084313975
recall: 0.4997476030240396
train_accuracy: 0.35265818533180177
test_accuracy: 0.35256050770032943
Wall time: 910 ms


KNN

In [17]:
%%time
DisplayMetrics(KNeighborsClassifier(n_neighbors=5), XX, yy) 

precision: 0.9995092718766067
recall: 0.9996351444031477
train_accuracy: 0.999822950723009
test_accuracy: 0.9996092736451754
Wall time: 7.17 s


Дерево решений

In [23]:
%%time
DisplayMetrics(DecisionTreeClassifier(max_depth=15), XX, yy) 

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
Wall time: 189 ms


Случайный лес

In [19]:
%%time 
DisplayMetrics(RandomForestClassifier(n_estimators=50, max_depth=2), XX, yy)

precision: 0.9999811427493871
recall: 0.9999653739612189
train_accuracy: 0.9999938948075338
test_accuracy: 0.99997557997558
Wall time: 3.57 s


## **Реализации из грязи и палок**

Логистическая регрессия

In [17]:
%%time
DisplayMetrics(LogReg(), XX, yy)

precision: 0.17628025086811264 
recall: 0.5 

train_accuracy: 0.3525605020659815 
test_accuracy: 0.3525605017362253
CPU times: user 9.05 s, sys: 5.74 s, total: 14.8 s
Wall time: 7.51 s


KNN

In [18]:
%%time
DisplayMetrics(KNN(neighbors = 3), XX, yy)

precision: 0.9994875544239743 
recall: 0.9995501008737495 

train_accuracy: 0.9998717924490995 
test_accuracy: 0.9995604216681272
CPU times: user 12min 59s, sys: 5.16 s, total: 13min 4s
Wall time: 13min 5s


Дерево решений

In [41]:
%%time
DisplayMetrics(DecisionTree(mDepth = 3), XX, yy)

precision: 1.0 
recall: 1.0 

train_accuracy: 1.0 
test_accuracy: 1.0
CPU times: user 11.3 s, sys: 26 ms, total: 11.3 s
Wall time: 11.3 s


Случайный лес

In [43]:
%%time
DisplayMetrics(RandomForest(), XX, yy)

precision: 0.9543573550853347 
recall: 0.9429624748421712 

train_accuracy: 0.9551026322734861 
test_accuracy: 0.953308036585604
CPU times: user 17min 25s, sys: 1.88 s, total: 17min 27s
Wall time: 17min 28s
