In [None]:
# NOTE(review): `drive` is used here before any import in this cell; the
# `from google.colab import drive` statement only appears in later cells, so
# this cell relies on that name already being defined in the kernel.
drive.mount("/content/gdrive/")
# List the project folder underneath the mount point used above.
!ls "/content/gdrive/MyDrive/ee/"

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
!ls "/content/gdrive/MyDrive/EEE485/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
ls: cannot access '/content/gdrive/MyDrive/EEE485/': No such file or directory


In [1]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import randrange
from random import seed
from google.colab import drive

In [2]:
def summary(data):
    """Print a quick textual overview of a DataFrame.

    Shows, in order, the first rows, the per-column dtype / non-null report
    and the descriptive statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    # First few rows of the data
    print(data.head())

    # Column types and missing values. `DataFrame.info()` writes its report
    # itself and returns None, so wrapping it in print() would emit a stray
    # "None" line after the report.
    data.info()

    # Summary statistics of the data
    print(data.describe())
    return None

def cleaner(data):
    """Return a cleaned copy of the raw cardio frame.

    Steps, in order: drop the first column, name the index ``'id'``, drop rows
    containing nulls, convert ``age`` from days to years (divide by 365) and
    keep only rows inside the plausibility ranges below.

    The input frame is left untouched. In particular the index is renamed via
    ``rename_axis`` (which returns a new object) instead of assigning to
    ``data.index.name``, because the sliced frame shares its Index object with
    the caller's frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with columns ``age``, ``height``, ``ap_hi``, ``ap_lo`` and
        ``weight`` after the first column is removed.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows that pass every filter.
    """
    cleaned = (
        data.iloc[:, 1:]          # Deselecting first column
        .rename_axis('id')        # Naming the index
        .dropna()                 # Removing null values
        .assign(age=lambda frame: frame['age'] / 365)  # Age in years
    )

    # NOTE(review): the upper pressure bounds are 1000 in code; earlier comments
    # mentioned 300 mmHg. The code values are kept as-is -- confirm which limit
    # is intended.
    keep = (
        cleaned.height.between(140, 220)    # height within [140, 220]
        & cleaned.ap_hi.between(50, 1000)   # systolic pressure within [50, 1000]
        & cleaned.ap_lo.between(20, 1000)   # diastolic pressure within [20, 1000]
        & (cleaned.weight >= 40)            # weight at least 40
    )

    # Power transform the data (not implemented)

    return cleaned[keep]

def visualizer(data):
    """Draw one grouped bar chart of ``cardio`` counts per categorical feature.

    For each of cholesterol, active, gluc, smoke, alco and gender a
    cross-tabulation against ``cardio`` is plotted as a non-stacked bar chart.
    All figures are shown with a single ``plt.show()`` at the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the six feature columns and the ``cardio`` column.

    Returns
    -------
    None
    """
    features = ('cholesterol', 'active', 'gluc', 'smoke', 'alco', 'gender')

    for position, feature in enumerate(features):
        # Before every chart except the first, select figure number `position`.
        if position:
            plt.figure(position)
        pd.crosstab(data[feature], data['cardio']).plot.bar(stacked=False)

    plt.show()

    return None

def splitter(seed, data, train = 0.60, val= 0.20):
    """Shuffle the rows of ``data`` and cut them into train / val / test parts.

    Parameters
    ----------
    seed : int
        Seed passed to ``np.random.seed``. The global NumPy RNG is seeded on
        purpose so the shuffle is reproducible.
    data : numpy.ndarray or pandas.DataFrame
        Row-wise dataset. A DataFrame is converted with ``to_numpy``. The
        caller's object is never modified: shuffling happens on a copy.
    train : float, default 0.60
        Fraction of rows placed in the training part.
    val : float, default 0.20
        Fraction of rows placed in the validation part; the remaining rows
        form the test part.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, val, test)`` slices of the shuffled copy.
    """
    # Work on a private copy: np.random.shuffle permutes rows in place and
    # would otherwise reorder the array owned by the caller.
    if isinstance(data, np.ndarray):
        shuffled = data.copy()
    else:
        shuffled = data.to_numpy(copy=True)

    np.random.seed(seed)
    np.random.shuffle(shuffled)

    indexOfTrain = int(len(shuffled)*train)
    indexOfVal = int(len(shuffled)*(train+val))

    train_part = shuffled[:indexOfTrain]
    val_part = shuffled[indexOfTrain:indexOfVal]
    test_part = shuffled[indexOfVal:]

    return train_part, val_part, test_part

def normalization(datas, n_features=11): # takes numpy array
  """Standardise the first ``n_features`` columns to zero mean, unit variance.

  Parameters
  ----------
  datas : numpy.ndarray
      2-D array of samples (rows) by columns. It is not modified.
  n_features : int, default 11
      Number of leading columns to standardise. Columns from ``n_features``
      onward are returned unchanged.

  Returns
  -------
  numpy.ndarray
      Float copy of ``datas`` with the leading columns standardised using this
      array's own column means and standard deviations.
  """
  # Float copy: keeps the caller's array intact and avoids truncation when the
  # input has an integer dtype.
  scaled = np.array(datas, dtype=float)
  block = scaled[:, :n_features]
  means = block.mean(axis=0)
  stds = block.std(axis=0)
  # A constant column has std 0; divide by 1 instead so it becomes all zeros
  # rather than NaN/inf.
  safe_stds = np.where(stds == 0, 1.0, stds)
  scaled[:, :n_features] = (block - means) / safe_stds
  return scaled
# pd.set_option('display.max_rows', None, 'display.max_columns', None) # Viewing the output without truncation
# Load the raw dataset: a semicolon-separated CSV whose first row is the header.
# The relative path is resolved against the current working directory.
data = pd.read_csv("cardio_train.csv", header= 0, sep= ";")

def label_design(data, label_col=11): # takes numpy array
  """Split a 2-D array into a feature matrix and a label vector.

  Parameters
  ----------
  data : numpy.ndarray
      2-D array whose column ``label_col`` holds the labels.
  label_col : int, default 11
      Index of the label column.

  Returns
  -------
  tuple
      ``(datas, labels)``: ``datas`` is ``data`` with the label column removed
      (a new array), ``labels`` is that column of ``data``.
  """
  labels = data[:, label_col]
  datas = np.delete(data, label_col, 1)
  return datas, labels

In [None]:
# Looking at the data
summary(data)

# Cleaning the data
# NOTE(review): this rebinds `data`, and cleaner() drops the first column on
# every call, so re-running this cell without reloading the CSV removes one
# more column each time.
data = cleaner(data)

# Looking once again
summary(data) # Now data looks better

# Visualisation of the data
visualizer(data)

# Splitting data
# seed=1; splitter's default fractions are 0.60 train and 0.20 validation,
# with the remaining rows used as the test part.
train, val, test= splitter(1, data)
# print(np.shape(train))
# print(np.shape(test))

Logistic Regression

In [None]:
class lr:
  """Binary logistic regression trained with full-batch gradient descent.

  The model has no separate bias term: the probability of class 1 is
  ``sigmoid(x . w)``. Labels are expected to be 0/1.

  Attributes are documented inline in ``__init__``.
  """

  def __init__(self, learning_rate=1e-4, threshold = 0.5):
    self.learning_rate = learning_rate  # gradient-descent step size
    self.threshold = threshold          # probability above which class 1 is predicted
    self.w = None                       # weight vector, set by initialize_weight/fit

  def initialize_weight(self,dim):
    """Reset the weights to a zero vector of length ``dim``."""
    self.w = np.zeros((dim,))
    return None

  def sigmoid(self, x):
    """Return ``sigmoid(x . w)`` for every row of ``x``.

    Uses ``exp(-|z|)`` so that neither branch can overflow: the naive
    ``exp(z) / (1 + exp(z))`` yields ``inf / inf = nan`` for large ``z``.
    """
    z = np.dot(x, self.w)
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0/(1.0 + decay), decay/(1.0 + decay))

  def fit(self, x_train, y_train, tol=0.01, max_iter=None):
    """Run gradient descent until the loss change drops to ``tol`` or below.

    Parameters
    ----------
    x_train : numpy.ndarray, shape (n_samples, n_features)
    y_train : numpy.ndarray, shape (n_samples,)
        0/1 labels.
    tol : float, default 0.01
        Training stops once two consecutive (summed) cross-entropy losses
        differ by at most this much.
    max_iter : int or None, default None
        Optional cap on the number of iterations; ``None`` means no cap.

    Prints the iteration number, loss and training accuracy each iteration.
    """
    n_samples = x_train.shape[0]
    # Allow fit() on a fresh model without a separate initialize_weight() call.
    if self.w is None:
      self.initialize_weight(x_train.shape[1])

    eps = 1e-12  # keeps log() finite when a probability saturates at 0 or 1
    i = 0
    pre_loss = None
    while max_iter is None or i < max_iter:
      i += 1
      gradient = np.dot(x_train.T, self.sigmoid(x_train) - y_train)/n_samples
      self.w = self.w - gradient*self.learning_rate

      y_prob = self.sigmoid(x_train)
      acc = self.accuracy(y_train, y_prob > self.threshold)
      clipped = np.clip(y_prob, eps, 1 - eps)
      # Binary cross-entropy: the negative-class term uses log(1 - p).
      loss = np.sum(-y_train*np.log(clipped) - (1 - y_train)*np.log(1 - clipped))

      print(i)
      print("Loss: ",loss)
      print("Accuracy: ",acc)

      if pre_loss is not None and abs(loss - pre_loss) <= tol:
        break
      pre_loss = loss

  def test(self, x_test):
    """Return a boolean array: True where the probability exceeds the threshold."""
    return self.sigmoid(x_test) > self.threshold

  def accuracy(self,y_test,y_est):
    """Return the fraction of entries where ``y_est`` equals ``y_test``."""
    return np.sum(y_est == y_test)/np.size(y_test)

In [None]:
# Standardise each split and separate the features from the label column.
# NOTE(review): normalization() is applied to each split separately, so the
# validation and test parts are scaled with their own statistics rather than
# with the training mean/std -- confirm this is intended.
x_train, y_train = label_design(normalization(train))
x_val, y_val = label_design(normalization(val))
x_test, y_test = label_design(normalization(test))
# Learning rate 1, decision threshold 0.5; one weight per feature column (11).
model = lr(1,0.5)
model.initialize_weight(11)
model.fit(x_train, y_train)


In [None]:
# Sweep the decision threshold on the validation split and record accuracy.
# The thresholds are computed once and reused for the model, the x axis and
# the report, so the plotted positions match the thresholds actually evaluated.
thresholds = 0.4 + np.arange(100)/500
accs = []
for threshold in thresholds:
  model.threshold = threshold
  y_est = model.test(x_val)
  accs.append(np.sum(y_est == y_val)/np.size(y_val))
plt.plot(thresholds, accs)
plt.ylabel('accuracy')
plt.xlabel('threshold')
plt.show()

index_max = np.argmax(accs, axis=0)
print("Threshold: ", thresholds[index_max])
print("Accuracy: ",np.max(accs))

In [None]:
# Report the test accuracy at the default threshold (0.5) and then at 0.45.
for chosen_threshold in (0.5, 0.45):
  model.threshold = chosen_threshold
  y_result = model.test(x_test)
  print("Test Accuracy: ",model.threshold,model.accuracy(y_test,y_result))

SVM - Linear

In [28]:
class svm:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    There is no separate bias term; the decision score is ``x . w``.
    Predictions use the -1/+1 encoding.
    """

    def __init__(self):
        self.w = None  # weight vector, created in fit()

    @staticmethod
    def _signed(labels):
        """Map labels onto -1/+1: positive values become +1, everything else -1.

        This accepts both 0/1 and already-signed -1/+1 labels, so callers that
        pre-convert their labels are not converted a second time.
        """
        return np.where(np.asarray(labels) > 0, 1.0, -1.0)

    def fit(self, x_train, y_train, x_val, y_val, learning_rate=1e-5, epoch=30, regularization_=0.1):
        """Train the weights and return the validation accuracy after each epoch.

        Parameters
        ----------
        x_train, x_val : numpy.ndarray, shape (n_samples, n_features)
        y_train, y_val : numpy.ndarray
            Labels in either 0/1 or -1/+1 encoding.
        learning_rate : float
            Step size of each per-sample update.
        epoch : int
            Number of passes over the training data.
        regularization_ : float
            Weight of the L2 penalty in the update.

        Returns
        -------
        list of float
            Validation accuracy in percent, one entry per completed epoch
            (empty when ``epoch`` is 0).
        """
        val_hist = []
        y_ = self._signed(y_train)
        y_val_ = self._signed(y_val)
        self.w = np.zeros(x_train.shape[1])

        for epoch_idx in range(epoch):
            # Accuracy before this epoch's updates.
            print(epoch_idx,"Train Accuracy: ",self.accuracy(y_,self.test(x_train)))
            for sample, target in zip(x_train, y_):
                penalty = 2 * regularization_ * self.w
                if target * np.dot(sample, self.w) >= 1:
                    # Margin satisfied: only the regularisation term contributes.
                    self.w = self.w - learning_rate * penalty
                else:
                    # Margin violated: add the hinge-loss sub-gradient.
                    self.w = self.w - learning_rate * (penalty - sample * target)
            val_hist.append(self.accuracy(y_val_,self.test(x_val)))
        return val_hist

    def test(self, x_test):
        """Return -1.0/+1.0 predictions for every row of ``x_test``.

        A score of exactly zero is assigned to +1 so that the output always
        stays inside the label set.
        """
        scores = np.dot(x_test, self.w)
        return np.where(scores >= 0, 1.0, -1.0)

    def accuracy(self,y_test,y_est):
        """Return the percentage of entries where ``y_est`` equals ``y_test``."""
        return 100*np.sum(y_est == y_test)/np.size(y_test)

In [None]:
# Rebuild the standardised splits for the linear SVM.
x_train, y_train = label_design(normalization(train))
x_val, y_val = label_design(normalization(val))
x_test, y_test = label_design(normalization(test))
model = svm()
# (y - 0.5) * 2 maps 0 -> -1 and 1 -> +1 before the labels are handed to fit()
# (assumes the label column is 0/1 -- TODO confirm).
hist = model.fit(x_train, (y_train-0.5)*2, x_val, (y_val-0.5)*2, learning_rate=1e-6, epoch=200, regularization_=0.01)

In [None]:
# Validation accuracy recorded after each training epoch (x axis is 1-based).
plt.plot(np.arange(1,len(hist)+1),hist)
plt.xlabel('epoch')
plt.ylabel('validation accuracy (%)')
plt.show()

# hist[k] is the accuracy after k+1 completed epochs, so the number of epochs
# that produced the best score is argmax + 1. Using the bare 0-based index as
# an epoch count would train one epoch too few (and zero epochs when the best
# entry is the first one).
epoch_max = int(np.argmax(hist, axis=0)) + 1
print("epoch: ", epoch_max)
print("Accuracy: ",np.max(hist))

In [None]:
# Retrain a fresh model for `epoch_max` epochs. The test split is passed in the
# validation slot of fit(), so the last history entry is the test accuracy
# after the final epoch.
model_fin = svm()
hist_fin = model_fin.fit(x_train, (y_train-0.5)*2, x_test, (y_test-0.5)*2, learning_rate=1e-6, epoch=epoch_max, regularization_=0.01)
print("Test Accuracy: ",hist_fin[-1])

SVM with sklearn

In [None]:
from sklearn import linear_model, svm, discriminant_analysis, metrics
from scipy import optimize
# Rebuild the standardised splits for the scikit-learn model.
x_train, y_train = label_design(normalization(train))
x_val, y_val = label_design(normalization(val))
x_test, y_test = label_design(normalization(test))

# NOTE(review): `svm` here is the scikit-learn module imported at the top of
# this cell; that import rebinds the name of the hand-written `svm` class, so
# earlier cells that call `svm()` need the class cell re-run first.
model = svm.SVC(kernel='rbf', C=10, gamma=1.5, shrinking=False)
# The trailing semicolon suppresses the estimator repr in the cell output.
model.fit(x_train, y_train);

In [None]:
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
x_train, y_train = label_design(normalization(train))
x_val, y_val = label_design(normalization(val))
x_test, y_test = label_design(normalization(test))
# Approximate an RBF kernel with an increasing number of random features and
# report the validation accuracy of a linear SGD classifier on top of each.
for i in np.arange(100, 1001, 100):
  rbf_feature = RBFSampler(gamma=0.1, n_components=i, random_state=1)
  X_features = rbf_feature.fit_transform(x_train)
  clf = SGDClassifier(max_iter=1000)
  clf.fit(X_features, y_train)
  # Reuse the feature map fitted on the training data; evaluation data must
  # only be transformed, never used to re-fit the map.
  xt_features = rbf_feature.transform(x_val)
  y_est = clf.predict(xt_features)
  accuracy = np.sum(y_est == y_val)/np.size(y_val)
  print(i," :",accuracy)
#print(accuracy)

In [None]:
# Evaluate the last fitted (feature map, classifier) pair on the test split.
# The feature map is only applied here, not re-fitted on the test data.
xt_features = rbf_feature.transform(x_test)
y_est = clf.predict(xt_features)
accuracy = np.sum(y_est == y_test)/np.size(y_test)
print(accuracy)

Decision Tree

In [None]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : sequence of lists of rows
        The row groups produced by a split; the class label is the last
        element of each row.
    classes : iterable
        The class values to score.

    Returns
    -------
    float
        Sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by the
        group's share of all rows.
    """
    total_rows = float(sum(len(members) for members in groups))
    weighted_impurity = 0.0

    for members in groups:
        member_count = float(len(members))
        # An empty group contributes nothing (and would divide by zero).
        if member_count == 0:
            continue

        labels = [row[-1] for row in members]
        purity = 0.0
        for class_value in classes:
            share = labels.count(class_value) / member_count
            purity += share**2

        weighted_impurity += (1.0 - purity) * (member_count/total_rows)

    return weighted_impurity

def feature_split(index, threshold, dataset):
    """Partition ``dataset`` on one feature.

    Returns a pair ``(below, above)``: rows whose value at ``index`` is
    strictly less than ``threshold``, and all remaining rows (greater than or
    equal to it). Row order is preserved inside each part.
    """
    lower_rows = [row for row in dataset if row[index] < threshold]
    upper_rows = [row for row in dataset if not row[index] < threshold]
    return lower_rows, upper_rows

def find_split(dataset):
    """Find the (feature, threshold) pair with the lowest Gini impurity.

    Every distinct value of every feature column (all columns except the last,
    which is the label) is tried as a threshold. Ties keep the first candidate
    found, in column order and then row order.

    Parameters
    ----------
    dataset : list of rows
        Non-empty list; feature values are assumed to be hashable (numbers).

    Returns
    -------
    dict
        ``{'index': column, 'value': threshold, 'groups': (below, above)}``.
    """
    classes = list(set(row[-1] for row in dataset))

    best_col, best_value, best_score, best_groups = 999999, 999999, 999999, None
    for col in range(len(dataset[0])-1):
        # A repeated value yields the same groups and the same score, which can
        # never beat the strict "<" test below, so each value is scored once.
        tried = set()
        for row in dataset:
            candidate = row[col]
            if candidate in tried:
                continue
            tried.add(candidate)

            groups = feature_split(col, candidate, dataset)
            gini = gini_index(groups, classes)

            if gini < best_score:
                best_col, best_value, best_score, best_groups = col, candidate, gini, groups

    return {'index':best_col, 'value':best_value, 'groups':best_groups}

def terminal(subset):
    """Return the most frequent label (last element of each row) in ``subset``."""
    outcomes = []
    for row in subset:
        outcomes.append(row[-1])
    distinct = set(outcomes)
    return max(distinct, key = outcomes.count)

def child_split(node, maxDepth, minSize, depth):
    """Recursively grow the subtree below ``node`` in place.

    ``node`` must carry a ``'groups'`` entry holding its ``(below, above)``
    row groups; that entry is removed and replaced with ``'below'`` and
    ``'above'`` children, each either a nested node dict or a terminal label.

    Parameters
    ----------
    node : dict
        Node produced by ``find_split``.
    maxDepth : int
        Depth at which both children are forced to be terminals.
    minSize : int
        A group with at most this many rows becomes a terminal.
    depth : int
        Depth of ``node`` (the root is 1).
    """
    below, above = node['groups']
    del(node['groups'])

    # No real split: one side is empty, so both children share one label.
    if not below or not above:
        node['below'] = node['above'] = terminal(above + below)
        return

    # Depth limit reached: stop here. Without this return the function would
    # fall through and keep splitting, making maxDepth ineffective.
    if depth >= maxDepth:
        node['below'], node['above'] = terminal(below), terminal(above)
        return

    if len(below) <= minSize:
        node['below'] = terminal(below)
    else:
        node['below'] = find_split(below)
        child_split(node['below'], maxDepth, minSize, depth+1)

    if len(above) <= minSize:
        node['above'] = terminal(above)
    else:
        node['above'] = find_split(above)
        child_split(node['above'], maxDepth, minSize, depth+1)

def decision_Tree(train, maxDepth, minSize):
    """Build a decision tree from ``train`` and return its root node.

    The root split is found first and then expanded recursively, starting at
    depth 1.

    Note: Random Forest might be implemented in the final because of bias and
    overfitting possibilities.
    """
    tree_root = find_split(train)
    child_split(tree_root, maxDepth, minSize, 1)
    return tree_root

def treeShow(node, depth = 0):
    """Print the tree rooted at ``node``, indenting one space per level.

    Internal nodes print as ``[X<feature number> < <threshold>]`` with a
    1-based feature number; terminals print as ``[<label>]``.
    """
    indent = depth * ' '

    if not isinstance(node, dict):
        print('%s[%s]' % (indent, node))
        return

    print('%s[X%d < %.3f]' % (indent, node['index'] + 1, node['value']))
    for branch in ('below', 'above'):
        treeShow(node[branch], depth + 1)

def treePredict(node, row):
    """Return the label the tree rooted at ``node`` assigns to ``row``.

    At each node the ``'below'`` branch is followed when the row's value at
    the node's feature index is strictly less than the node's threshold,
    otherwise the ``'above'`` branch; descent stops at the first non-dict child.
    """
    current = node
    while True:
        if row[current['index']] < current['value']:
            child = current['below']
        else:
            child = current['above']

        if not isinstance(child, dict):
            return child
        current = child

def main_tree(train, test, max_depth, min_size):
    """Fit a decision tree on ``train`` and return its predictions for ``test``.

    Returns a list with one predicted label per row of ``test``, in order.
    """
    fitted_tree = decision_Tree(train, max_depth, min_size)
    return [treePredict(fitted_tree, sample) for sample in test]

def crossSplitter(data, numFold):
    """Randomly deal the rows of ``data`` into ``numFold`` equally sized folds.

    Each fold gets ``int(len(data) / numFold)`` rows drawn without replacement
    using ``randrange``; leftover rows are not assigned to any fold. ``data``
    itself is not modified.
    """
    remaining = list(data)
    rows_per_fold = int(len(data) / numFold)

    folds = []
    for _ in range(numFold):
        current_fold = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ]
        folds.append(current_fold)
    return folds

def accuracyCalc(ground, prediction):
    """Return the percentage of positions where ``prediction`` matches ``ground``."""
    matches = sum(
        1 for position in range(len(ground))
        if ground[position] == prediction[position]
    )
    return 100.0 * matches / float(len(ground))

def accuracyChecker(dataset, classifier, numFolds, *args):
    """Score ``classifier`` with k-fold cross-validation.

    For each fold, the remaining folds are flattened into the training set and
    the fold's rows are copied with their label (last element) replaced by
    ``None`` to form the test set. ``classifier(train, test, *args)`` must
    return one prediction per test row.

    Returns a list with the accuracy (in percent) obtained on each fold.
    """
    folds = crossSplitter(dataset, numFolds)
    scores = []

    for held_out in folds:
        other_folds = list(folds)
        other_folds.remove(held_out)
        training_rows = [row for fold in other_folds for row in fold]

        # Hide the labels from the classifier by blanking them on copies.
        blinded_rows = []
        for row in held_out:
            blinded = list(row)
            blinded[-1] = None
            blinded_rows.append(blinded)

        predicted = classifier(training_rows, blinded_rows, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracyCalc(actual, predicted))

    return scores


seed(5)

# Read the CSV relative to the current working directory. No os.chdir() to a
# machine-specific absolute path is performed, so the cell does not depend on
# one particular computer's folder layout.
dataset = pd.read_csv("cardio_train.csv", header= 0, sep= ";")

# NOTE(review): the raw file is used here, so every column before the last one
# (including the first column) is offered to the tree as a feature -- confirm
# that this is intended.
dataset = dataset.to_numpy()

dataset = dataset[:1000, :] # We will focus on efficiency of the algorithm to make it faster
# print(np.shape(dataset))

dataset = dataset.tolist()

numFolds = 5
maxDepth = 5
minSize = 10

accuracyValues = accuracyChecker(dataset, main_tree, numFolds, maxDepth, minSize)
print('accuracyValues: %s' % accuracyValues)
print('Mean Accuracy: %.3f%%' % (sum(accuracyValues)/float(len(accuracyValues))))

SVM - Kernel (Not in use)

In [None]:
import numpy as np 
import cvxopt
import cvxopt.solvers

class SVM():
  """RBF-kernel SVM solved in the dual with cvxopt's quadratic-programming solver.

  The only constraints placed on the dual variables are ``alpha >= 0`` and
  ``sum(alpha * y) == 0``; no upper bound on ``alpha`` is imposed.
  ``polyconst`` and ``degree`` are stored but not used by the RBF kernel.
  Predictions use the -1/+1 encoding.
  """

  def __init__(self,polyconst=1,gamma=10,degree=2):
    self.polyconst = float(polyconst)
    self.gamma = float(gamma)      # RBF width: k(a, b) = exp(-gamma * ||a - b||^2)
    self.degree = degree
    self._support_vectors = None   # training rows with alpha > 1e-5
    self._alphas = None            # dual variables of the support vectors
    self.intercept = None
    self._n_support = None
    self._support_labels = None    # -1/+1 labels of the support vectors
    self._indices = None           # positions of the support vectors in the training data

  def _rbf(self, A, B):
    """Return the RBF kernel matrix between the rows of ``A`` and of ``B``."""
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, computed for all pairs at once.
    sq_dist = (np.sum(A**2, axis=1)[:, None]
               + np.sum(B**2, axis=1)[None, :]
               - 2.0*np.dot(A, B.T))
    # Rounding can leave tiny negative values; a squared distance is >= 0.
    np.maximum(sq_dist, 0.0, out=sq_dist)
    return np.exp(-1.0*self.gamma*sq_dist)

  def transform(self,X):
    """Return the (n, n) RBF kernel matrix of ``X`` with itself."""
    return self._rbf(X, X)

  def fit(self,data,labels):
    """Solve the dual problem and store the support vectors and intercept.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
    labels : numpy.ndarray, shape (n_samples,)
        Labels in 0/1 or -1/+1 encoding; positive values are treated as +1
        and everything else as -1.
    """
    num_data, num_features = data.shape
    labels = np.where(np.asarray(labels) > 0, 1.0, -1.0)
    K = self.transform(data)

    # cvxopt.solvers.qp(P, q, G, h, A, b) minimises 1/2 x'Px + q'x
    # subject to Gx <= h and Ax = b.
    solution = cvxopt.solvers.qp(
        cvxopt.matrix(np.outer(labels,labels)*K),          # P
        cvxopt.matrix(np.ones(num_data)*-1),               # q
        cvxopt.matrix(np.diag(np.ones(num_data) * -1)),    # G: -alpha <= 0
        cvxopt.matrix(np.zeros(num_data)),                 # h
        cvxopt.matrix(labels,(1,num_data)),                # A: sum(alpha*y) = 0
        cvxopt.matrix(0.0))                                # b
    alphas = np.ravel(solution['x'])

    is_sv = alphas>1e-5
    self._support_vectors = data[is_sv]
    self._n_support = np.sum(is_sv)
    self._alphas = alphas[is_sv]
    self._support_labels = labels[is_sv]
    self._indices = np.arange(num_data)[is_sv]

    # Intercept: mean over support vectors of y_i - sum_j alpha_j y_j K(x_i, x_j).
    if self._alphas.shape[0] == 0:
      self.intercept = 0.0
    else:
      sv_kernel = K[np.ix_(is_sv, is_sv)]
      decision = np.dot(sv_kernel, self._alphas*self._support_labels)
      self.intercept = float(np.mean(self._support_labels - decision))

  def signum(self,X):
    """Return +1 where ``X`` is positive and -1 elsewhere."""
    return np.where(X>0,1,-1)

  def project(self,X):
    """Return the decision score of every row of ``X``."""
    kernel = self._rbf(X, self._support_vectors)
    return np.dot(kernel, self._alphas*self._support_labels) + self.intercept

  def predict(self,X):
    """Return -1/+1 predictions for every row of ``X``."""
    return self.signum(self.project(X))

In [None]:
# Draw two small disjoint samples for the (slow) kernel SVM.
# splitter() returns three parts (train, val, test). With val=0.0 the middle
# part is empty, so the first part is the 1% sample and the last part is the
# remainder, which is then sampled again.
set1, _unused_val1, data1 = splitter(1, data, 0.01, 0.0)
set2, _unused_val2, data2 = splitter(1, data1, 0.01, 0.0)
x_train, y_train = label_design(normalization(set1))
x_test, y_test = label_design(normalization(set2))
model = SVM(gamma=3)
model.fit(x_train,y_train)

In [None]:
# Predict a label for every row of the held-out sample with the current model.
predictions = model.predict(x_test)

In [None]:
y_guess = model.predict(x_test)
# predict() returns -1/+1, so bring the reference labels onto the same
# encoding before comparing (positive -> +1, everything else -> -1); comparing
# against 0/1 labels directly could never match the negative class.
y_signed = np.where(y_test > 0, 1, -1)
result = np.sum(y_guess == y_signed)/np.size(y_test)
print(result)

Others (Not in use)

In [None]:
# Allocate a dense X-by-X matrix of zeros.
# NOTE(review): with NumPy's default float64 dtype this is
# 35000 * 35000 * 8 bytes (about 9.8 GB) -- confirm the runtime has that much memory.
X = 35000
K = np.zeros([X,X])

KNN

In [None]:
def Euclidean_distance(x1,x2):
  """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
  difference = x1 - x2
  squared = difference**2
  return math.sqrt(np.sum(squared))

In [None]:
def knn(x_train, y_train, test, k): # takes one test instance and returns one label
  """Classify one sample by majority vote among its k nearest training rows.

  Parameters
  ----------
  x_train : numpy.ndarray, shape (n_samples, n_features)
  y_train : numpy.ndarray, shape (n_samples,)
      Binary labels; a label counts as a positive vote when it is > 0, so both
      0/1 and -1/+1 encodings work.
  test : numpy.ndarray, shape (n_features,)
      The sample to classify.
  k : int
      Number of neighbours; clamped to the range [1, n_samples].

  Returns
  -------
  int
      1 when more than half of the k nearest neighbours are positive, else 0.
  """
  # TODO: we need to make this weighted.
  x_train = np.asarray(x_train, dtype=float)
  y_train = np.asarray(y_train)
  k = max(1, min(int(k), len(x_train)))

  # Euclidean distance to every training row (including the last one).
  distances = np.sqrt(np.sum((x_train - test)**2, axis=1))
  # Ascending order puts the closest rows first; take exactly k of them.
  nearest = np.argsort(distances, kind='stable')[:k]

  positive_votes = np.sum(y_train[nearest] > 0)
  label = 0
  if 2*positive_votes > k:
    label = 1
  return label

In [None]:
# Rebuild the standardised train/test splits for k-NN.
x_train, y_train = label_design(normalization(train))
x_test, y_test = label_design(normalization(test))
accurate = 0
trained_labels = []
# Classify the test rows one by one, with k = floor(sqrt(number of training labels)).
for i in range(x_test.shape[0]):
  trained_labels.append(knn(x_train, y_train, x_test[i], int(math.sqrt(len(y_train)))))
  if (trained_labels[i] == y_test[i]):
    accurate += 1
    # NOTE(review): this print is inside the `if`, so the running accuracy is
    # only shown after a correct prediction -- confirm that is intended.
    print(i, accurate/(i+1))

NameError: ignored