### Description
- In this notebook I'm going to demonstrate how to design an abstracted model to fit data using Linear Gaussian Mixture and Logistic Regression models.
- I will note how different syntax or tools affect the speed of the code.
- I will demonstrate how to maximize training and testing accuracy.
- #### *Outlines*:
    - [Import Libraries](#import-libraries)


### Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import time
from tqdm import tqdm_notebook
from abc import ABC, abstractmethod

In [2]:
#DataFetcher
class DataFetcher:
    """
    Parse training and testing data from a specific directory.

    Training files are read from ``./<directory>/train<name><k>.csv`` with
    ``k`` starting at 1, testing files from ``./<directory>/test<name>.csv``,
    where ``<name>`` is ``data_name`` or ``labels_name``. All CSV files are
    read without a header row.

    para*:
        directory : str, folder that holds the CSV files
        data_name : str, file-name stem of the feature files
        labels_name : str, file-name stem of the label files

    raises:
        ValueError : if any of the three arguments is not a string
    """
    def __init__(self, directory, data_name, labels_name):
        # Assignments go through the validating property setters below.
        self.directory = directory
        self.data_name = data_name
        self.labels_name = labels_name

    @staticmethod
    def _require_str(value):
        """Return ``value`` unchanged, or raise ValueError if it is not a string."""
        if not isinstance(value, str):
            raise ValueError("Invalid Input: Input must be string!")
        return value

    #Validate parameters
    @property
    def directory(self):
        # The getter must return the stored value; without `return` the
        # property always evaluates to None.
        return self._directory

    @directory.setter
    def directory(self, string):
        self._directory = self._require_str(string)

    @property
    def data_name(self):
        return self._data_name

    @data_name.setter
    def data_name(self, string):
        self._data_name = self._require_str(string)

    @property
    def labels_name(self):
        return self._labels_name

    @labels_name.setter
    def labels_name(self, string):
        self._labels_name = self._require_str(string)

    #get directory
    def _get_training_data_path(self, subset_number):
        """Path of the feature file for the zero-based fold ``subset_number``."""
        return "./%s/train%s%d.csv" % (self._directory, self._data_name, subset_number + 1)

    def _get_training_labels_path(self, subset_number):
        """Path of the label file for the zero-based fold ``subset_number``."""
        return "./%s/train%s%d.csv" % (self._directory, self._labels_name, subset_number + 1)

    def get_all_training_data(self, num_subsets=None):
        """
        Read every training fold from disk.

        para*:
            num_subsets : number of folds to read; when omitted, the
                          module-level ``SUBSETS`` constant is used
        returns:
            (list of feature DataFrames, list of label DataFrames), one
            entry per fold, in fold order
        """
        if num_subsets is None:
            num_subsets = SUBSETS

        training_data_dfs = []
        training_labels_dfs = []

        for subset in range(num_subsets):
            data_path = self._get_training_data_path(subset)
            labels_path = self._get_training_labels_path(subset)

            training_data_dfs.append(pd.read_csv(data_path, header=None))
            training_labels_dfs.append(pd.read_csv(labels_path, header=None))

        return training_data_dfs, training_labels_dfs

    def get_all_testing_data(self):
        """
        Read the single testing split from disk.

        returns:
            (feature DataFrame, label DataFrame)
        """
        data_path = "./%s/test%s.csv" % (self._directory, self._data_name)
        labels_path = "./%s/test%s.csv" % (self._directory, self._labels_name)

        testing_data_df = pd.read_csv(data_path, header=None)
        testing_labels_df = pd.read_csv(labels_path, header=None)

        return testing_data_df, testing_labels_df
        
    


In [3]:
# class CrossValidation:
#     """
#     para*:
#         data_dfs : list of folds*dataframe, each data frame is batch of m exambles
#         labels_df : Dataframe contains sigle col which is labels
#     """
#     def __init__(self, data_dfs, labels_dfs):
#         assert(len(data_dfs) == len(labels_dfs))
#         self.data_dfs = data_dfs
#         self.labels_dfs = labels_dfs
#         self.num_subsets = len(data_dfs)        

#     def _split_dfs(self, dfs, subset):
#         validation_df = dfs[subset]
#         training_dfs = dfs[:subset] + dfs[subset+1:]

#         training_df = pd.concat(training, ignore_index=True)

#         return training_df, validation_df

#     def _get_cross_validation(self, subset):
#         training_x_set, validation_x_set =  _split_dfs(self.data_dfs, subset)
#         training_y_set, validation_y_set = _split_dfs(self.labels_dfs, subset)
         
#         return training_x_set, training_y_set, validation_x_set, validation_y_set
    
#     def get_training_validation(self, subset):
#         training_x, training_y, validation_x, validation_y = self._get_cross_validation(subset)

#         return training_x, training_y, validation_x, validation_y
    
#     def get_training(self):
#         training_x_set = pd.concat(self.data_dfs, ignore_index=True)
#         training_y_set = pd.concat(self.labels_dfs, ignore_index=True)
#         return training_x_set, training_y_set


In [4]:
#Optimized version of Cross validation
class CrossValidation:
    """
    Build training / validation splits from data that is already divided
    into folds.

    para*:
        data_dfs : list of DataFrames, one per fold, each a batch of m examples
        labels_dfs : list of DataFrames, one per fold, holding the labels of
                     the matching entry in data_dfs

    raises:
        ValueError : if the two lists do not contain the same number of folds
    """
    def __init__(self, data_dfs, labels_dfs):
        # Folds are paired by position, so both lists must have equal length.
        if len(data_dfs) != len(labels_dfs):
            raise ValueError(
                "data_dfs and labels_dfs must have the same number of folds "
                "(got %d and %d)" % (len(data_dfs), len(labels_dfs))
            )
        self.X = data_dfs
        self.Y = labels_dfs
        self.num_subsets = len(data_dfs)

    #split data
    def _split_training_validation(self, dfs, subset):
        """Return [all folds but ``subset`` concatenated, fold ``subset``]."""
        # A negative index would select a validation fold through Python's
        # wrap-around and still leave that fold inside the training slices.
        if not 0 <= subset < len(dfs):
            raise IndexError("subset index %r is out of range" % (subset,))
        validation_df = dfs[subset]
        training_df = pd.concat(dfs[:subset] + dfs[subset + 1:], ignore_index=True)
        return [training_df, validation_df]

    def _get_x_training_validation(self, subset):
        """Split the feature folds around ``subset``."""
        return self._split_training_validation(self.X, subset)

    def _get_y_training_validation(self, subset):
        """Split the label folds around ``subset``."""
        return self._split_training_validation(self.Y, subset)

    def get_training_validation(self, subset):
        """
        Hold out fold ``subset`` for validation and train on the rest.

        returns:
            [training_x, training_y, validation_x, validation_y]
        """
        [training_x, validation_x] = self._get_x_training_validation(subset)
        [training_y, validation_y] = self._get_y_training_validation(subset)
        return [training_x, training_y, validation_x, validation_y]

    def get_training(self):
        """
        Concatenate every fold into one training set.

        returns:
            [training_x_set, training_y_set]
        """
        training_x_set = pd.concat(self.X, ignore_index=True)
        training_y_set = pd.concat(self.Y, ignore_index=True)
        return [training_x_set, training_y_set]

In [5]:
class Model(ABC):
    """
    Abstract base for the classifiers in this notebook.

    Keeps the training data on the instance and offers a row-wise
    prediction helper; concrete subclasses supply ``predict``.
    """
    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    @abstractmethod
    def predict(self, x):
        """Return the predicted label for one example ``x``."""

    def predict_df(self, x_df):
        """
        Call ``predict`` once per row of ``x_df``.

        ``raw=True`` hands each row to ``predict`` as a NumPy array rather
        than a pandas Series.
        """
        return x_df.apply(self.predict, axis=1, raw=True)

In [6]:
# def get_accuracy(true_labels, predicted_labels):
#     assert(len(true_labels) == len(predicted_labels))
#     return sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat ) / len(true_labels)

def get_accuracy(true_labels, predicted_labels):
    """
    Return the fraction of predictions that equal the true label.

    Both inputs are converted to flat NumPy arrays first, so that
    - plain Python lists are compared element by element (``list == list``
      would otherwise yield a single bool), and
    - a column vector of shape (N, 1) compared with a vector of shape (N,)
      does not broadcast into an N x N matrix.

    para*:
        true_labels : array-like of ground-truth labels
        predicted_labels : array-like of predicted labels, same length

    raises:
        ValueError : if the inputs are empty or differ in length
    """
    truth = np.asarray(true_labels).ravel()
    guess = np.asarray(predicted_labels).ravel()
    if truth.size != guess.size:
        raise ValueError(
            "true_labels and predicted_labels must have the same length "
            "(got %d and %d)" % (truth.size, guess.size)
        )
    if truth.size == 0:
        raise ValueError("Cannot compute accuracy of an empty label set")
    return np.mean(truth == guess)

In [7]:
# Folder and file-name stems handed to DataFetcher.
directory = 'knn-dataset'
name_x = 'Data'
name_y = 'Labels'
# Number of training subset files; DataFetcher.get_all_training_data reads
# this name as a module-level global.
SUBSETS = 10
# The two class labels handed to the classifiers below.
POSITIVE_LABEL = 5
NEGATIVE_LABEL = 6

# Read the testing split and the list of training subsets from disk.
DF = DataFetcher(directory, name_x, name_y)
testing_X, testing_Y = DF.get_all_testing_data()
training_X, training_Y = DF.get_all_training_data()

# Concatenate all training subsets into one feature frame and one label frame.
CV = CrossValidation(training_X, training_Y)
tr_x, tr_y = CV.get_training()


In [48]:
# Labels as a NumPy array.
tr_y_np = tr_y.to_numpy()
# Element-wise comparison: True where the label equals 5.
tr_y_np_tf = tr_y_np == 5
# The same test written as a Python list of 1 / 0 values.
tr_y_np_tf2 = [1 if y_i == 5 else 0 for y_i in tr_y_np]

# Insert a constant column of ones at position 0 of the feature matrix.
trx_np = np.insert(tr_x.to_numpy(), 0, 1, axis=1)
trx_np

array([[ 1,  0,  0, ...,  5,  0,  0],
       [ 1,  0,  0, ...,  3,  0, 16],
       [ 1,  9, 16, ..., 16, 16,  3],
       ...,
       [ 1,  7,  0, ...,  0,  2,  4],
       [ 1,  0,  0, ...,  0,  3,  3],
       [ 1,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [8]:
class GaussianMixture(Model):
    """
    Two-class generative classifier: one Gaussian per class with a shared
    covariance matrix, which yields a linear decision function
    ``sigmoid(w . x + w0)``.

    para*:
        training_x : DataFrame (or 2-D array-like), one example per row
        training_y : DataFrame / Series / array-like with one label per row
        POSITIVE_LABEL : label treated as class 1
        NEGATIVE_LABEL : label treated as class 2

    Fitted attributes: pi, mu1, mu2, N1, N2, cov, cov_inv, w, w0.

    raises:
        ValueError : if either class has no training examples
        numpy.linalg.LinAlgError : if the pooled covariance is singular
    """
    def __init__(self, training_x, training_y, POSITIVE_LABEL, NEGATIVE_LABEL):
        super().__init__(training_x, training_y)
        self.POSITIVE_LABEL = POSITIVE_LABEL
        self.NEGATIVE_LABEL = NEGATIVE_LABEL

        # Use the labels passed to the constructor (not a notebook global) and
        # flatten them so a single-column frame becomes a 1-D vector.
        labels = np.asarray(training_y).ravel()
        features = np.asarray(training_x, dtype=float)

        positive_x = features[labels == POSITIVE_LABEL]
        negative_x = features[labels == NEGATIVE_LABEL]

        # Class sizes are row counts. `len(np.where(...))` would instead give
        # the length of the index tuple, not the number of matching rows.
        N1 = len(positive_x)    #number of points for class 1
        N2 = len(negative_x)    #number of points for class 2
        N = N1 + N2             #number of points for both classes
        if N1 == 0 or N2 == 0:
            raise ValueError("Both classes need at least one training example")

        # Prior of the positive class among the examples the model is fitted on.
        pi = N1 / N

        mu1 = positive_x.mean(axis=0)   #mean over rows -> one value per feature
        mu2 = negative_x.mean(axis=0)

        positive_x_dists = positive_x - mu1
        negative_x_dists = negative_x - mu2

        # Per-class scatter matrices, then the size-weighted shared covariance.
        s1 = positive_x_dists.T.dot(positive_x_dists) / N1
        s2 = negative_x_dists.T.dot(negative_x_dists) / N2

        cov = ((N1 / N) * s1) + ((N2 / N) * s2)
        cov_inv = np.linalg.inv(cov)

        w = cov_inv.dot(mu1 - mu2)
        w0 = (
            -0.5 * mu1.dot(cov_inv).dot(mu1)
            + 0.5 * mu2.dot(cov_inv).dot(mu2)
            + np.log(pi / (1 - pi))
        )

        self.pi = pi
        self.mu1 = mu1
        self.mu2 = mu2
        self.N1 = N1
        self.N2 = N2
        self.cov = cov
        self.cov_inv = cov_inv
        self.w = w
        self.w0 = w0

    def predict_prob(self, x):
        """Return the modelled probability that ``x`` is the positive class."""
        logits = self.w.dot(x) + self.w0
        prob = 1 / (1 + np.exp(-logits))
        return prob

    def predict(self, x):
        """Return POSITIVE_LABEL if the probability exceeds 0.5, else NEGATIVE_LABEL."""
        prob = self.predict_prob(x)
        if prob > 0.5:
            return self.POSITIVE_LABEL
        else:
            return self.NEGATIVE_LABEL



### Training and testing

In [32]:
# Create hypothesis
# Create hypothesis
# t1 / t2 bracket only the constructor call, so the reported time is the
# time spent fitting the model.
t1 = time.time()
GMM = GaussianMixture(tr_x, tr_y, POSITIVE_LABEL, NEGATIVE_LABEL)
t2 = time.time()

# Training Accuracy
# Labels are flattened to 1-D before being compared with the predictions.
predicted_train_y = GMM.predict_df(tr_x)
train_accuracy = get_accuracy(tr_y.values.flatten(), predicted_train_y.to_numpy())

# Test Accuracy
predicted_test_y = GMM.predict_df(testing_X)
test_accuracy = get_accuracy(testing_Y.values.flatten(), predicted_test_y.values.flatten())

print("Train Accuracy: %f" % train_accuracy)
print("Test Accuracy: %f" % test_accuracy)
print("Training Time: " + str(t2-t1))

Train Accuracy: 0.884000
Test Accuracy: 0.890909
Training Time: 0.0031538009643554688


## Logistic Regression Model

In [64]:
class logisticRegression(Model):
    """
    L2-regularised binary logistic regression fitted with Newton's method.

    para*:
        training_x : DataFrame (or 2-D array-like), one example per row
        training_y : DataFrame / Series / array-like with one label per row
        PositiveLabel : label mapped to target 1
        NegativeLabel : label returned when the probability is <= 0.5
        lmbda : L2 penalty strength (applied to every weight, including the
                intercept weight when one is trained)
        max_iter : maximum number of Newton updates
        train_inercept : when True, a constant 1 feature is prepended so an
                         intercept is learned (parameter name kept as-is for
                         backward compatibility)
        threshold : stop once the Euclidean norm of a weight update drops
                    below this value

    raises:
        numpy.linalg.LinAlgError : if the regularised Hessian is singular
    """
    def __init__(self, training_x, training_y, PositiveLabel, NegativeLabel, lmbda, max_iter=10, train_inercept=True, threshold=0.01):
        super().__init__(training_x, training_y)

        self.positive_label = PositiveLabel
        self.negative_label = NegativeLabel
        self.lmbda = lmbda
        self.max_iter = max_iter
        self.threshold = threshold
        # One correctly spelt attribute for internal use; the misspelt one is
        # kept so existing code reading it keeps working.
        self.train_intercept = train_inercept
        self.train_inercept = train_inercept

        self.X = np.asarray(training_x, dtype=float)
        # Flat 0/1 targets: a (N, 1) label column would broadcast against the
        # (N,) probability vector into an N x N matrix.
        self.Y = (np.asarray(training_y).ravel() == self.positive_label).astype(float)

        if self.train_intercept:
            # The bias column must be ones, matching predict_prob.
            self.X = np.insert(self.X, 0, 1, axis=1)

        self.w = np.zeros(self.X.shape[1])

        self._train(self.X, self.w, self.Y)

    def _sigmoid(self, odds):
        """Logistic function, written via tanh so large |odds| cannot overflow exp()."""
        return 0.5 * (1.0 + np.tanh(0.5 * odds))

    def _gradient(self, x, w, y):
        """Gradient of the penalised negative log-likelihood at ``w``."""
        probabilities = self._sigmoid(x.dot(w))
        return x.T.dot(probabilities - y) + self.lmbda * w

    def _inv_prob(self, prob):
        """Return the complementary probability ``1 - prob``."""
        return 1 - prob

    def _R(self, x, w):
        """
        Return the per-example weights ``p * (1 - p)`` as a 1-D vector.

        This is the diagonal of the usual weighting matrix; keeping only the
        diagonal avoids allocating a dense N x N array.
        """
        probabilities = self._sigmoid(x.dot(w))
        return probabilities * self._inv_prob(probabilities)

    def _hessian(self, x, w):
        """Hessian of the penalised negative log-likelihood at ``w``."""
        r = self._R(x, w)
        # (x.T * r) scales column i of x.T by r[i], i.e. x.T @ diag(r).
        H = (x.T * r).dot(x)
        return H + self.lmbda * np.identity(len(H))

    def _train(self, x, w, y):
        """Run Newton updates from ``w`` and store the final weights in ``self.w``."""
        for _ in range(self.max_iter):
            gradient = self._gradient(x, w, y)
            hessian = self._hessian(x, w)

            # Solve H * step = g instead of forming the explicit inverse.
            step = np.linalg.solve(hessian, gradient)
            w = w - step

            if np.linalg.norm(step) < self.threshold:
                break

        self.w = w

    def predict_prob(self, x):
        """Return the modelled probability that the single example ``x`` is positive."""
        if self.train_intercept:
            x = np.insert(x, 0, 1)
        odds = self.w.dot(x)
        return self._sigmoid(odds)

    def predict(self, x):
        """Return positive_label if the probability exceeds 0.5, else negative_label."""
        prob = self.predict_prob(x)
        if prob > 0.5:
            return self.positive_label
        else:
            return self.negative_label


# PascalCase alias for the conventional class name; both names refer to the
# same class.
LogisticRegression = logisticRegression





In [None]:
LAMBDAS = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000]

def perform_logreg_CV(lambdas=None):
    """
    Cross-validate logistic regression over a grid of L2 penalties.

    For every penalty value, one model is trained per fold of the
    module-level ``CV`` object and scored on the held-out fold.

    para*:
        lambdas : iterable of penalty values; defaults to ``LAMBDAS``
    returns:
        list with the mean validation accuracy for each penalty, in the
        order the penalties were given
    """
    if lambdas is None:
        lambdas = LAMBDAS

    average_accuracies = []
    for lmbda in tqdm_notebook(lambdas):
        # Perform CV
        accuracies = []
        # The fold count comes from the CV object itself.
        for j in range(CV.num_subsets):
            train_X, train_y, validation_X, validation_y = CV.get_training_validation(j)

            # Create hypothesis
            model = logisticRegression(train_X, train_y, POSITIVE_LABEL, NEGATIVE_LABEL, lmbda)
            predicted_validation_y = model.predict_df(validation_X)

            # Get accuracy
            accuracy = get_accuracy(validation_y.values.flatten(), predicted_validation_y.values.flatten())
            accuracies.append(accuracy)

        average_accuracies.append(np.mean(accuracies))

    return average_accuracies

#### Problem-1: Validate class parameters to meet some limitation
- We will use ```@property``` decorator; a pythonic way to use getters and setters in object-oriented programming.
- Look at [Validate Class Example](#validate-class-example).
- #### References:
    - [Here]()
    - A good [tutorial](https://www.toptal.com/python/python-class-attributes-an-overly-thorough-guide) to class attributes

### Validate Class Example

In [11]:
"""
In this example i'm gonna demonstrate the pythonic OOP way to validate instance atrributes
"""
class DataSetterTest:
    """
    Attribute validation with explicit ``set_*`` / ``get_*`` methods.

    para*:
        name : str
        number : int or float (bool is rejected)

    raises:
        ValueError : from the constructor or a setter when a value has the
                     wrong type
    """
    def __init__(self, name, number):
        self._name = self._number = None
        #set attributes use Setter method
        self.set_name(name)
        self.set_number(number)

    def set_name(self, name):
        """Store ``name``; raise ValueError unless it is a string."""
        if not isinstance(name, str):
            raise ValueError("Invalid input , it must be a string")
        self._name = name

    def set_number(self, val):
        """Store ``val``; raise ValueError unless it is an int or a float."""
        # Raise, as set_name does, rather than printing and carrying on with
        # the old value. bool is excluded explicitly because it subclasses int.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            raise ValueError("Invalid input , it must be a number")
        self._number = val

    #get attributes use Getter method
    def get_name(self):
        """Return the stored name, or a placeholder message if it is unset."""
        if self._name is None:
            return 'Attribute has not been set'
        return self._name

    def get_number(self):
        """Return the stored number, or a placeholder message if it is unset."""
        if self._number is None:
            return 'Attribute has not been set'
        return self._number
    

In [12]:
ins_ds = DataSetterTest('Mohamed', 1)
ins_ds.__dict__

{'_name': 'Mohamed', '_number': 1}

In [13]:
ins_ds.set_number(7)

In [14]:
ins_ds.__dict__

{'_name': 'Mohamed', '_number': 7}

In [15]:
"""
In this example i'm gonna demonstrate the pythonic OOP way to validate instance atrributes
[Use toturial](https://www.datacamp.com/community/tutorials/property-getters-setters)
"""
class DataSetterTest:
    """
    Attribute validation with ``@property`` getters and setters.

    para*:
        name : str
        number : int or float (bool is rejected)

    raises:
        ValueError : when a value of the wrong type is assigned, or when a
                     stored value is read while it is None
    """
    def __init__(self, name, number):
        # Plain assignment is routed through the validating setters below.
        self.name = name
        self.number = number

    @property
    def name(self):
        if self._name is None:
            raise ValueError("Invalid state: name has not been set")
        return self._name

    @name.setter
    def name(self, str_val):
        if not isinstance(str_val, str):
            raise ValueError("Invalid input , it must be a string!")
        self._name = str_val

    @property
    def number(self):
        if self._number is None:
            raise ValueError("Invalid state: number has not been set")
        return self._number

    @number.setter
    def number(self, val):
        # bool is excluded explicitly because it subclasses int.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            raise ValueError("Invalid input , it must be a Number!")
        self._number = val


In [16]:
instance_DS = DataSetterTest("Mohamed", 11)
instance_DS.__dict__

{'_name': 'Mohamed', '_number': 11}

In [17]:
instance_DS.__dict__

{'_name': 'Mohamed', '_number': 11}

In [18]:

class Example:
    """Holds two operands; ``Add`` returns their sum without storing it."""

    def __init__(self, nr1, nr2):
        self.a, self.b = nr1, nr2

    def Add(self):
        # The sum is only returned, so the instance __dict__ is left unchanged.
        return self.a + self.b

In [19]:
ins_ex = Example(1, 2)
x = ins_ex.Add()

In [20]:
ins_ex.__dict__

{'a': 1, 'b': 2}

In [21]:
class Example:
    """Holds two operands; ``Add`` stores their sum on the instance as ``c``."""

    def __init__(self, nr1, nr2):
        self.a, self.b = nr1, nr2

    def Add(self):
        # Assigning to self.c adds a new entry to the instance __dict__.
        total = self.a + self.b
        self.c = total
        return total

In [22]:
ins_ex = Example(1, 2)
x = ins_ex.Add()
ins_ex.__dict__

{'a': 1, 'b': 2, 'c': 3}

### Class attributes vs instance attributes using both mutable and immutable objects

In [23]:
class ClassVar:
    """
    Demonstrates class attributes versus instance attributes, each with a
    mutable and an immutable value.
    """

    global_immutable_class_variable = 45
    global_mutable_class_variable = [1, 2, 3]

    def __init__(self, mutable_parameter, immutable_parameter):
        self.mutable_instance_attribute = mutable_parameter
        self.immutable_instance_attribute = immutable_parameter

    def set_property(self, new_val):
        """
        Rebind / mutate the class attributes and update the instance.

        Class attributes are not in scope as bare names inside a method, so
        they are addressed through the class.
        """
        ClassVar.global_immutable_class_variable = new_val
        ClassVar.global_mutable_class_variable.append(new_val)
        # Plain assignment creates a new instance attribute.
        self.immutable_instance_var = 0
        # The instance's list is stored as `mutable_instance_attribute`.
        self.mutable_instance_attribute.append(new_val)

    def get_property(self):
        """Print the class and instance attributes."""
        try:
            print("Out of the Excption")
            # Bare class-attribute names are not visible from a method body;
            # the lookup raises NameError unless a module-level name with the
            # same spelling exists.
            print("global immutable class variable ", global_immutable_class_variable)
            print("global mutable class variable ", global_mutable_class_variable)
        except NameError:
            print("From Excption")
            print("global immutable class variable ", ClassVar.global_immutable_class_variable)
            print("global mutable class variable ", ClassVar.global_mutable_class_variable)
        print("immutable instance attribute ", self.immutable_instance_attribute)
        print("mutable instance attribute ", self.mutable_instance_attribute)

In [24]:
instance1 = ClassVar([10, 20, 30], 100)

In [25]:
instance1.get_property()

Out of the Excption
From Excption
global immutable class variable  45
global mutable class variable  [1, 2, 3]
immutable instance attribute  100
mutable instance attribute  [10, 20, 30]


In [26]:
instance1.__dict__

{'mutable_instance_attribute': [10, 20, 30],
 'immutable_instance_attribute': 100}

In [27]:
ClassVar.__dict__

mappingproxy({'__module__': '__main__',
              'global_immutable_class_variable': 45,
              'global_mutable_class_variable': [1, 2, 3],
              '__init__': <function __main__.ClassVar.__init__(self, mutable_parameter, immutable_parameter)>,
              'set_property': <function __main__.ClassVar.set_property(self, new_val)>,
              'get_property': <function __main__.ClassVar.get_property(self)>,
              '__dict__': <attribute '__dict__' of 'ClassVar' objects>,
              '__weakref__': <attribute '__weakref__' of 'ClassVar' objects>,
              '__doc__': None})

In [28]:
instance1.global_mutable_class_variable.append(40)

In [29]:
instance1.global_mutable_class_variable

[1, 2, 3, 40]

### Problem-2
Performance of Numpy Array vs Python List
```Python
y_list = tr_y.values.flatten().tolist()
sum([1 if y == 5 else 0 for y in y_list]) / len(y_list)
```
is faster than 
```
arr = tr_y.to_numpy()
sum([1 for i in arr if i == 5 ])/arr.__len__()
```



This happens because NumPy has to wrap each returned element in a Python object (e.g. numpy.float64 or numpy.int64 in this case), which takes time when you iterate item by item. Further proof of this appears when iterating: the element IDs alternate between two separate values. This means that Python's memory allocator and garbage collector are working overtime to create new objects and then free them.

A list doesn't have this memory allocator/garbage collector overhead. The objects in the list already exist as python objects (and they'll still exist after iteration), so neither plays any role in the iteration over a list. 

[Stackoverflow reference](https://stackoverflow.com/questions/35232406/why-is-a-for-over-a-python-list-faster-than-over-a-numpy-array)

After some investigation 

Using NumPy like that is like dragging your car behind you by hand as you walk to the store - you're not actually using the power of the tools at your disposal.
```
np.sum(tr_y.to_numpy()==5)/len(tr_y)
```


In [30]:
%timeit np.sum(tr_y.to_numpy()==5)/len(tr_y)

15.5 µs ± 230 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [31]:
%timeit sum([1 for i in tr_y.to_numpy().tolist() if i == 5 ])/arr.__len__()

NameError: name 'arr' is not defined

In [445]:
%timeit sum([1 for i in tr_y.values.flatten().tolist() if i == 5 ])/arr.__len__()

60.7 µs ± 1.97 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [446]:
%timeit sum([1 for i in tr_y.to_numpy() if i == 5 ])/arr.__len__()

1.03 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Check my [Question](https://stackoverflow.com/questions/66313883/performance-of-numpy-array-vs-python-list-over-1d-matrixvector?noredirect=1#comment117239143_66313883) on stackoverflow

### How I get the indices of a class using NumPy
- Method using python 
```
positive_indices = [index for index, i in enumerate(y_list) if i == POSITIVE_LABEL]
```

In [475]:
%%timeit
y_list = tr_y.values.flatten().tolist()
positive_indices = [index for index, i in enumerate(y_list) if i == POSITIVE_LABEL]

89.2 µs ± 4.73 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [476]:
%%timeit
positive_indices_np = np.where(tr_y.to_numpy()==POSITIVE_LABEL)

17.2 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [481]:
pos_df = tr_x.iloc[positive_indices_np[0]]

In [482]:
pos_df.__len__()

500

500

In [510]:
mu = (pos_df.to_numpy()).mean(axis=0)
pos_ndarray = pos_df.to_numpy()
pos_ndarray - mu

array([[-4.846, -4.902,  0.72 , ..., -3.198, -5.002, 11.056],
       [-4.846, -4.902, -8.28 , ..., -6.198, -5.002, -4.944],
       [ 9.154,  4.098,  7.72 , ...,  1.802,  9.998,  7.056],
       ...,
       [-4.846, -4.902, -8.28 , ..., -6.198, -5.002, -4.944],
       [-4.846, -4.902, -8.28 , ..., -6.198, -5.002, -4.944],
       [-4.846, -4.902, -8.28 , ..., -6.198, -5.002, -4.944]])

In [493]:
tr_y

Int64Index([0], dtype='int64')

In [509]:
mu = (pos_df.to_numpy()).mean(axis=0)
mu.shape

(64,)

In [545]:
tr_y_2 = np.copy(tr_y.to_numpy())
tr_y_2[:100] = 5

In [549]:
np.sum(tr_y.to_numpy() == tr_y_2)

948