### Describtion
- In this Notebook i'm going to demonstrate how to design an abstracted model to fit data using Linear Gaussian Mixture and Logistic Regression Models.
- I will note how different systax or tools affect the speed of code.
- I will demonstrate how to maximize training and testing accuracy.
- #### *Outlines*:
    - [Import Libraries](#import-libraries)


### Import Libraries

In [4]:
import numpy as np 
import pandas as pd 
import time
from tqdm import tqdm_notebook
from abc import ABC, abstractmethod

In [25]:
#DataFetcher
class DataFetcher:
    """
    Parse training and testing data from specific dirctory
    """
    def __init__(self, directory, data_name, labels_name):
        self.directory = directory
        self.data_name = data_name
        self.labels_name = labels_name

    #Validate paramters 
    @property
    def directory(self):
        self._directory
    
    @directory.setter
    def directory(self, string):
        if type(string) != str:
            raise ValueError("Invalid Input: Input must be string!")
        else:
            self._directory = string

    @property
    def data_name(self):
        self._data_name
    
    @data_name.setter
    def data_name(self, string):
        if type(string) != str:
            raise ValueError("Invalid Input: Input must be string!")
        else:
            self._data_name = string

    
    @property
    def labels_name(self):
        self._labels_name
    
    @labels_name.setter
    def labels_name(self, string):
        if type(string) != str:
            raise ValueError("Invalid Input: Input must be string!")
        else:
            self._labels_name = string
            
    #get directory
    def _get_training_data_path(self, subset_number):
        return "./%s/train%s%d.csv" %(self._directory, self._data_name, subset_number+1)
    
    def _get_training_labels_path(self, subset_number):
        return "./%s/train%s%d.csv" %(self._directory, self._labels_name, subset_number+1)
 
    
    def get_all_training_data(self):
        training_data_dfs = []
        training_labels_dfs = []

        for subset in range(SUBSETS):
            data_path = self._get_training_data_path(subset)
            labels_path = self._get_training_labels_path(subset)

            training_data_dfs.append(pd.read_csv(data_path, header=None))
            training_labels_dfs.append(pd.read_csv(labels_path, header=None))

        return training_data_dfs, training_labels_dfs
    
    def get_all_testing_data(self):

        data_path = "./%s/test%s.csv" %(self._directory, self._data_name)
        labels_path = "./%s/test%s.csv" %(self._directory, self._labels_name)

        testing_data_dfs = pd.read_csv(data_path, header=None)
        testing_labels_dfs = pd.read_csv(labels_path, header=None)

        return testing_data_dfs, testing_labels_dfs
        
    


In [26]:
directory = 'knn-dataset'
name_x = 'Data'
name_y = 'Labels'
SUBSETS = 10
DF = DataFetcher(directory, name_x, name_y)

In [27]:
testing_X, testing_Y = DF.get_all_testing_data()

In [52]:
class CrossValidation:
    """
    para*:
        data_dfs : list of folds*dataframe, each data frame is batch of m exambles
        labels_df : Dataframe contains sigle col which is labels
    """
    def __init__(self, data_dfs, labels_dfs):
        self.data_dfs = data_dfs
        self.labels_dfs = labels_dfs
        self.num_subsets = len(data_dfs)        

    
    def _get_cross_validation(self, subset):
        validation_x_set = self.data_dfs[subset]
        validation_y_set = self.labels_dfs[subset]

        training_x_set = self.data_dfs[:subset]+self.data_dfs[subset+1:]
        training_y_set = self.labels_dfs[:subset]+self.data_dfs[subset+1:]
        return training_x_set, training_y_set, validation_x_set, validation_y_set
    
    def get_training_validation(self, subset):
        training_x, training_y, validation_x, validation_y = self._get_cross_validation(subset)

        training_x = pd.concat(training_x, ignore_index=True)
        training_y = pd.concat(training_y, ignore_index=True)

        return [training_x, training_y, validation_x, validation_y]
    
    def get_training(self):
        training_x_set = pd.concat(self.data_dfs[:], ignore_index=True)
        training_y_set = pd.concat(self.labels_dfs[:], ignore_index=True)
        return [training_x_set, training_y_set]


In [62]:
#Optimized version of Cross validation
class CrossValidation_V2:
    """
    para*:
        data_dfs : list of folds*dataframe, each data frame is batch of m exambles
        labels_df : Dataframe contains sigle col which is labels
    """
    def __init__(self, data_dfs, labels_dfs):
        self.X = data_dfs
        self.Y = labels_dfs
        self.num_subsets = len(data_dfs)        

    #split data
    def _split_training_validation(self, dfs, subset):
        validation_df = dfs[subset]
        training_dfs = pd.concat(dfs[:subset] + dfs[subset+1:], ignore_index=True)
        return [training_dfs, validation_df] 
    
    def _get_x_training_validation(self, subset):
        [training_X, validation_X] = self._split_training_validation(self.X, subset)
        return [training_X, validation_X]

    def _get_y_training_validation(self, subset):
        [training_Y, validation_Y] = self._split_training_validation(self.Y, subset)
        return [training_Y, validation_Y]

    
    def get_training_validation(self, subset):
        [training_x, validation_x] = self._get_x_training_validation(subset)
        [training_y, validation_y] = self._get_y_training_validation(subset)
        return [training_x, training_y, validation_x, validation_y]
        

        return [training_x, training_y, validation_x, validation_y]
    
    def get_training(self):
        training_x_set = pd.concat(self.data_dfs[:], ignore_index=True)
        training_y_set = pd.concat(self.labels_dfs[:], ignore_index=True)
        return [training_x_set, training_y_set]

In [63]:
training_X, training_Y = DF.get_all_training_data()

In [65]:
CV = CrossValidation_V2(training_X, training_Y)

In [66]:
tr_x, tr_y, tst_x, tst_y = CV.get_training_validation(3)

In [71]:
class Model(ABC):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
    
    @abstractmethod
    def predict(self, x):
        pass
    
    def predict_df(self, x_df):
        predictios = x_df.apply(lambda row: self.predict(row), raw=True, axis=1)

In [72]:
def get_accuracy(true_labels, predicted_labels):
    assert(len(true_labels) == len(predicted_labels))
    return sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat ) / len(true_labels)

In [None]:
class GaussianMixture(Model):
    def __init__(self, training_x, training_y, POSITIVE_LABEL, NEGATIVE_LABEL):
        super().__init__(training_x, training_y)
        self.POSITIVE_LABEL = POSITIVE_LABEL
        self.NEGATIVE_LABEL = NEGATIVE_LABEL

        #pi, mu_1, mu_2, s1, s2, cov, cov_inv, w0, w1
        y_list = training_Y.values.flatten().tolist()
        pi = sum([1 if  i == self.POSITIVE_LABEL  else 0 for i in y_list 0])/len(y_list)

        #N1(number of points for class 1)
        #N2(number of points for class 2)
        #N(number of points for both classes)
        positive_indices = [index for index, i in enumerate(y_list) if i == self.POSITIVE_LABEL]
        negative_indices = [index for index, i in enumerate(y_list) if i == self.NEGATIVE_LABEL]

        N1 = len(positive_indices)
        N2 = len(negative_indices)
        N = N1 + N2

        positive_x_df = training_x.iloc(positive_indices)
        negative_x_df = training_x.iloc(negative_indices)

        mu1 = positive_x_df.mean()
        mu2 = negative_x_df.mean()

        positive_x = positive_x_df.to_numpy()
        negative_x = negative_x_df.to_numpy()

        positive_x_dists = positive_x - mu1
        negative_x_dists = negative_x - mu2

        s1 = positive_x_dists.T.dot(positive_x_dists)/N1
        s1 = negative_x_dists.T.dot(negative_x_dists)/N2

        cov = ((N1/N) * s1) + ((N2/N) * s2)
        cov_inv = np.linalg.inv(cov)

        w = cov_inv.dot(mu1 - mu2)
        w0 =  -(1/2) * mu1.T.dot(cov_inv).dot(mu1) + (1/2) * mu2.T.dot(cov_inv).dot(mu2) + np.log(pi/ (1-pi))

        self.pi = pi 
        self.mu1 = mu1
        self.mu2 = mu2 
        self.N1 = N1
        self.N2 = N2
        self.cov = cov
        self.cov_inv = cov_inv 
        self.w = w 
        self.w0 = w0 

    def predict_prob(self, x):
        logits = self.w.dot(x) + self.w0
        prob = 1 / 1 + np.exp(-logits)
        return prob
    
    def predict(self, x):
        prob = self.predict_prob(x)
        if prob > 0.5:
            return self.POSITIVE_LABEL
        else:
            return self.NEGATIVE_LABEL








In [186]:
tr_x, tr_y, tst_x, tst_y = CV.get_training_validation(3)
c1_x = [index for index, i in enumerate(tr_y.to_numpy()) if i == 5]
c2_x = [index for index, i in enumerate(tr_y.to_numpy()) if i == 6]
pd.iloc(positive_indices)

In [117]:
%%timeit 
arr = tr_y.to_numpy()
sum([1 for i in arr if i == 5 ])/arr.__len__()


871 µs ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [114]:
%%timeit 
y_list = tr_y.values.flatten().tolist()
sum([1 if y == 5 else 0 for y in y_list]) / len(y_list)


102 µs ± 1.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


900

#### Problem-1: Validate class parameters to meet some limitation
- We will use ```@property``` decorator; a pythonic way to use getters and setters in object-oriented programming.
- Look at [Validate Class Example](#validate-class-example).
- #### References:
    - [Here]()
    - A good [tutorial](https://www.toptal.com/python/python-class-attributes-an-overly-thorough-guide) to class attributes

### Validate Class Example

In [112]:
"""
In this example i'm gonna demonstrate the pythonic OOP way to validate instance atrributes
"""
class DataSetterTest:
    def __init__(self, name, number):
        self._name = self._number = None
        #set attributes use Setter method
        self.set_name(name)
        self.set_number(number)
    
    def set_name(self, name):
        if not type(name) == str: 
            raise ValueError("Invalid input , it must be a string") 
        else:
            self._name = name

    def set_number(self, val):
        if not ((type(val) == int ) or (type(val) == float)):
            print("Invalid input , it must be a float'")
        else:
            self._number = val
        
            
    #get attributes use Setter method
    def get_name(self):
        if self._name is None: return ('Attribute has not been set')
        else: return self._name
    
    def get_number(self):
        if self._number is None: return ('Attribute has not been set')
        else: return self._number
    

In [121]:
ins_ds = DataSetterTest('Mohamed', 1)
ins_ds.__dict__

{'_name': 'Mohamed', '_number': 1}

In [122]:
ins_ds.set_number(7)

In [123]:
ins_ds.__dict__

{'_name': 'Mohamed', '_number': 7}

In [179]:
"""
In this example i'm gonna demonstrate the pythonic OOP way to validate instance atrributes
[Use toturial](https://www.datacamp.com/community/tutorials/property-getters-setters)
"""
class DataSetterTest:
    def __init__(self, name, number):
        self.name = name
        self.number = number
    
    @property
    def name(self):
        if self._name == None : raise ValueError("Valid input, Empety set")
        return self._name
    
    @name.setter
    def name(self, str_val):
        if not (type(str_val) == str):
            raise ValueError("Invalid input , it must be a string!")
        self._name = str_val
    
    @property
    def number(self):
        if self._number == None : raise ValueError("Valid input, Empety set")
        return self._number
    
    @number.setter
    def number(self, val):
        if not ((type(val) == int ) or (type(val) == float)):
            raise ValueError("Invalid input , it must be a Number!")
        self._number = val


In [181]:
instance_DS = DataSetterTest("Mohamed", 11)
instance_DS.__dict__

{'_name': 'Mohamed', '_number': 11}

In [177]:
instance_DS.__dict__

{'_name': 'Mohamed', '_number': 11}

In [29]:

class Example(object):
    def __init__(self, nr1, nr2):
        self.a = nr1
        self.b = nr2

    def Add(self):
        c = self.a + self.b
        return c

In [31]:
ins_ex = Example(1, 2)
x = ins_ex.Add()

In [33]:
ins_ex.__dict__

{'a': 1, 'b': 2}

In [36]:
class Example(object):
    def __init__(self, nr1, nr2):
        self.a = nr1
        self.b = nr2

    def Add(self):
        self.c = self.a + self.b
        return self.c

In [37]:
ins_ex = Example(1, 2)
x = ins_ex.Add()
ins_ex.__dict__

{'a': 1, 'b': 2, 'c': 3}

### Class attributes vs instance attributes using both mutable and immutable objects

In [10]:
class ClassVar:

    global_immutable_class_variable = 45
    global_mutable_class_variable = [1, 2, 3]


    def __init__(self, mutable_parameter, immutable_parameter):
        self.mutable_instance_attribute = mutable_parameter
        self.immutable_instance_attribute = immutable_parameter
    
    def set_property(self, new_val):
        global_immutable_class_variable = new_val
        global_mutable_class_variable.append(new_val)
        self.immutable_instance_var = 0
        self.mutable_instance_var.append(new_val)
    
    def get_property(self):
        try:
            print("Out of the Excption")
            print("global immutable class variable ", global_immutable_class_variable)
            print("global mutable class variable ", global_mutable_class_variable)
        except:
            print("From Excption")
            print("global immutable class variable ", ClassVar.global_immutable_class_variable)
            print("global mutable class variable ", ClassVar.global_mutable_class_variable)
        print("immutable instance attribute ", self.immutable_instance_attribute)
        print("mutable instance attribute ", self.mutable_instance_attribute)

In [11]:
instance1 = ClassVar([10, 20, 30], 100)

In [12]:
instance1.get_property()

From Excption
global immutable class variable  45
global mutable class variable  [1, 2, 3]
immutable instance attribute  100
mutable instance attribute  [10, 20, 30]


In [69]:
instance1.__dict__

{'mutable_instance_attribute': [10, 20, 30],
 'immutable_instance_attribute': 100}

In [70]:
ClassVar.__dict__

mappingproxy({'__module__': '__main__',
              'global_immutable_class_variable': 45,
              'global_mutable_class_variable': [1, 2, 3],
              '__init__': <function __main__.ClassVar.__init__(self, mutable_parameter, immutable_parameter)>,
              'set_property': <function __main__.ClassVar.set_property(self, new_val)>,
              'get_property': <function __main__.ClassVar.get_property(self)>,
              '__dict__': <attribute '__dict__' of 'ClassVar' objects>,
              '__weakref__': <attribute '__weakref__' of 'ClassVar' objects>,
              '__doc__': None})

In [15]:
instance1.global_mutable_class_variable.append(40)

In [16]:
instance1.global_mutable_class_variable

[1, 2, 3, 40, 40]

### Problem-2
Performance of Numpy Array vs Python List
```Python
y_list = tr_y.values.flatten().tolist()
sum([1 if y == 5 else 0 for y in y_list]) / len(y_list)
```
is faster than 
```
arr = tr_y.to_numpy()
sum([1 for i in arr if i == 5 ])/arr.__len__()
```

this happend because of that numpy has to wrap the returned object with a python type (e.g. numpy.float64 or numpy.int64 in this case) which takes time if you're iterating item-by-item1. Further proof of this is demonstrated when iterating -- We see that we're alternating between 2 separate IDs while iterating over the array. This means that python's memory allocator and garbage collector are working overtime to create new objects and then free them.

A list doesn't have this memory allocator/garbage collector overhead. The objects in the list already exist as python objects (and they'll still exist after iteration), so neither plays any role in the iteration over a list. 

[Stackoverflow reference](https://stackoverflow.com/questions/35232406/why-is-a-for-over-a-python-list-faster-than-over-a-numpy-array)


In [128]:
%%timeit
y_list = tr_y.values.flatten().tolist()

12.3 µs ± 193 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [129]:
%%timeit
arr = tr_y.to_numpy()

4.55 µs ± 92.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [131]:
y_list = tr_y.values.flatten().tolist()
arr = tr_y.to_numpy()

In [132]:
%%timeit
sum([1 if y == 5 else 0 for y in y_list]) / len(y_list)

54.9 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [133]:
%%timeit
sum([1 for i in arr if i == 5 ])/arr.__len__()

883 µs ± 48 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [134]:
%timeit a = [[2,3,5],[3,6,2],[1,3,2]]

131 ns ± 2.69 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [135]:
%timeit a = np.array([[2,3,5],[3,6,2],[1,3,2]])

3.89 µs ± 50.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [137]:
a = [[2,3,5],[3,6,2],[1,3,2]]
b = np.array([[2,3,5],[3,6,2],[1,3,2]])

In [140]:
%timeit [i for i in a]

234 ns ± 9.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [141]:
%timeit [i for i in b]

1.01 µs ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [156]:
a = np.arange(32)


In [157]:
b = list([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])


In [158]:
id(a)

2193455078096

In [160]:
for i in range(a.__len__()):
    print(id(a[i]))


2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462456
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960
2193439462960


In [162]:
id(b)

2193458703240

In [161]:
for i in range(b.__len__()):
    print(id(b[i]))

1672492672
1672492704
1672492736
1672492768
1672492800
1672492832
1672492864
1672492896
1672492928
1672492960
1672492992
1672493024
1672493056
1672493088
1672493120
1672493152
1672493184
1672493216
1672493248
1672493280
1672493312
1672493344
1672493376
1672493408
1672493440
1672493472
1672493504
1672493536
1672493568
1672493600
1672493632
1672493664


In [179]:
sys.getsizeof(b)

400

In [189]:
#mathimatical operations timing on list vs numpy
%timeit a-a

456 ns ± 10.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
