### Introduction
Machine Learning - Software that learns from examples rather than from explicitly programmed rules

* Supervised Machine Learning - Training with features & labels

* Unsupervised Machine Learning - Finding relationships within the data. This doesn't rely on labeled data

### Supervised Machine Learning
* Regression - Output is continuous in nature. Examples: predicting house prices, predicting sales, cardekho.com
* Classification - Output is categorical or discrete. Examples: whether a mail is spam or not, whether a tumor is cancerous or not

### Preprocessing Data

In [1]:
import numpy as np 
from sklearn import preprocessing 

In [2]:
# Toy feature matrix used throughout the preprocessing examples:
# 4 samples (rows) by 3 features (columns).
input_data = np.array(
    [
        [5.1, -2.9, 3.3],
        [-1.2, 7.8, -6.1],
        [3.9, 0.4, 2.1],
        [7.3, -9.9, -4.5],
    ]
)

In [3]:
# Echo the feature matrix to confirm its contents (4 rows x 3 columns).
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

### Binarization
Convert numeric values into boolean (0/1) values by comparing each one against a threshold

### Mean removal
### Scaling
### Normalization 

In [14]:
# Binarize the matrix: entries strictly greater than the threshold (5.1)
# become 1.0 and everything else becomes 0.0, so the 5.1 in the first row
# maps to 0.0.
d = preprocessing.Binarizer(threshold=5.1).transform(input_data)

In [15]:
# Second row of the raw data, for comparison with its binarized counterpart.
input_data[1]

array([-1.2,  7.8, -6.1])

In [16]:
# Binarized value at row 1, column 1 (raw value 7.8, which exceeds 5.1).
d[1,1]

1.0

In [17]:
# Full binarized matrix: only 7.8 and 7.3 exceed the 5.1 threshold.
d

array([[ 0.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  0.],
       [ 1.,  0.,  0.]])

### Mean removal
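Remove the mean from each feature (column) so that it is centred on zero. `preprocessing.scale` also rescales each feature to unit variance.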

In [18]:
# Standardize each feature (column): the result has zero mean and unit
# variance per column. A new array is returned; input_data is left untouched.
data_scaled = preprocessing.scale(input_data)

In [19]:
# Standardized values; each column now sums to (approximately) zero.
data_scaled

array([[ 0.42462551, -0.2748757 ,  1.13244172],
       [-1.59434861,  1.40579288, -1.18167831],
       [ 0.04005901,  0.24346134,  0.83702214],
       [ 1.12966409, -1.37437851, -0.78778554]])

In [20]:
# Confirm that scale() returned a copy and did not modify input_data.
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

In [22]:
# NOTE(review): axis=1 averages across each row (one value per sample), and
# those row means are not zero. preprocessing.scale centred each column, so
# the check that demonstrates mean removal is data_scaled.mean(axis=0).
data_scaled.mean(axis=1)

array([ 0.42739717, -0.45674468,  0.37351416, -0.34416666])

In [24]:
# Raw matrix again, as the reference for the scaling examples that follow.
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

### Scaling
* Min-max scaling - linearly rescale each feature so that its values fall within a chosen range

In [36]:
# Min-max scaler that maps each feature linearly onto the range [-5, -1].
min_max_scalar = preprocessing.MinMaxScaler(feature_range=(-5,-1))

In [37]:
# Learn each column's min and max, then rescale: the column minimum maps to
# -5 and the column maximum maps to -1.
min_max_scalar.fit_transform(input_data)

array([[-2.03529412, -3.4180791 , -1.        ],
       [-5.        , -1.        , -5.        ],
       [-2.6       , -2.67231638, -1.5106383 ],
       [-1.        , -5.        , -4.31914894]])

In [33]:
# Raw matrix for reference before the normalization examples.
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

### Normalization

In [38]:
# L1 normalization: scale each row so that its absolute values sum to 1.
preprocessing.normalize(input_data, norm='l1')

array([[ 0.45132743, -0.25663717,  0.2920354 ],
       [-0.0794702 ,  0.51655629, -0.40397351],
       [ 0.609375  ,  0.0625    ,  0.328125  ],
       [ 0.33640553, -0.4562212 , -0.20737327]])

In [39]:
# L2 normalization: scale each row to unit Euclidean length.
preprocessing.normalize(input_data, norm='l2')

array([[ 0.75765788, -0.43082507,  0.49024922],
       [-0.12030718,  0.78199664, -0.61156148],
       [ 0.87690281,  0.08993875,  0.47217844],
       [ 0.55734935, -0.75585734, -0.34357152]])

## Label Encoding 

In [40]:
import numpy as np

In [47]:
# Sample colour labels to encode. 'black' and 'red' each appear twice, so
# repeated labels can be seen mapping to the same integer code.
input_labels = [
    'green',
    'black',
    'yellow',
    'red',
    'black',
    'red',
    'white',
]

In [48]:
# Encoder that maps each distinct label to an integer code.
encoder = preprocessing.LabelEncoder()

In [49]:
# Fit on the labels and encode them in one step. Codes follow the sorted order
# of the distinct labels (black=0, green=1, red=2, white=3, yellow=4).
encoder.fit_transform(input_labels)

array([1, 0, 4, 2, 0, 2, 3], dtype=int64)

In [50]:
# Distinct labels learned during fitting; a label's position is its code.
encoder.classes_

array(['black', 'green', 'red', 'white', 'yellow'],
      dtype='<U6')

In [51]:
# Encode a list of labels using the already-fitted mapping.
encoder.transform(['black'])

array([0], dtype=int64)

In [None]:
# Original string labels, for comparison with the encoded output.
input_labels

### Logistic Regression Classifier

In [52]:
import numpy as np 
from sklearn import linear_model  
import matplotlib.pyplot as plt

In [53]:
# Twelve 2-D training points, listed three per class in label order.
X = np.array([
    [3.1, 7.2], [4, 6.7], [2.9, 8],      # class 0
    [5.1, 4.5], [6, 5], [5.6, 5],        # class 1
    [3.3, 0.4], [3.9, 0.9], [2.8, 1],    # class 2
    [0.5, 3.4], [1, 4], [0.6, 4.9],      # class 3
])
# Labels 0..3, each repeated for its three consecutive points.
y = np.repeat(np.arange(4), 3)

In [55]:
# Logistic regression classifier created with scikit-learn's default settings.
lg = linear_model.LogisticRegression()

In [57]:
# Train on the 12 toy points and their class labels.
lg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
# Classify a new point; it lies close to the class-2 training points
# ([3.3, 0.4], [3.9, 0.9], [2.8, 1]).
lg.predict([[3.4, 0.7]])

array([2])

In [65]:
from sklearn.datasets import load_iris

In [66]:
# Load the iris dataset via scikit-learn's dataset loader.
iris = load_iris()

In [68]:
# Feature matrix (four measurements per sample).
# NOTE(review): this prints the entire array; a slice such as iris.data[:5]
# would keep the output short.
iris.data

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

In [76]:
# One label per sample: 150 in total.
iris.target.shape

(150,)

In [72]:
# Fresh classifier for the iris data (rebinds lg, replacing the model that was
# trained on the toy points).
lg = linear_model.LogisticRegression()

In [77]:
# Train on the first 145 samples, leaving the last 5 out of training.
# NOTE(review): the labels are ordered by class, so the 5 held-out samples
# all belong to class 2 -- not a representative test split.
lg.fit(iris.data[:145], iris.target[:145])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
# Predict the class of a single four-feature sample.
# NOTE(review): presumably one of the held-out rows -- confirm against iris.data.
lg.predict([[ 5.9,  3. ,  5.1,  1.8]])

array([2])

In [79]:
# All 150 labels; they are sorted by class (0s, then 1s, then 2s).
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
import 