# Handlinng Numeric Data

In [5]:
import numpy as np

In [6]:
from sklearn import preprocessing

In [7]:
featur = np.array([[-500.5],
                 [-100.1],
                 [0],
                 [100.1],
                 [900.9]])

In [8]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

In [9]:
scaled_feature = minmax_scale.fit_transform(featur)

In [10]:
x=scaled_feature
x

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [11]:
scaler = preprocessing.StandardScaler()

In [12]:
standerdized = scaler.fit_transform(x)

In [13]:
standerdized

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])

In [14]:
round(standerdized.mean())

-0.0

In [15]:
standerdized.std()

1.0

In [17]:
robust_scalet = preprocessing.RobustScaler()

In [20]:
robust_scalet.fit_transform(x)

array([[-2.5],
       [-0.5],
       [ 0. ],
       [ 0.5],
       [ 4.5]])

In [21]:
import numpy as np
from sklearn.preprocessing import Normalizer

In [22]:
features = np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.4],[10.9,3.3]])

# Genrating polynomial and Interaction Feature

In [36]:
import numpy as np

In [37]:
from sklearn.preprocessing import PolynomialFeatures

In [38]:
features = np.array([[2,3],[2,3],[2,3]])

In [40]:
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)

In [42]:
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

# Transforming Feature

In [43]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

In [45]:
features = np.array([[2,3],[2,3],[2,3]])

In [49]:
def add_ten(x):
    return x+10

ten_transformer = FunctionTransformer(add_ten)

In [51]:
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [52]:
import pandas as pd

In [53]:
df = pd.DataFrame(features, columns=["feature_1","feature_2"])

In [54]:
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


# Detecting Outliers

In [55]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [58]:
features,_=make_blobs(n_samples=10,
                     n_features=2,
                     centers = 1,
                     random_state=1)

In [60]:
features[0,0] = 10000
features[0,1] = 10000

In [63]:
outlier_detector = EllipticEnvelope(contamination=.1)

In [65]:
outlier_detector.fit(features)

EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
                 store_precision=True, support_fraction=None)

In [66]:
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [67]:
import matplotlib.pyplot as plt

In [70]:
features.shape

(10, 2)

In [72]:
print(features)

[[ 1.00000000e+04  1.00000000e+04]
 [-2.76017908e+00  5.55121358e+00]
 [-1.61734616e+00  4.98930508e+00]
 [-5.25790464e-01  3.30659860e+00]
 [ 8.52518583e-02  3.64528297e+00]
 [-7.94152277e-01  2.10495117e+00]
 [-1.34052081e+00  4.15711949e+00]
 [-1.98197711e+00  4.02243551e+00]
 [-2.18773166e+00  3.33352125e+00]
 [-1.97451969e-01  2.34634916e+00]]


In [74]:
feature= features[:,0]

In [80]:
def indices_of_outliers(x):
    q1,q3=np.percentile(x,[25,75])
    iqr = q3-q1
    lower_bound = q1-(iqr*1.5)
    upper_bound = q3+(iqr*1.5)
    return np.where((x>upper_bound)|(x<lower_bound))

In [81]:
indices_of_outliers(feature)

(array([0], dtype=int64),)

# Handling outlier

In [82]:
houses = pd.DataFrame() 
houses['Price'] = [534433, 392333, 293222, 4322032] 
houses['Bathrooms'] = [2, 3.5, 2, 116] 
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

In [83]:
houses["outlier"]=np.where(houses["Bathrooms"]<20,0,1)

In [84]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [86]:
#transform the outlier
houses["Log_of_Square_Feet"]=[np.log(x) for x in houses["Square_Feet"]]

In [87]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


#  Discretizating Features 

In [88]:
from sklearn.preprocessing import Binarizer

In [89]:
age = np.array([[6],
               [12],
               [20],
               [36],
               [65]])

In [90]:
binarizer = Binarizer(18)

In [92]:
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [93]:
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

#  Grouping Observations Using Clustering

In [97]:
from sklearn.cluster import KMeans

In [99]:
features,_=make_blobs(n_samples=50,
                     n_features=2,
                     centers=3,
                     random_state=1)

In [100]:
dataframe = pd.DataFrame(features, columns=["feature_1","feature_2"])

In [106]:
clusterer = KMeans(3,random_state=0)

In [107]:
clusterer.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [117]:
dataframe["group"] = clusterer.predict(features)

In [118]:
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [119]:
dataframe

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,2
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


In [120]:
features = np.array([[1.1,11.1],
                    [2.2,22.2],
                    [3.3,33.3],
                    [4.4,44.4],
                    [np.nan,55]])

In [127]:
features[~np.isnan(feature).any(axis=0)]

array([[[ 1.1, 11.1],
        [ 2.2, 22.2],
        [ 3.3, 33.3],
        [ 4.4, 44.4],
        [ nan, 55. ]]])