# Chapter 4: Handling Numerical Data
* scaling的方法：min-max、standard
## 4.1 rescaling a feature: rescale the values of a numerical feature to be between two values.（适合神经网络）

In [7]:
# 采用的是 min-max scaling方法
import numpy as np
from sklearn import preprocessing
#feature = np.array([-500.5,-100.1,0.0,100.1,500.5]) # 需要输入 2D-array
feature = np.array([
    [-500.5],
    [-100.1],
    [0.0],
    [100.1],
    [900.5]
])

# create scaler
minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))

# Scale feature
scaled_feature=minmax_scale.fit_transform(feature)
print(scaled_feature)
print(feature)

[[0.        ]
 [0.28579586]
 [0.35724483]
 [0.42869379]
 [1.        ]]
[[-500.5]
 [-100.1]
 [   0. ]
 [ 100.1]
 [ 900.5]]


## 4.2 standardizing a feature: transform a feature to have a mean of 0 and a standard deviation of 1.（适合主成分分析）
* 注意：当存在异常值时，建议 suing the median and quantile range(取代均值和标准差)

In [18]:
import numpy as np 
from sklearn import preprocessing
x=np.array((
[-1000.1],
[-200.1],
[500.5],
[600.6],
[1300.3]))

# create a scaler
scaler=preprocessing.StandardScaler()

# fit & transform
standardized=scaler.fit_transform(x)
standardized

array([[-1.58713949],
       [-0.56345921],
       [ 0.33302878],
       [ 0.46111678],
       [ 1.35645314]])

In [12]:
# create scaler
robust_scaler=preprocessing.RobustScaler()
# transform
robust_scaler.fit_transform(x)

array([[-1.87411015],
       [-0.87498439],
       [ 0.        ],
       [ 0.12501561],
       [ 0.99887598]])

## 4.3 Normalizing Observation: rescale the feature values of observations to have unit norm(a total length of 1)

In [17]:
import numpy as np
from sklearn.preprocessing import Normalizer
features=np.array([
    [-0.5,0.5],
    [1.5,20.2],
    [1.63,34.4],
    [10.9,3.3]
])

# create normalizer
normalizer=Normalizer(norm="l2") # "2"表示欧几里得范数中的次数是2

# transform
normalizer.transform(features) # 每一行的norm为1

array([[-0.5       ,  0.5       ],
       [ 0.06912442,  0.93087558],
       [ 0.04524008,  0.95475992],
       [ 0.76760563,  0.23239437]])

## 4.4 _Generating_ Polynomial and Interaction Features

In [21]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
features = np.array([[2, 3], [4, 5], [6, 7]])

# create PolynomialFeatures object
polynomial_interaction= PolynomialFeatures(degree=2,include_bias=False) # 2是多项式的最高次

# create polynomial features
polynomial_interaction.fit_transform(features) # 两个一次项，两个平方项以及一个交叉项

array([[ 2.,  3.,  4.,  6.,  9.],
       [ 4.,  5., 16., 20., 25.],
       [ 6.,  7., 36., 42., 49.]])

In [22]:
# create PolynomialFeatures object with only interaction features by setting "interaction_only" to True
interaction = PolynomialFeatures(degree=2, 
                                 interaction_only=True, 
                                 include_bias=False) 
interaction.fit_transform(features)

array([[ 2.,  3.,  6.],
       [ 4.,  5., 20.],
       [ 6.,  7., 42.]])

## 4.5 Transforming Features: make a custom transformation to one or more features

In [24]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
features = np.array([[2, 3], [4, 5], [6, 7]])

# Define a simple function
def add_ten(x):
    return x+10

# create transformer
ten_transformer=FunctionTransformer(add_ten)

ten_transformer.transform(features)

array([[12, 13],
       [14, 15],
       [16, 17]])

In [26]:
# we can use the same transfromation in pandas using apply
import pandas as pd
df=pd.DataFrame(features,columns=["feature1","feature2"])
# apply function
df.apply(add_ten)

Unnamed: 0,feature1,feature2
0,12,13
1,14,15
2,16,17


## 4.6 Detecting Outliers: identify extreme observations
* 方法一：EllipticEnvelope-> 此方法的一个局限性在于：需要实现设定一个 contamination(污染)参数，即观测值中outlier的比例
* 方法二：IQR-based detection

In [31]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# create simuylated data
features,_=make_blobs(n_samples=10,n_features=2,centers=1,random_state=1)

# replace the first observation's values with extreme values
features[0,0]=10000
features[1,0]=10000

# create detector
outlier_detector=EllipticEnvelope(contamination=.1)

# fit detector
outlier_detector.fit(features)

# predict outliers
print(features)
outlier_detector.predict(features) # 1表示 inlier，-1 表示outlier

[[ 1.00000000e+04  3.52863145e+00]
 [ 1.00000000e+04  5.55121358e+00]
 [-1.61734616e+00  4.98930508e+00]
 [-5.25790464e-01  3.30659860e+00]
 [ 8.52518583e-02  3.64528297e+00]
 [-7.94152277e-01  2.10495117e+00]
 [-1.34052081e+00  4.15711949e+00]
 [-1.98197711e+00  4.02243551e+00]
 [-2.18773166e+00  3.33352125e+00]
 [-1.97451969e-01  2.34634916e+00]]


array([ 1, -1,  1,  1,  1,  1,  1,  1,  1,  1])

In [36]:
# using interquartile range to identify extreme values
feature=features[:,0]

# create a function to return index of outliers
def indices_of_outliers(x):
    q1,q3=np.percentile(x,[25,75])
    iqr=q3-q1
    lower_bound=q1-iqr*1.5
    upper_bound=q3+iqr*1.5
    return np.where((x>upper_bound) | (x< lower_bound))

print(feature)
indices_of_outliers(feature)

[ 1.00000000e+04  1.00000000e+04 -1.61734616e+00 -5.25790464e-01
  8.52518583e-02 -7.94152277e-01 -1.34052081e+00 -1.98197711e+00
 -2.18773166e+00 -1.97451969e-01]


(array([0, 1]),)

## 4.7 Handling Outliers

In [40]:
import pandas as pd
houses = pd.DataFrame()
houses['Price']=[524433,392333,293222,4322032]
houses['Bathrooms']=[2,3.5,2,116]
houses['Square_Feet']=[1500,2500,1500,48000]

#Filter observations
houses[houses['Bathrooms']<20]

# 接下来可以 mark them as outliers and include it as a feature

import numpy as np
# create feature based on boolean condition
houses["outlier"]=np.where(houses['Bathrooms']<20,0,1)
houses

# 最后，we can transfrom the feature to dampen the effect of the outlier(取对数)
houses['log_of_square_feet']=[np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier,log_of_square_feet
0,524433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 4.8 Discretizating Features：have a numerical feature and want to break it up into discrete bins
* 方法一：binarize the feature according to some threshold(单一的threshold)
* 方法二：..................................multiple ........

* Tips: Discretization can be a fruitful strategy when we have reason to believe that a numerical feature should behave more like a categorical feature.

In [5]:
import numpy as np
from sklearn.preprocessing import Binarizer
age = np.array([[6], [12], [20], [36], [65]]) 
 
# create binarizer
binarizer=Binarizer(36)

#transform feature
binarizer.fit_transform(age)

array([[0],
       [0],
       [0],
       [0],
       [1]])

In [8]:
# break up numerical features according to multiple threshold:
np.digitize(age,bins=[20,30,64]) # 每一个数字代表bin的right edge（不包含在内）
# 也可以通过设置参数 “right=True”来包含右边界
np.digitize(age,bins=[20,30,64],right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

## 4.9 Grouping Observations Using Clustering: cluster observations so that similar observations are grouped together

In [9]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# make simulated feature matrix
features,_=make_blobs(n_samples=50,
                     n_features=2,
                     centers=3,
                     random_state=1)
# create dataframe
df=pd.DataFrame(features,columns=["feature1","feature2"])
# make k-means clusterer
clusterer=KMeans(3,random_state=0)
# fit clusterer
clusterer.fit(features)
# predict values
df['group']=clusterer.predict(features)
df

Unnamed: 0,feature1,feature2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,0
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


## 4.10 Deleting Observations with Missing Values
* 许多 machine learning algorithms 无法处理 missing values，所以需要在preprocessing阶段解决missing value的问题
* 有三种类型的missing values：missing completely at random、missing at random、missing not at random。如果是最后一种情况，那么deleting missing values 可能会引入bias

In [11]:
import numpy as np
features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])
# keep only observations that are not missing
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [12]:
# alternatively, we can drop missing observations using pandas
import pandas as pd
df=pd.DataFrame(features, columns=["feature1","feature2"])
# remove observations with missing values
df.dropna()


Unnamed: 0,feature1,feature2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## 4.10 imputing missing values: you have missing values in your data and want to fill in or predict their values
* 一般来说，有两种策略可以用于填补missing values，方法一是使用machine learning取predict，例如KNN；另一种就是简单地使用某个平均值

In [5]:
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

features,_=make_blobs(n_samples=1000,
                     n_features=2,
                     random_state=1)

# standardize the features
scaler=StandardScaler()
standardized_features=scaler.fit_transform(features)

# replace the first feature's first values with a missing value
true_value=standardized_features[0,0]
standardized_features[0,0]=np.nan

# predict the missing values in the feature matrix
features_knn_imputed=KNN(k=5,verbose=0).complete(standardized_features)

# compare true and imputed values
print('True value:',true_value)
print("imputed value",features_knn_imputed[0,0])

True value: 0.8730186113995938
imputed value 1.0955332713113226


In [6]:
# Alternatively, we can use scikit-learn's Imputer module to fill in missing values with the feature's mean, median, or most frequent value.
# However, we will typically get worse results than KNN:

from sklearn.preprocessing import Imputer
# create imputer
mean_imputer=Imputer(strategy='mean',axis=0)
# impute value
features_mean_imputed=mean_imputer.fit_transform(features)
# compare true and imputed values
print('True value:',true_value)
print("imputed value",features_mean_imputed[0,0])

True value: 0.8730186113995938
imputed value -3.058372724614996


# Chapter 5 Handling Categorical Data

## 5.1 Encoding Nominal Categorical Features: have a feature with nominal calssed that has no intrinsic ordering

In [10]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
feature = np.array([["Texas"],
                    ["California"],
                    ["Texas"],
                    ["Delaware"],
                    ["Texas"]])
# create one-hot encoder
one_hot=LabelBinarizer()
# one_hot encode feature
one_hot.fit_transform(feature) # 从结果来看，原理应该是将其影射在多维空间上，有多少种类，就有多少维度
# if we want to reverse the one-hot encoding, we can use "inverse_transform"
one_hot.inverse_transform(one_hot.fit_transform(feature))
# we can use the "classes_" method to output the classes
one_hot.classes_

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [11]:
# we can even use pandas to one-hot encode the feature
import pandas as pd
# create dummy variable from feature
pd.get_dummies(feature[:,0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [13]:
# One helpful ability of scikit-learn is to handle a situation where each observation lists multiple classes, 例如
multiclass_feature = [("Texas", "Florida"), 
                      ("California", "Alabama"), 
                      ("Texas", "Florida"),
                      ("Delware", "Florida"),
                      ("Texas", "Alabama")] 
# create multiclass one-hot encoder
one_hot_multiclass=MultiLabelBinarizer()
#one-hot encode multiclass feature
one_hot_multiclass.fit_transform(multiclass_feature)
# view classes
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

## 5.2 Encoding Ordinal Categorical Features

In [15]:
import pandas as pd
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})
# create mapper
scale_mapper = {"Low":1,
                "Medium":2,
                "High":3} # 注意，有时候数值选择很重要；可以选择小数
# replace feature values with scale
dataframe['Score'].replace(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

## 5.3 Encoding Dictionaries of Features: you have a dictionary and want to convert it into a feature matrix
* 需要额外关注 dictvectorizer

In [18]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]
# create dictionary vectorizer
dictvectorizer=DictVectorizer(sparse=False)
# convert dictionary to feature matirx
features=dictvectorizer.fit_transform(data_dict)
pd.DataFrame(features,columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [19]:
#We can get the names of each generated feature using the get_feature_names method:

feature_names = dictvectorizer.get_feature_names() 
# View feature names 
feature_names

['Blue', 'Red', 'Yellow']

## 5.4 Imputing Missing Class Values: you have a categorical feature missing value that you want to replace with predicted values

In [23]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Create feature matrix with categorical feature
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

# Create feature matrix with missing values in the categorical feature
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])
# train KNN learner
clf=KNeighborsClassifier(3,weights="distance")
trained_model=clf.fit(X[:,1:],X[:,0])
# predict missing values' class
imputed_values=trained_model.predict(X_with_nan[:,1:])
# join column of predicted class with their other features
X_with_imputed=np.hstack((imputed_values.reshape(-1,1),X_with_nan[:,1:]))
# join two feature matrices
np.vstack((X_with_imputed,X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [24]:
# An alternative solution is to fill in missing values with the feature’s most frequent value:

from sklearn.preprocessing import Imputer
# join the two feature matrices
X_complete=np.vstack((X_with_nan,X))
imputer=Imputer(strategy="most_frequent",axis=0)
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 Handling Imbalanced Classes: have a vector with highly imbalanced classes

In [26]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# load iris data
iris= load_iris()

# create feature matirx
features=iris.data
# create target vector
target=iris.target
# remove first 40 observations
features=features[40:,:]
target=target[40:]
# create binary target vector indicating if class 0
target=np.where((target==0),0,1)
# look at the imbalanced target vector
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [29]:
# Many algorithms in scikit-learn offer a parameter to weight classes during training to counteract the effect of their imbalance.

weights={0:0.9,1:0.1}
# create random forest classifier with weights
RandomForestClassifier(class_weight=weights)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [30]:
# Or you can pass balanced, which automatically creates weights inversely proportional to class frequencies: 

# train a random foprest with balanced class weights
RandomForestClassifier(class_weight="balanced")

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Chapter 6 Handling Text
## 6.1 Cleaning Text: complete some basic cleaning


In [32]:
text_data = ["   Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "    Today Is The night. By Jarek Prakash   "]
# strip whitespaces：去除文本前后的空格
strip_whitespaces=[string.strip() for string in text_data]
strip_whitespaces
# remove periods
remove_periods=[string.replace('.','') for string in strip_whitespaces]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [33]:
# we also create and apply a custom transformation function

def capitalizer(string:str)->str:
    return string.upper()
# apply function
[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [34]:
# finally, we can use regular expressions to make powerful string operations

import re
def replace_letters_with_X(string:str)->str:
    return re.sub(r"[a-zA-Z]",'X',string)
[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

## 6.2 Parsing and Cleaning HTML: you have text data with HTML elements and want to extract just the text
* use Beautiful Soup's extensive set of options to parse and extract from HTML

In [39]:
from bs4 import BeautifulSoup
# create some HTML code
html ="""
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>"
       """
# parse HTML
soup=BeautifulSoup(html,'lxml')
# find the div with the class "full_name", show text
soup.find("div",{"class":"full_name"}).text # 结果有一点偏差


'\n       Masego Azra'

## 6.3 Removing Punctuation
* 关注 str.translate

In [40]:
import unicodedata
import sys
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']
# create a dictionary of punctuation characters
punctuation=dict.fromkeys(i for i in range(sys.maxunicode) 
                         if unicodedata.category(chr(i)).startswith('P'))
# for each string, remove any pnctuatioin characters
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

## 6.4 Tokenizing Text: break text into individual words

In [2]:
from nltk.tokenize import word_tokenize
string = "The science of today is the technology of tomorrow"
# Tokenize words
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [1]:
# we can also tokenize into sentences
from nltk.tokenize import sent_tokenize
string = "The science of today is the technology of tomorrow. Tomorrow is today."
# tokenize sentences
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

## 6.5 Removing Stop Words: Given tokenized text data, you want to remove extremely common words(e,g,, a, is, of, on) that contain little informational value

In [3]:
from nltk.corpus import stopwords
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']
# load stop words
stop_words=stopwords.words('english') # ，没发现支持中文？？？
# remove stop words
[word for word in tokenized_words if word not in stop_words]

['going', 'go', 'store', 'park']

## 6.6 Stemming Words: you have tokenized words and want to convert them into their root froms

In [6]:
from nltk.stem.porter import PorterStemmer
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting'] 
# create stemmer
porter=PorterStemmer()
# allpy stemmer
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

## 6.7 Tagging parts of speech: you have text data and want to tag each word or character with its part of speech

In [7]:
from nltk import pos_tag
from nltk import word_tokenize
text_data="Chris loved outdoor running"
# use pre-trained part of speech tagger
text_tagged=pos_tag(word_tokenize(text_data))
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [8]:
# once the text has been tagged, we can use the tags to find certain parts of speech. For example, here are all nouns:

# filter words
[word for word,tag in text_tagged if tag in ['NN','NNS','NNP',"NNPS"]]

['Chris']

In [13]:
# A more realistic situation would be that we have data where every observation 
# contains a tweet and we want to convert those sentences into features for 
# individual parts of speech (e.g., a feature with 1 if a proper noun is present, and 0 otherwise):

from sklearn.preprocessing import MultiLabelBinarizer
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]
# create list
tagged_tweets=[]
# tag each word and each tweet
for tweet in tweets:
    tweet_tag=pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word,tag in tweet_tag])
# use one-hot encoding to convert the tags into features
one_hot_multi=MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [14]:
# Load library
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Get some text from the Brown Corpus, broken into sentences
sentences = brown.tagged_sents(categories='news')

# Split into 4000 sentences for training and 623 for testing
train = sentences[:4000]
test = sentences[4000:]

# Create backoff tagger
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show accuracy
trigram.evaluate(test)


0.8174734002697437

## 6.8 Encoding Text as a Bag of Words: you have text data and want to create a set of features indicating the number of times an observation's text contains a particular word

In [16]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
# create the bag of words feature matrix
count=CountVectorizer()
bag_of_words=count.fit_transform(text_data)
bag_of_words
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [17]:
# we  can use the "vocabulary_" method to view the word associated with each feature
count.get_feature_names()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [18]:
# Create feature matrix with arguments
count_2gram = CountVectorizer(ngram_range=(1,2),
                              stop_words="english",
                              vocabulary=['brazil'])
bag = count_2gram.fit_transform(text_data)

# View feature matrix
print(bag.toarray())
# View the 1-grams and 2-grams
print(count_2gram.vocabulary_)

[[2]
 [0]
 [0]]
{'brazil': 0}


## 6.9 Weighting Word Importance: you want a bag of word, but with words weighted by their importance to an observation.

In [21]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both']) 
tfidf=TfidfVectorizer()
feature_matrix=tfidf.fit_transform(text_data)
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [23]:
# show tf-idf feature matrix as dense matrix
feature_matrix.toarray()
# show feature names
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

# Capter 7 Handling Dates and Times
## 7.1 Converting Strings to Dates: given a vector of strings representing dates and times, you want to transform them into time series data

In [24]:
import numpy as np
import pandas as pd
date_strings = np.array(['03-04-2005 11:35 PM',
                         '23-05-2010 12:01 AM',
                         '04-09-2009 09:09 PM'])
# convert to datetime
[pd.to_datetime(date,format='%d-%m-%Y %I:%M %p') for date in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [25]:
# we might also want to add an argument to the  "errors"  parameter to handle problems

[pd.to_datetime(date,format='%d-%m-%Y %I:%M %p',errors='coerce') for date in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

## 7.2 Handling Time Zones: if not specified, pandas objects have no time zone. However,  we can add a time zone using "tz" during creation

In [26]:
import pandas as pd
pd.Timestamp('2017-05-01 06:00:00',tz='Europe/London')

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [27]:
# we can add a time zone to a previously created datetime using tz_localize:
date=pd.Timestamp('2017-05-01 06:00:00')
# set time zone
date_in_london=date.tz_localize('Europe/London')
date_in_london

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [29]:
#  we can also convert to a different time zone

date_in_london.tz_convert('Africa/Abidjan')

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [30]:
# Finally, pandas' Series objects can apply "tz_localize" and "tz_convert" to every element

dates=pd.Series(pd.date_range('2/2/2002',periods=3,freq='M'))
# set time zone
dates.dt.tz_localize('Africa/Abidjan')

0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [31]:
# show all time zones
from pytz import all_timezones
all_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau',
 'Africa/Blantyre',
 'Africa/Brazzaville',
 'Africa/Bujumbura',
 'Africa/Cairo',
 'Africa/Casablanca',
 'Africa/Ceuta',
 'Africa/Conakry',
 'Africa/Dakar',
 'Africa/Dar_es_Salaam',
 'Africa/Djibouti',
 'Africa/Douala',
 'Africa/El_Aaiun',
 'Africa/Freetown',
 'Africa/Gaborone',
 'Africa/Harare',
 'Africa/Johannesburg',
 'Africa/Juba',
 'Africa/Kampala',
 'Africa/Khartoum',
 'Africa/Kigali',
 'Africa/Kinshasa',
 'Africa/Lagos',
 'Africa/Libreville',
 'Africa/Lome',
 'Africa/Luanda',
 'Africa/Lubumbashi',
 'Africa/Lusaka',
 'Africa/Malabo',
 'Africa/Maputo',
 'Africa/Maseru',
 'Africa/Mbabane',
 'Africa/Mogadishu',
 'Africa/Monrovia',
 'Africa/Nairobi',
 'Africa/Ndjamena',
 'Africa/Niamey',
 'Africa/Nouakchott',
 'Africa/Ouagadougou',
 'Africa/Porto-Novo',
 'Africa/Sao_Tome',
 'Africa/Timbuktu',
 'Africa/

## 7.3 Selecting Dates and TImes: you have a vector of dates and you want to select one or more

In [33]:
import pandas as pd
df=pd.DataFrame()
df['date']=pd.date_range('1/1/2001',periods=100000,freq='H')
df[(df['date']>"2002-1-1 01:00:00") & 
  (df['date']<='2002-1-1 04:00:00')]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [34]:
# alternatively, we can set the date column as the DataFrame's index and then slice using loc

df=df.set_index(df['date'])
# select observaitons between two datetimes
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


## 7.4 Breaking Up Data into Multiple Features: you have a columns of dates and times and you want to create features for year, month, day, hour, and minute.

In [36]:
import pandas as pd
df=pd.DataFrame()
df['date']=pd.date_range('1/1/2001',periods=150,freq='W')
# create features for year, month, day, hour, and minute
df['year']=df['date'].dt.year
df['month']=df['date'].dt.month
df['day']=df['date'].dt.day
df['hour']=df['date'].dt.hour
df['minute']=df['date'].dt.minute

df.head()

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


## 7.5 Calculating the Difference Between Dates: you have two datetime features and want to calculate the time between them for each observation

In [38]:
import pandas as pd
df=pd.DataFrame()
df['Arrived']=[pd.Timestamp('04-01-2017'),pd.Timestamp('04-05-2017')]
df['Left']=[pd.Timestamp('04-01-2017'),pd.Timestamp('04-06-2017')]
print(df)
# calculate duration between features
df['Left']-df['Arrived']

     Arrived       Left
0 2017-04-01 2017-04-01
1 2017-04-05 2017-04-06


0   0 days
1   1 days
dtype: timedelta64[ns]

In [39]:
# often we will want to remove the day output and keep only the numerical value

pd.Series(delta.days for delta in (df['Left']-df['Arrived']))

0    0
1    1
dtype: int64

## 7.6 Encoding Days of the Week: you have a vector of dates and want to know the day of the week for each date.

In [40]:
import pandas as pd
dates=pd.Series(pd.date_range('2/2/2002',periods=5,freq="M"))
# show days of the week
dates.dt.weekday_name

0    Thursday
1      Sunday
2     Tuesday
3      Friday
4      Sunday
dtype: object

In [41]:
# if we c=want the output to be a numerical value and therefore more usable as a machine learning feature, we can use weekday where 
# the days of the week are represented as an integer( Monday is 0)

dates.dt.weekday

0    3
1    6
2    1
3    4
4    6
dtype: int64

## 7.7 Creating a Lagged Festure: want to create a featurer that is lagged  n time periods
* use pandas' shift

## 7.8 Using Rolling Time Windows： you want to calculate some statistics for a rolling time

In [42]:
import pandas as pd
time_index=pd.date_range('01/01/2010',periods=5,freq='M')
df=pd.DataFrame(index=time_index)
df['stock_price']=[1,2,3,4,5]
# calculate rolling mean
df.rolling(window=2).mean()

Unnamed: 0,stock_price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


## 7.9 Handling MIssing Data in TIme series

In [51]:
import pandas as pd
import numpy as np
time_index=pd.date_range('01/01/2010',periods=5,freq='M')
df=pd.DataFrame(index=time_index)
df['sales']=[1,2,np.nan,np.nan,5]
# interpolate missing values
print('------linear\n',df.interpolate())
print('\n------quadratic\n',df.interpolate(method='quadratic'))

------linear
             sales
2010-01-31    1.0
2010-02-28    2.0
2010-03-31    3.0
2010-04-30    4.0
2010-05-31    5.0

------quadratic
                sales
2010-01-31  1.000000
2010-02-28  2.000000
2010-03-31  3.059808
2010-04-30  4.038069
2010-05-31  5.000000


In [44]:
# alternatively, we can replace the missing values with the last known value(forward-filling)

df.ffill()

Unnamed: 0,sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [45]:
# we can also replace missing values with the latest known value(back-filling)

df.bfill()

Unnamed: 0,sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [53]:
# Finally, there might be cases when we have large gaps of missing values and do not want to interpolate values across the entire gap. 
# In these cases we can use limit to restrict the number of interpolated values and limit_direction to set whether to interpolate values 
# forward from at the last known value before the gap or vice versa:
 
df.interpolate(limit=1, limit_direction="forward",method='quadratic')


Unnamed: 0,sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,
2010-05-31,5.0


# Chapter 8 Handling Images

# Chapter 9 Dimensionality Reduction Using Festure _Extraction_

## 9.1 Reducing Features Using Principle Components: reduce the number of features while retaining the variance in the data

In [55]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets
digits=datasets.load_digits()
# standardize the feature matrix
features=StandardScaler().fit_transform(digits.data)
# create a PCA that will retain 99% of variance
pca=PCA(n_components=0.99,whiten=True)
# conduct PCA
features_pca=pca.fit_transform(features)
# show results
print('original number of features:', features.shape[1])
print('reduced number of features:',features_pca.shape[1])


original number of features: 64
reduced number of features: 54


## 9.2 Reducing Features When Data is Linearly Inseparable: you suspect you have linearly inseparable data and want to reduce the dimensions
* Use an extension of principal component analysis that uses kernels to allow for non-linear dimensionality reduction

In [56]:
from sklearn.decomposition import PCA,KernelPCA
from sklearn.datasets import make_circles
# create linearly inseparable data
features,_=make_circles(n_samples=1000,random_state=1,noise=0.1,factor=0.1)
# apply kernel PCA with radius basis function(RBF) kernel
kpca=KernelPCA(kernel='rbf',gamma=15,n_components=1)
features_kpca=kpca.fit_transform(features)

print('original number of features:',features.shape[1])
print('reduced number of features:',features_kpca.shape[1])

original number of features: 2
reduced number of features: 1


## 9.3 Reducing Features by Maximizing Class Separability: reduce the features to be used by a classifier

In [57]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
iris=datasets.load_iris()
features=iris.data
target=iris.target

# create and run an LDA, then use it to transform the features
lda=LinearDiscriminantAnalysis(n_components=1)
features_lda=lda.fit(features,target).transform(features)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_lda.shape[1])

Original number of features: 4
Reduced number of features: 1


## 9.4 Reducing Feature Using Matrix Factorization: you have a feature matrix of nonnegative values and want to reduce the dimensionality

In [60]:
from sklearn.decomposition import NMF # non-negative matrix factorization
from sklearn import datasets
digits=datasets.load_digits()
features=digits.data
# create , fit, and apply NMF
nmf=NMF(n_components=10,random_state=1)
features_nmf=nmf.fit_transform(features)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_nmf.shape[1])

Original number of features: 64
Reduced number of features: 10


## 9.5 Reducing Features on Sparse Data: you have a sparse matrix and want to reduce the dimensionality

In [63]:
# use Truncated Singular Value Decomposition(TSVD)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np
digits=datasets.load_digits()
# standardize feature matrix
features=StandardScaler().fit_transform(digits.data)
# make sparse matrix
features_sparse=csr_matrix(features)
# create TSVD on sparse matrix
tsvd=TruncatedSVD(n_components=10)
# conduct TSVD on sparse matrix
features_sparse_tsvd=tsvd.fit(features_sparse).transform(features_sparse)

print("Original number of features:", features_sparse.shape[1])
print('Reduced number of features:', features_sparse_tsvd.shape[1])

Original number of features: 64
Reduced number of features: 10


# Chapter 10 Dimensionality Reduction Using Feature _Selection_
## 10.1 Thresholding Numerical Feature Variance: you have a set of numerical features and want to remove those with low variance(i.e., likely containing little information)

In [64]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# import some data to play with
iris = datasets.load_iris()

# Create features and target
features = iris.data
target = iris.target

# Create thresholder
thresholder = VarianceThreshold(threshold=.5)

# Create high variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# View high variance feature matrix
features_high_variance[0:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

## 10.2 Thresholding Binary Feature Variance: You have a set of binary categorical features and want to remove those with low variance (i.e., likely containing little information).

In [65]:
from sklearn.feature_selection import VarianceThreshold

# Create feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features = [[0, 1, 0],
            [0, 1, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0]]

# Run threshold by variance
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

## 10.3 Handling Highly Correlated Features: You have a feature matrix and suspect some features are highly correlated

In [66]:
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)

# Create correlation matrix
corr_matrix = dataframe.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),
                          k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop features
dataframe.drop(dataframe.columns[to_drop], axis=1).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


## 10.4 Removing Irrelevant Features for Classification: You have a categorical target vector and want to remove uninformative features.

In [67]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Load data
iris = load_iris()
features = iris.data
target = iris.target

# Convert to categorical data by converting data to integers
features = features.astype(int)

# Select two features with highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [68]:
# If the features are quantitative, compute the ANOVA F-value between each feature and the target vector:

# Select two features with highest F-values
fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [69]:
# Instead of selecting a specific number of features, we can also use SelectPercentile to select the top n percent of features:

# Load library
from sklearn.feature_selection import SelectPercentile

# Select top 75% of features with highest F-values
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


## 10.5 Recursively Eliminating Features: You want to automatically select the best features to keep.

In [70]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")

# Generate features matrix, target vector, and the true coefficients
features, target = make_regression(n_samples = 10000,
                                   n_features = 100,
                                   n_informative = 2,
                                   random_state = 1)

# Create a linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)

array([[ 0.00850799,  1.13220029,  0.7031277 , ..., -0.26903836,
        -0.19448816,  1.5417181 ],
       [-1.07500204,  1.02994989,  2.56148527, ..., -0.71990239,
         1.24355995,  0.90054733],
       [ 1.37940721, -1.58203529, -1.77039484, ...,  0.49558942,
        -0.59421636,  1.23699023],
       ...,
       [-0.80331656, -0.3140609 , -1.60648007, ...,  0.22900133,
         0.93401741,  0.07420836],
       [ 0.39508844, -0.6374364 , -1.34564911, ...,  0.02723818,
         0.89998868, -0.62569347],
       [-0.55383035, -2.20671933,  0.82880112, ..., -1.08865284,
         1.32131624,  1.46404781]])

# Chapter 11 Model Evaluation
## 11.1 Cross-Validation Models: you want to evaluate how well your model work in the real world 

In [71]:
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Load digits dataset
digits = datasets.load_digits()

# Create features matrix
features = digits.data

# Create target vector
target = digits.target

# Create standardizer
standardizer = StandardScaler()

# Create logistic regression object
logit = LogisticRegression()

# Create a pipeline that standardizes, then runs logistic regression
pipeline = make_pipeline(standardizer, logit)

# Create k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Conduct k-fold cross-validation
cv_results = cross_val_score(pipeline, # Pipeline
                             features, # Feature matrix
                             target, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

# Calculate mean
cv_results.mean()

0.964931719428926

## 11.2 Creating a Baseline Regression Model