In [1]:
import pandas as pd

In [2]:
# Toy dataset: one nominal column, one binary column, one ordinal column and
# one numeric column. city, boolean and quantitative_column each contain a
# single missing value.
X = pd.DataFrame(
    dict(
        city=['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'],
        boolean=['yes', 'no', None, 'no', 'no', 'yes'],
        ordinal_column=['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
        quantitative_column=[1, 11, -0.5, 10, None, 20],
    )
)

In [3]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [4]:
X.isnull().sum()

city                   1
boolean                1
ordinal_column         0
quantitative_column    1
dtype: int64

In [6]:
# oh no, let's impute some values, imputer has a most_frequent option, but it only works if categories are integers
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline

In [15]:
X['city']

0            tokyo
1             None
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

In [10]:
X['city'].value_counts().index[0]  # most common category

'tokyo'

In [14]:
X['city'].fillna(X['city'].value_counts().index[0])  # fill empty slots with most common category

0            tokyo
1            tokyo
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

In [16]:
df = X.copy()
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [17]:
from sklearn.base import TransformerMixin

class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in selected columns with each column's most frequent value.

    Parameters
    ----------
    cols : list of column names
        Columns to impute. The default ``None`` is not iterable, so a list
        has to be supplied before ``fit``/``transform`` is used with data.

    Attributes
    ----------
    fill_values_ : dict
        Maps column name -> most frequent value learned by ``fit``. Only set
        once ``fit`` has been called with a DataFrame.
    """

    def __init__(self, cols=None):
        self.cols = cols

    @staticmethod
    def _most_frequent(series):
        """Return the most common non-null value of ``series``, or None if it has none."""
        counts = series.value_counts()  # value_counts() skips nulls and sorts by frequency
        return counts.index[0] if len(counts) else None

    def transform(self, df):
        """Return a copy of ``df`` with nulls in ``self.cols`` filled in.

        Values learned by ``fit`` are used when available, so new data is
        filled with the categories seen at fit time. For a column that was
        not fitted, the most frequent value of ``df`` itself is used.
        A column with no non-null value to fall back on is left unchanged.
        """
        X = df.copy()
        learned = getattr(self, 'fill_values_', {})
        for col in self.cols:
            fill = learned[col] if col in learned else self._most_frequent(X[col])
            if fill is None:
                # nothing to impute with; leave the column as it is
                continue
            # Assign the result back instead of calling fillna(inplace=True) on
            # the column selection: that chained in-place form is deprecated in
            # pandas 2.x and does not modify the frame under Copy-on-Write.
            X[col] = X[col].fillna(fill)
        return X

    def fit(self, df=None, *_):
        """Learn the most frequent value of every column in ``self.cols``.

        ``df`` is optional so that a bare ``fit()`` keeps working; in that
        case nothing is learned and ``transform`` derives the fill values
        from the frame it is given. Extra positional arguments (such as
        ``y``) are ignored. Returns ``self``.
        """
        if df is not None:
            self.fill_values_ = {col: self._most_frequent(df[col]) for col in self.cols}
        return self

In [18]:
cci = CustomCategoryImputer(cols=['city', 'boolean'])

In [21]:
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [25]:
cci.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [None]:
# we still have null values in quantitative_column;
# the default imputer cannot be restricted to selected columns, so let's make a custom one

In [26]:
# Lets make an imputer that can apply a strategy to select columns by name

class CustomQuantitativeImputer(TransformerMixin):
    """Impute missing values in selected columns with a per-column statistic.

    Each column gets its own ``Imputer`` (the name this notebook binds to
    scikit-learn's ``SimpleImputer``) built with the requested strategy.

    Parameters
    ----------
    cols : list of column names
        Columns to impute. The default ``None`` is not iterable, so a list
        has to be supplied before ``fit``/``transform`` is used with data.
    strategy : str, default 'mean'
        Passed straight through to ``Imputer(strategy=...)``.

    Attributes
    ----------
    imputers_ : dict
        Maps column name -> fitted imputer. Only set once ``fit`` has been
        called with a DataFrame.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def _fit_column(self, frame, col):
        """Fit and return a fresh imputer on the single column ``col`` of ``frame``."""
        # double brackets keep the selection two-dimensional, as the imputer expects
        return Imputer(strategy=self.strategy).fit(frame[[col]])

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``self.cols`` imputed.

        Imputers learned by ``fit`` are reused so new data is filled with the
        statistics of the fitting data. A column that was not fitted gets an
        imputer fitted on ``df`` itself.
        """
        X = df.copy()
        learned = getattr(self, 'imputers_', {})
        for col in self.cols:
            imputer = learned[col] if col in learned else self._fit_column(X, col)
            filled = imputer.transform(X[[col]])
            if filled.shape[1] == 0:
                # the imputer dropped the column (no observed values to compute
                # a statistic from); leave it untouched instead of failing
                continue
            X[col] = filled
        return X

    def fit(self, df=None, *_):
        """Fit one imputer per column in ``self.cols``.

        ``df`` is optional so that a bare ``fit()`` keeps working; in that
        case nothing is learned and ``transform`` fits on the frame it is
        given. Extra positional arguments (such as ``y``) are ignored.
        Returns ``self``.
        """
        if df is not None:
            self.imputers_ = {col: self._fit_column(df, col) for col in self.cols}
        return self

In [27]:
cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')

cqi.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [28]:
imputer = Pipeline([('quant', cqi), ('category', cci)])

imputer.fit_transform(X)  # ready for action

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [29]:
# with no `columns` argument, get_dummies finds the categorical (string) columns automatically
pd.get_dummies(X, 
               prefix_sep='__')  # the separator between the prefix (column name) and cell value

Unnamed: 0,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes,ordinal_column__dislike,ordinal_column__like,ordinal_column__somewhat like
0,1.0,0,0,0,1,0,1,0,0,1
1,11.0,0,0,0,0,1,0,0,1,0
2,-0.5,1,0,0,0,0,0,0,0,1
3,10.0,0,0,1,0,1,0,0,1,0
4,,0,1,0,0,1,0,0,0,1
5,20.0,0,0,0,1,0,1,1,0,0


In [30]:
pd.get_dummies(X,  prefix_sep='__')  # for ordinal columns, we don't want to dummify

Unnamed: 0,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes,ordinal_column__dislike,ordinal_column__like,ordinal_column__somewhat like
0,1.0,0,0,0,1,0,1,0,0,1
1,11.0,0,0,0,0,1,0,0,1,0
2,-0.5,1,0,0,0,0,0,0,0,1
3,10.0,0,0,1,0,1,0,0,1,0
4,,0,1,0,0,1,0,0,0,1
5,20.0,0,0,0,1,0,1,1,0,0


In [31]:
pd.get_dummies(X, 
               columns = ['city', 'boolean'],   # which columns to dummify
               prefix_sep='__')  # the separator between the prefix (column name) and cell value

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


In [32]:
from sklearn.base import TransformerMixin

class CustomDummifier(TransformerMixin):
    """Pipeline step that one-hot encodes columns with ``pd.get_dummies``.

    ``cols`` is forwarded as the ``columns`` argument of ``pd.get_dummies``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return the dummy-encoded version of ``X``."""
        encoded = pd.get_dummies(data=X, columns=self.cols)
        return encoded

In [33]:
cd = CustomDummifier(cols=['boolean', 'city'])

cd.fit_transform(X)

Unnamed: 0,ordinal_column,quantitative_column,boolean_no,boolean_yes,city_london,city_san francisco,city_seattle,city_tokyo
0,somewhat like,1.0,0,1,0,0,0,1
1,like,11.0,1,0,0,0,0,0
2,somewhat like,-0.5,0,0,1,0,0,0
3,like,10.0,1,0,0,0,1,0
4,somewhat like,,1,0,0,1,0,0
5,dislike,20.0,0,1,0,0,0,1


In [34]:
# what about the ordinal_column? we still want to use it, and it's a string...

In [37]:
ordering = ['dislike', 'somewhat like', 'like']  # 0 for dislike, 1 for somewhat like, and 2 for like

print (X['ordinal_column'])

print (X['ordinal_column'].map(lambda x: ordering.index(x)))

0    somewhat like
1             like
2    somewhat like
3             like
4    somewhat like
5          dislike
Name: ordinal_column, dtype: object
0    1
1    2
2    1
3    2
4    1
5    0
Name: ordinal_column, dtype: int64


In [38]:
from sklearn.base import TransformerMixin

class CustomEncoder(TransformerMixin):
    """Encode an ordinal column as the integer position of each value in ``ordering``.

    Parameters
    ----------
    col : column name
        The column to encode.
    ordering : list
        Category labels from lowest to highest; a value is replaced by its
        index in this list. Must be supplied before ``transform`` is used.
    """

    def __init__(self, col, ordering=None):
        self.ordering = ordering
        self.col = col

    def transform(self, df):
        """Return a copy of ``df`` with ``self.col`` replaced by ordinal ranks.

        Missing entries stay missing (the column then comes back as floats
        with NaN). A non-missing value that is not in ``ordering`` still
        raises ``ValueError`` from ``list.index``.
        """
        X = df.copy()
        # na_action='ignore' propagates missing values instead of handing them
        # to list.index, which would fail with "nan is not in list".
        X[self.col] = X[self.col].map(lambda x: self.ordering.index(x), na_action='ignore')
        return X

    def fit(self, *_):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

In [None]:
ce = CustomEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])

ce.fit_transform(X)

In [None]:
# name of category is the bin by default
pd.cut(X['quantitative_column'], bins=3)

In [None]:
# using no labels
pd.cut(X['quantitative_column'], bins=3, labels=False)

In [None]:
# using pre-made labels
group_names = ['Low', 'Okay', 'Good']
pd.cut(X['quantitative_column'], bins=3, labels=group_names)

In [None]:
from sklearn.base import TransformerMixin

class CustomCutter(TransformerMixin):
    """Pipeline step that bins one column with ``pd.cut``.

    ``bins`` and ``labels`` are forwarded to ``pd.cut``; with the default
    ``labels=False`` the column is replaced by integer bin indicators.
    """

    def __init__(self, col, bins, labels=False):
        self.col = col
        self.bins = bins
        self.labels = labels

    def fit(self, *_):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` in which ``self.col`` holds the binned values."""
        binned = pd.cut(df[self.col], bins=self.bins, labels=self.labels)
        result = df.copy()
        result[self.col] = binned
        return result

In [None]:
cc = CustomCutter(col='quantitative_column', bins=3)

cc.fit_transform(X)

In [None]:
# note that the output of this is an ordinal column, meaning there is no need to dummify them

In [None]:
# put it all into a pipeline
from sklearn.pipeline import Pipeline

In [None]:
pipe = Pipeline([("imputer", imputer), ('dummify', cd), ('encode', ce), ('cut', cc)])
# the steps run in the order they are listed:
# first impute missing values with our initial imputer pipeline
# then dummify the categorical columns
# then encode the ordinal column
# then bucket (bin) the quantitative column

In [None]:
X

In [None]:
pipe.fit(X)

In [None]:
pipe.transform(X)  # ready for action

In [None]:
# https://archive.ics.uci.edu/ml/datasets/Activity+Recognition+from+Single+Chest-Mounted+Accelerometer#
    
# --- 1: Working at Computer 
# --- 2: Standing Up, Walking and Going updown stairs 
# --- 3: Standing 
# --- 4: Walking 
# --- 5: Going UpDown Stairs 
# --- 6: Walking and Talking with Someone 
# --- 7: Talking while Standing

In [None]:
df = pd.read_csv('../data/activity_recognizer/1.csv', header=None)
df.columns = ['index', 'x', 'y', 'z', 'activity']

df.head()

In [None]:
df['activity'].value_counts(normalize=True)  # null accuracy (to beat) is .5153

In [None]:
# now lets do some machine learning

# note we are using the dataset with the dropped rows

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# create our feature matrix by removing the response variable
X = df[['x', 'y', 'z']]
y = df['activity']


# our grid search variables and instances

# KNN parameters to try
knn_params = {'n_neighbors':[3, 4, 5, 6]}

knn = KNeighborsClassifier()
grid = GridSearchCV(knn, knn_params)
grid.fit(X, y)

# print() as a function call: the bare print statement is a SyntaxError on Python 3
print(grid.best_score_, grid.best_params_)

In [None]:
# Using Polynomial Features
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

In [None]:
X_poly = poly.fit_transform(X)
X_poly.shape

In [None]:
poly.get_feature_names()

In [None]:
pd.DataFrame(X_poly, columns=poly.get_feature_names()).head()

In [None]:
%matplotlib inline
import seaborn as sns
sns.heatmap(pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr())

In [None]:
pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr()

In [None]:
# interaction_only=True keeps only products of distinct features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)

X_poly = poly.fit_transform(X)
# print() as a function call: the bare print statement is a SyntaxError on Python 3
print(X_poly.shape)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out() instead -- confirm the target version
pd.DataFrame(X_poly, columns=poly.get_feature_names()).head()  # x0^2, x1^2, and x2^2 went away as they use repeat variables

In [None]:
sns.heatmap(pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr())

In [None]:
pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr()  # removes from biases from the correlation matrix

In [None]:
from sklearn.pipeline import Pipeline

# parameters are addressed as <step name>__<parameter name>
pipe_params = {'poly_features__degree':[1, 2, 3], 'poly_features__interaction_only':[True, False], 'classify__n_neighbors':[3, 4, 5, 6]}

pipe = Pipeline([('poly_features', poly), ('classify', knn)])

grid = GridSearchCV(pipe, pipe_params)
grid.fit(X, y)

# print() as a function call: the bare print statement is a SyntaxError on Python 3
print(grid.best_score_, grid.best_params_)

In [None]:
# best accuracy was 0.720752487677 without constructing polynomial features, so we are able to do better!

In [None]:
#http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/
tweets = pd.read_csv('../data/twitter_sentiment.csv', encoding='latin1')

In [None]:
tweets.head()

In [None]:
# drop the identifier column; errors='ignore' keeps this cell re-runnable
# (a second `del tweets['ItemID']` would raise KeyError)
tweets = tweets.drop(columns=['ItemID'], errors='ignore')

In [None]:
tweets.head()

In [None]:
X = tweets['SentimentText']
y = tweets['Sentiment']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# baseline: default CountVectorizer, one column per distinct token
vect = CountVectorizer()
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect = CountVectorizer(stop_words='english')  # removes a set of english stop words (if, a, the, etc)
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect.get_stop_words()

In [None]:
vect = CountVectorizer(min_df=.05)  # only includes words that occur in at least 5% of the corpus documents
# used to trim the number of features
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect = CountVectorizer(max_df=.8)  # only includes words that occur in at most 80% of the documents
# used to "deduce" stop words
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect = CountVectorizer(ngram_range=(1, 5))  # also includes phrases up to 5 words
_ = vect.fit_transform(X)
print(_.shape)  # explodes the number of features

In [None]:
vect.get_feature_names()[:15]

In [None]:
vect = CountVectorizer(lowercase=True)  # lower cases everything first
_ = vect.fit_transform(X)
print(_.shape)  # features stays the same

In [None]:
vect = CountVectorizer(max_features=1000)  # hard limits the features based on max counts
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect = CountVectorizer(analyzer='word')  # default analyzer, decides to split into words
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect = CountVectorizer(analyzer='char')  # uses characters as the features
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
vect.get_feature_names()[:10]

In [None]:
vect = CountVectorizer(analyzer='char_wb')  # character n-grams again, but built only from text inside word boundaries
# wb stands for word boundaries; n-grams at the edges of words are padded with a space
_ = vect.fit_transform(X)
print(_.shape)  # print() call form; the bare print statement is a SyntaxError on Python 3

In [None]:
# making a custom analyzer

In [None]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

In [None]:
stemmer.stem('hello')

In [None]:
stemmer.stem('interesting')

In [None]:
stemmer.stem('interesting') == stemmer.stem('interest')

In [None]:
# define a function that accepts text and returns a list of stemmed words
def word_tokenize(text, how='lemma'):
    """Split ``text`` on whitespace and return the stem of every token.

    Parameters
    ----------
    text : str
        The document to tokenize.
    how : str, default 'lemma'
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list of str
        ``stemmer.stem(token)`` for each whitespace-separated token, where
        ``stemmer`` is the notebook-level stemmer object.
    """
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and newlines do not produce empty-string tokens
    # (split(' ') turned "a  b" into ['a', '', 'b']).
    return [stemmer.stem(word) for word in text.split()]

In [None]:
word_tokenize("hello you are very interesting")

In [None]:
# a callable analyzer receives each raw document and returns its list of features
vect = CountVectorizer(analyzer=word_tokenize)
_ = vect.fit_transform(X)
print(_.shape)  # fewer features as stemming makes words smaller

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# raw term counts
vect = CountVectorizer()
_ = vect.fit_transform(X)
print(_.shape, _[0,:].mean())

# tf-idf weighted values for the same vocabulary
vect = TfidfVectorizer()
_ = vect.fit_transform(X)
print(_.shape, _[0,:].mean())  # same number of rows and columns, different cell values

In [None]:
# Now let's try some machine learning

In [None]:
# get the null accuracy
y.value_counts(normalize=True)  # 0.56463

In [None]:
from sklearn.naive_bayes import MultinomialNB  # for faster predictions with large number of features...

In [None]:
# set our pipeline parameters
pipe_params = {'vect__ngram_range':[(1, 1), (1, 2)], 'vect__max_features':[1000, 10000], 'vect__stop_words':[None, 'english']}

# instantiate our pipeline
pipe = Pipeline([('vect', CountVectorizer()), ('classify', MultinomialNB())])

# instantiate our gridsearch object
grid = GridSearchCV(pipe, pipe_params)
# fit the gridsearch object
grid.fit(X, y)

# get our results (print() call form; the bare print statement is a SyntaxError on Python 3)
print(grid.best_score_, grid.best_params_)

In [None]:
from sklearn.pipeline import FeatureUnion

# build a separate featurizer object
featurizer = FeatureUnion([('tfidf_vect', TfidfVectorizer()), ('count_vect', CountVectorizer())])

In [None]:
# the union places the columns of both vectorizers side by side
_ = featurizer.fit_transform(X)
print(_.shape)  # same number of rows , but twice as many columns as either CV or TFIDF

In [None]:
featurizer.set_params(tfidf_vect__max_features=100, 
                      count_vect__ngram_range=(1, 2), 
                      count_vect__max_features=300)
# the TfidfVectorizer will only keep 100 words while the CountVectorizer will keep 300 of 1 and 2 word phrases
_ = featurizer.fit_transform(X)
print(_.shape)  # same number of rows, but now at most 100 + 300 = 400 columns

In [None]:
# nested parameters are addressed as <pipeline step>__<union member>__<parameter>
pipe_params = {'featurizer__count_vect__ngram_range':[(1, 1), (1, 2)], 'featurizer__count_vect__max_features':[1000, 10000], 'featurizer__count_vect__stop_words':[None, 'english'],
              'featurizer__tfidf_vect__ngram_range':[(1, 1), (1, 2)], 'featurizer__tfidf_vect__max_features':[1000, 10000], 'featurizer__tfidf_vect__stop_words':[None, 'english']}


pipe = Pipeline([('featurizer', featurizer), ('classify', MultinomialNB())])

grid = GridSearchCV(pipe, pipe_params)
grid.fit(X, y)

# print() as a function call: the bare print statement is a SyntaxError on Python 3
print(grid.best_score_, grid.best_params_)