In [None]:
import sklearn.base
from sklearn.datasets import load_iris

iris = load_iris()
iris.data[:5]

In [8]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)

In [10]:
correct = (iris.target == y_pred).sum()
print(f"Total testing num {iris.data.shape[0]} and accuracy {float(correct) / iris.data.shape[0]} ")

Total testing num 150 and accuracy 0.96 


In [13]:
gnb.score(iris.data, iris.target)

0.96

# San Francisco Crime Classification Prediction

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

In [45]:
api.competition_download_files("sf-crime", path="./data")

# Loading Data

In [1]:
from zipfile import ZipFile
import pandas as pd
import numpy as np


In [2]:
zip_file = ZipFile("data/sf-crime.zip")
zip_file.namelist()

['sampleSubmission.csv.zip', 'test.csv.zip', 'train.csv.zip']

In [3]:
train_file = ZipFile(zip_file.open("train.csv.zip"))
train_file.namelist()

['train.csv']

In [4]:
test_file = ZipFile(zip_file.open("test.csv.zip"))
test_file.namelist()

['test.csv']

In [5]:
train = pd.read_csv(train_file.open("train.csv"), parse_dates=["Dates"])
test = pd.read_csv(test_file.open("test.csv"), parse_dates=["Dates"])

In [6]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [7]:
train.shape, test.shape

((878049, 9), (884262, 7))

In [8]:
test.columns

Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y'], dtype='object')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   Dates       878049 non-null  datetime64[ns]
 1   Category    878049 non-null  object        
 2   Descript    878049 non-null  object        
 3   DayOfWeek   878049 non-null  object        
 4   PdDistrict  878049 non-null  object        
 5   Resolution  878049 non-null  object        
 6   Address     878049 non-null  object        
 7   X           878049 non-null  float64       
 8   Y           878049 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB


In [10]:
pd.get_dummies(train["Category"])

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
878045,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
878046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
878047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
train["Category"].nunique()

39

In [12]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [13]:
ohe = OneHotEncoder()
ohe.fit_transform(train[["Category"]]).toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
transformer = FunctionTransformer(func=pd.DataFrame.apply,
                                  kw_args={"func": pd.DataFrame.astype,
                                           "dtype": "category"})

In [15]:
transformer.transform(train)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [16]:
print(train.dtypes)

Dates         datetime64[ns]
Category              object
Descript              object
DayOfWeek             object
PdDistrict            object
Resolution            object
Address               object
X                    float64
Y                    float64
dtype: object


In [110]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [89]:
hour_transformer = FunctionTransformer(func=pd.DataFrame.assign,
                                       kw_args={"hour":lambda x:x.Dates.dt.hour,
                                                "month":lambda x:x.Dates.dt.month,
                                                "year":lambda x:x.Dates.dt.year})

In [90]:
hour_transformer.fit_transform(train)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,hour,month,year
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23,5,2015
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23,5,2015
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23,5,2015
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23,5,2015
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23,5,2015
...,...,...,...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,0,1,2003
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,0,1,2003
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,0,1,2003
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,0,1,2003


In [98]:
cat_type = ['DayOfWeek', 'PdDistrict', 'Resolution','hour','month','year']
convert_col_type = dict(zip(cat_type, ["category"] * len(cat_type)))
convert_col_type

{'DayOfWeek': 'category',
 'PdDistrict': 'category',
 'Resolution': 'category',
 'hour': 'category',
 'month': 'category',
 'year': 'category'}

In [None]:
type_transformer = FunctionTransformer(func=pd.DataFrame.astype,
                                  kw_args={"dtype": convert_col_type})
print(type_transformer.transform(train).dtypes)

In [82]:
ct = make_column_transformer(
    (OneHotEncoder(drop="first"), make_column_selector(dtype_include="category")),remainder="drop",verbose=4)

In [127]:
pipe = make_pipeline(hour_transformer,type_transformer,ct,BernoulliNB(),verbose=3)
pipe

In [128]:
X = train.drop("Category",axis="columns")
y = train["Category"]
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.6)

In [129]:
pipe.fit(X_train,y_train)

[Pipeline]  (step 1 of 4) Processing functiontransformer-1, total=   0.1s
[Pipeline]  (step 2 of 4) Processing functiontransformer-2, total=   0.1s
[ColumnTransformer] . (1 of 1) Processing onehotencoder, total=   0.7s
[Pipeline] . (step 3 of 4) Processing columntransformer, total=   0.7s
[Pipeline] ....... (step 4 of 4) Processing bernoullinb, total=   2.2s


In [130]:
y_pred = pipe.predict_proba(X_val)

In [131]:
log_loss(y_val,y_pred)

2.2313411591221386

In [132]:
pipe.score(X_val,y_val)

0.33493821536358975