# Naive Bayes Classification

In [None]:
1. Calculate the probability of observing the given values, assuming the sample belongs to each of the categories.

$$P(Dear|Normal) = \frac {P(Normal|Dear) * P(Dear)}{P(Normal)}$$

$$P(N) \times P(Dear|N) \times P(Friend|N)$$

In [4]:
import pandas as pd

In [5]:
# Load the Titanic passenger data.
# NOTE(review): this is a machine-specific absolute path; point it at your local copy of titanic.csv.
titanic_csv_path = "C:/Users/Owner/codebasics/ML/14_naive_bayes/titanic.csv"
df = pd.read_csv(titanic_csv_path)

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [7]:
# Discard the columns that are not used as features; Pclass, Sex, Age and Survived remain.
unused_columns = ["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
df.drop(columns=unused_columns, inplace=True)

In [8]:
# Check which columns remain after the drop.
df.head()

Unnamed: 0,Pclass,Sex,Age,Survived
0,3,male,22.0,0
1,1,female,38.0,1
2,3,female,26.0,1
3,1,female,35.0,1
4,3,male,35.0,0


In [9]:
# Separate the feature columns from the label column (Survived).
inputs = df.drop(columns=["Survived"])
target = df["Survived"]

In [12]:
# One-hot encode the Sex column into one indicator column per category.
# NOTE(review): with two categories the indicators are complementary, so one of them is redundant.
dummies = pd.get_dummies(inputs["Sex"])

In [14]:
# Inspect the one-hot encoded Sex columns.
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [15]:
# Append the indicator columns to the feature frame, side by side.
inputs = pd.concat([inputs, dummies], axis="columns")

In [16]:
# Inspect the features after appending the indicator columns (the text Sex column is still present here).
inputs.head()

Unnamed: 0,Pclass,Sex,Age,female,male
0,3,male,22.0,0,1
1,1,female,38.0,1,0
2,3,female,26.0,1,0
3,1,female,35.0,1,0
4,3,male,35.0,0,1


In [17]:
# The text Sex column is now represented by the indicator columns, so remove it.
inputs.drop(columns="Sex", inplace=True)

In [19]:
# Count missing values per feature column.
inputs.isna().sum()

Pclass      0
Age       177
female      0
male        0
dtype: int64

In [21]:
# Impute missing ages with the column mean.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# `inputs.Age`: that chained in-place call operates on an intermediate Series, which pandas
# warns about and which leaves `inputs` unchanged when Copy-on-Write is enabled.
inputs["Age"] = inputs["Age"].fillna(inputs["Age"].mean())
inputs.isnull().sum()

Pclass    0
Age       0
female    0
male      0
dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every score derived from it) is the same on
# each run; outputs recorded from an earlier unseeded run may differ from a fresh execution.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

In [25]:
from sklearn.naive_bayes import GaussianNB  # Gaussian NB suits features that are roughly normally distributed (bell-curve shaped).

In [39]:
# Create an untrained Gaussian Naive Bayes classifier with default settings.
model = GaussianNB()

In [40]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [41]:
# Evaluate on the held-out test split (for classifiers, score() reports mean accuracy).
model.score(X_test, y_test)

0.770949720670391

In [42]:
# Predicted class for the first ten test rows (selected by position).
model.predict(X_test.iloc[:10])

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 1], dtype=int64)

In [43]:
# Per-class probabilities for the same ten test rows; each output row sums to 1.
model.predict_proba(X_test.iloc[:10])

array([[0.97935158, 0.02064842],
       [0.97216646, 0.02783354],
       [0.0029926 , 0.9970074 ],
       [0.01492471, 0.98507529],
       [0.97745251, 0.02254749],
       [0.97857185, 0.02142815],
       [0.81053217, 0.18946783],
       [0.00939742, 0.99060258],
       [0.01944646, 0.98055354],
       [0.01035787, 0.98964213]])

## Spam detection

In [44]:
# Load the labelled message data.
# NOTE(review): this is a machine-specific absolute path; point it at your local copy of spam.csv.
spam_csv_path = "C:/Users/Owner/codebasics/ML/14_naive_bayes/spam.csv"
df = pd.read_csv(spam_csv_path)

In [45]:
# Preview the first rows of the message data.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
# Per-category summary of the messages: count, number of unique messages, most frequent message and its frequency.
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [47]:
# Encode the labels as integers: "ham" -> 0, anything else -> 1.
# The guard makes the cell safe to re-run: once the column is numeric, comparing it with
# "ham" again would be False everywhere and would silently turn every label into 1.
if not pd.api.types.is_numeric_dtype(df["Category"]):
    df["Category"] = (df["Category"] != "ham").astype("int64")

In [48]:
# Confirm that Category now holds integer codes instead of text labels.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score derived from it) is the same on
# each run; outputs recorded from an earlier unseeded run may differ from a fresh execution.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.Category, test_size=0.2, random_state=42
)

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
# Learn the vocabulary from the training messages only and turn them into a
# document-term matrix of word counts (one row per message, one column per word).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

In [63]:
# Show the first three rows of the count matrix as a dense array.
# Slice the sparse matrix first and densify only those rows; calling toarray() on the whole
# matrix allocates a dense array for every training message just to display three of them.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [64]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
# Multinomial Naive Bayes classifier with default settings, used here on word-count features.
model = MultinomialNB()

In [66]:
# Train on the word-count matrix built from the training messages.
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [68]:
# Two hand-written messages used as a quick sanity check of the trained model.
emails = [
    "Hey mohan, can we get together to watch footbal game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Don't miss this reward!",
]
# New text must go through the already-fitted vectorizer so it shares the training vocabulary.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [69]:
# Vectorize the test messages with the vocabulary learned from the training set (transform only, no refit).
X_test_count = v.transform(X_test.values)

In [70]:
# Mean accuracy on the vectorized test messages.
model.score(X_test_count, y_test)

0.9775784753363229

So far we have had to call the vectorizer's `transform` method ourselves every time before passing data to the model.
scikit-learn's `Pipeline` lets you define that chain of transformations once.
The pipeline then applies each transformation to the raw data before feeding the result into the model.
Other transformations, such as TF-IDF, are commonly used in the same way.

In [72]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so raw text can be passed straight to fit/predict/score.
pipeline_steps = [
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [73]:
clf.fit(X_train, y_train)  # The pipeline vectorizes the raw text internally, so no manual transform step is needed.

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [74]:
# The pipeline vectorizes X_test itself, so the raw test messages are passed directly.
clf.score(X_test, y_test)

0.9775784753363229

In [75]:
# Classify the raw sample emails through the pipeline (no manual vectorization).
clf.predict(emails)

array([0, 1], dtype=int64)

## Exercise

In [1]:
from sklearn.datasets import load_wine

In [4]:
# Load the wine dataset bundled with scikit-learn.
wine = load_wine()

In [6]:
# Raw feature values as a NumPy array.
wine.data

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [7]:
# Names of the feature columns, in the same order as the columns of wine.data.
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [8]:
import pandas as pd

In [9]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [10]:
# (rows, columns) of the feature frame.
df.shape

(178, 13)

In [11]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [12]:
# Summary statistics for every feature.
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [13]:
# Class label for each sample.
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [15]:
# Attach the class labels as a "target" column alongside the features.
df = df.assign(target=wine.target)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [18]:
from sklearn.model_selection import train_test_split

In [20]:
# Features are every column except the label; the label is the "target" column.
X = df.drop(columns=["target"])
y = df.loc[:, "target"]

In [21]:
# Hold out 20% of the samples for evaluation.
# random_state pins the shuffle so the split (and every score derived from it) is the same on
# each run; outputs recorded from an earlier unseeded run may differ from a fresh execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
from sklearn.naive_bayes import GaussianNB

In [23]:
# Create an untrained Gaussian Naive Bayes classifier with default settings.
gnb= GaussianNB()

In [24]:
# Train the classifier on the training split.
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
# Mean accuracy on the held-out test split.
gnb.score(X_test, y_test)

1.0

In [26]:
# Predicted class for every row of the test split.
gnb.predict(X_test)

array([1, 2, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 1, 1, 0, 0, 2, 2, 0, 2, 1, 0,
       0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1])

In [29]:
# True labels of the first five test rows, for comparison with the predictions.
y_test.head(5)

67     1
172    2
102    1
157    2
2      0
Name: target, dtype: int32

In [32]:
# First row of the frame, including its target label.
df.iloc[0]

alcohol                           14.23
malic_acid                         1.71
ash                                2.43
alcalinity_of_ash                 15.60
magnesium                        127.00
total_phenols                      2.80
flavanoids                         3.06
nonflavanoid_phenols               0.28
proanthocyanins                    2.29
color_intensity                    5.64
hue                                1.04
od280/od315_of_diluted_wines       3.92
proline                         1065.00
target                             0.00
Name: 0, dtype: float64

In [35]:
# Classify one hand-made sample.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame that reuses the
# training column names. Passing a bare nested list makes recent scikit-learn versions warn that
# X has no valid feature names, and hides which value belongs to which feature.
sample = pd.DataFrame(
    [[12, 1, 2, 13, 120, 2, 3, 0.1, 2, 4, 2, 3, 1000]],
    columns=X_train.columns,
)
gnb.predict(sample)

array([1])