In [4]:
from functools import reduce

import pandas as pd
import pprint

class Classifier():
    data = None
    class_attr = None
    priori = {}
    cp = {}
    hypothesis = None


    def __init__(self,filename=None, class_attr=None ):
        self.data = pd.read_csv(filename, sep=',', header =(0))
        self.class_attr = class_attr

    '''
        probability(class) =    How many  times it appears in cloumn
                             __________________________________________
                                  count of all class attribute
    '''
    def calculate_priori(self):
        class_values = list(set(self.data[self.class_attr]))
        class_data =  list(self.data[self.class_attr])
        for i in class_values:
            self.priori[i]  = class_data.count(i)/float(len(class_data))
        print ("Priori Values: ", self.priori)

    '''
        Here we calculate the individual probabilites 
        P(outcome|evidence) =   P(Likelihood of Evidence) x Prior prob of outcome
                               ___________________________________________
                                                    P(Evidence)
    '''
    def get_cp(self, attr, attr_type, class_value):
        data_attr = list(self.data[attr])
        class_data = list(self.data[self.class_attr])
        total =1
        for i in range(0, len(data_attr)):
            if class_data[i] == class_value and data_attr[i] == attr_type:
                total+=1
        return total/float(class_data.count(class_value))

    '''
        Here we calculate Likelihood of Evidence and multiple all individual probabilities with priori
        
        (Outcome|Multiple Evidence) = P(Evidence1|Outcome) x P(Evidence2|outcome) x ... x P(EvidenceN|outcome) x P(Outcome)
        
        scaled by P(Multiple Evidence)
    '''
    def calculate_conditional_probabilities(self, hypothesis):
        for i in self.priori:
            self.cp[i] = {}
            for j in hypothesis:
                self.cp[i].update({ hypothesis[j]: self.get_cp(j, hypothesis[j], i)})
        print ("\nCalculated Conditional Probabilities: \n")
        pprint.pprint(self.cp)

    def classify(self):
        print ("Result: ")
        for i in self.cp:
            print (i, " ==> ", reduce(lambda x, y: x*y, self.cp[i].values())*self.priori[i])

if __name__ == "__main__":
    c = Classifier(filename="Weather.csv", class_attr="Play" )
    c.calculate_priori()
    #c.hypothesis = {"Outlook":'Rainy', "Temp":"Mild", "Humidity":'Normal' , "Windy":'t'}
    c.hypothesis = {'Outlook':'Sunny',"Temp":'Cool',"Humidity":'High','Windy':'t'}

    c.calculate_conditional_probabilities(c.hypothesis)
    c.classify()

Priori Values:  {'yes': 0.6428571428571429, 'no': 0.35714285714285715}

Calculated Conditional Probabilities: 

{'no': {'Cool': 0.4, 'High': 1.0, 'Sunny': 0.6, 't': 0.8},
 'yes': {'Cool': 0.4444444444444444,
         'High': 0.4444444444444444,
         'Sunny': 0.4444444444444444,
         't': 0.4444444444444444}}
Result: 
yes  ==>  0.0250832843425436
no  ==>  0.06857142857142857


![title]("Images/Single.png")

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
Data = pd.read_csv('Weather.csv')

In [3]:
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play,Unnamed: 5
0,Rainy,Hot,High,f,no,
1,Rainy,Hot,High,t,no,
2,Overcast,Hot,High,f,yes,
3,Sunny,Mild,High,f,yes,
4,Sunny,Cool,Normal,f,yes,
5,Sunny,Cool,Normal,t,no,
6,Overcast,Cool,Normal,t,yes,
7,Rainy,Mild,High,f,no,
8,Rainy,Cool,Normal,f,yes,
9,Sunny,Mild,Normal,f,yes,



**Play**

|Play|= 14|
|---|--|
|YES|9|
|NO|5|

**Windy**

|Windy|= 14|
|---|--|
|True|6|
|False|8|

**Humidity**

|Humidity|= 14|
|---|--|
|Normal|7|
|High|7|

**TEMP**

|Temp|= 14|
|---|--|
|Cool|4|
|Mild|6|
|Hot|4|

**Outlook**

|outlook|= 14|
|---|--|
|overcast|4|
|Rainy|5|
|Sunny|5|

In [4]:
x = Data['Outlook'].describe()
x

count        14
unique        3
top       Sunny
freq          5
Name: Outlook, dtype: object

In [5]:
y = Data['Play'].describe()
y

count      14
unique      2
top       yes
freq        9
Name: Play, dtype: object

In [4]:
Data = Data.drop('Unnamed: 5',axis=1)
Data.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,Rainy,Hot,High,f,no
1,Rainy,Hot,High,t,no
2,Overcast,Hot,High,f,yes
3,Sunny,Mild,High,f,yes
4,Sunny,Cool,Normal,f,yes


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
le = LabelEncoder()

In [7]:
Data['Outlook'] = le.fit_transform(Data['Outlook'])
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,Hot,High,f,no
1,1,Hot,High,t,no
2,0,Hot,High,f,yes
3,2,Mild,High,f,yes
4,2,Cool,Normal,f,yes
5,2,Cool,Normal,t,no
6,0,Cool,Normal,t,yes
7,1,Mild,High,f,no
8,1,Cool,Normal,f,yes
9,2,Mild,Normal,f,yes


In [8]:
Data['Temp'] = le.fit_transform(Data['Temp'])
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,1,High,f,no
1,1,1,High,t,no
2,0,1,High,f,yes
3,2,2,High,f,yes
4,2,0,Normal,f,yes
5,2,0,Normal,t,no
6,0,0,Normal,t,yes
7,1,2,High,f,no
8,1,0,Normal,f,yes
9,2,2,Normal,f,yes


In [9]:
Data['Humidity'] = le.fit_transform(Data['Humidity'])
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,1,0,f,no
1,1,1,0,t,no
2,0,1,0,f,yes
3,2,2,0,f,yes
4,2,0,1,f,yes
5,2,0,1,t,no
6,0,0,1,t,yes
7,1,2,0,f,no
8,1,0,1,f,yes
9,2,2,1,f,yes


In [22]:
Data['Windy'] = le.fit_transform(Data['Windy'])
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,1,0,0,no
1,1,1,0,1,no
2,0,1,0,0,yes
3,2,2,0,0,yes
4,2,0,1,0,yes
5,2,0,1,1,no
6,0,0,1,1,yes
7,1,2,0,0,no
8,1,0,1,0,yes
9,2,2,1,0,yes


In [24]:
Data['Play'] = le.fit_transform(Data['Play'])
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,1,0,0,0
1,1,1,0,1,0
2,0,1,0,0,1
3,2,2,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,0,0,1,1,1
7,1,2,0,0,0
8,1,0,1,0,1
9,2,2,1,0,1


In [26]:
Data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,1,1,0,0,0
1,1,1,0,1,0
2,0,1,0,0,1
3,2,2,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,0,0,1,1,1
7,1,2,0,0,0
8,1,0,1,0,1
9,2,2,1,0,1


In [28]:
X = Data.drop(['Play'],axis =1)
X.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,1,1,0,0
1,1,1,0,1
2,0,1,0,0
3,2,2,0,0
4,2,0,1,0


In [31]:
y = Data['Play']
y

0     0
1     0
2     1
3     1
4     1
5     0
6     1
7     0
8     1
9     1
10    1
11    1
12    1
13    0
Name: Play, dtype: int64

In [34]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3) # 70% training and 30% test

In [36]:
X_train.shape

(9, 4)

In [38]:
X_test.shape

(5, 4)

In [40]:
y_test.shape

(5,)

In [42]:
y_train.shape

(9,)

In [46]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
#Create a Gaussian Classifier
gnb = GaussianNB()
gnb

GaussianNB(priors=None, var_smoothing=1e-09)

In [48]:
#Train the model using the training sets
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [50]:
gnb.class_count_

array([3., 6.])

In [52]:
gnb.classes_

array([0, 1], dtype=int64)

In [54]:
gnb.class_prior_

array([0.33333333, 0.66666667])

In [64]:
New_Predict = gnb.predict(X_test)
New_Predict

array([1, 1, 0, 1, 0], dtype=int64)

In [65]:
y_test

10    1
5     0
0     0
8     1
11    1
Name: Play, dtype: int64

In [61]:
gnb.score(X_train,y_train)

0.7777777777777778

In [63]:
gnb.score(X_test,y_test)

0.6

In [67]:
Mnb= MultinomialNB()

In [70]:
Mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
Mnb.class_count_

array([3., 6.])

In [74]:
Mnb.classes_

array([0, 1], dtype=int64)

In [76]:
New_Predicts = Mnb.predict(X_test)

In [82]:
New_Predicts

array([1, 1, 1, 1, 0], dtype=int64)

In [83]:
y_test

10    1
5     0
0     0
8     1
11    1
Name: Play, dtype: int64

In [81]:
Mnb.score(X_train,y_train)

0.8888888888888888

In [87]:
Mnb.score(X_test,y_test)

0.4

### Assigmnet 

In [7]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
wine = datasets.load_wine()

In [8]:
# print the names of the 13 features
print("Features: ", wine.feature_names)

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [18]:
# print the label type of wine(class_0, class_1, class_2)
print("Labels: ", wine.target_names)

Labels:  ['class_0' 'class_1' 'class_2']


In [19]:
wine.data.shape

(178, 13)

In [25]:
# print the wine data features (top 5 records)
print(wine.data[0:5])

[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
  2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
  2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
  3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
 [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
  2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
 [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
  3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]


In [26]:
# print the wine labels (0:Class_0, 1:class_2, 2:class_2)
print(wine.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


**Splitting Data**

First, you separate the columns into dependent and independent variables(or features and label). Then you split those variables into train and test set.


In [20]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3) # 70% training and 30% test

In [29]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
#Predict the response for test dataset
y_pred = gnb.predict(X_test)
y_pred[0:5]

array([1, 2, 0, 1, 1])

In [31]:
gnb.classes_

array([0, 1, 2])

In [32]:
gnb.class_count_

array([41., 46., 37.])

In [51]:
gnb.score(X_train,y_train)

0.9758064516129032

In [52]:
gnb.score(X_test,y_test)

1.0

In [47]:
gnb.predict_proba(X_test)

array([[2.94940091e-14, 9.99992762e-01, 7.23843547e-06],
       [8.29313765e-23, 9.11384052e-17, 1.00000000e+00],
       [1.00000000e+00, 4.47125742e-10, 2.24788211e-33],
       [8.80879152e-13, 1.00000000e+00, 1.50868061e-14],
       [8.57787163e-07, 9.99999142e-01, 3.37479318e-21],
       [9.99999104e-01, 8.96032202e-07, 5.57656047e-33],
       [9.99998631e-01, 1.36857578e-06, 1.70919812e-28],
       [6.91930077e-23, 8.05908694e-13, 1.00000000e+00],
       [2.31010743e-07, 9.99999769e-01, 1.92995563e-20],
       [9.99991524e-01, 8.47626652e-06, 2.93785857e-29],
       [9.99999936e-01, 6.35084505e-08, 5.73995325e-35],
       [5.14685431e-19, 8.04449891e-09, 9.99999992e-01],
       [9.99999998e-01, 2.16388527e-09, 1.98164687e-25],
       [3.14585981e-25, 2.34203419e-08, 9.99999977e-01],
       [6.15241085e-02, 9.38475891e-01, 5.78491265e-31],
       [8.50244234e-07, 9.99999150e-01, 1.48115157e-19],
       [2.87775781e-25, 6.09794985e-16, 1.00000000e+00],
       [5.62936192e-07, 9.99999

In [53]:
import numpy as np
gnb.partial_fit(X_train,y_train,np.unique(y_train))

GaussianNB(priors=None, var_smoothing=1e-09)

In [54]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [55]:
## Classification MultinomialNB
multinomial_NB = MultinomialNB()
multinomial_NB

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [57]:
multinomial_NB.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
multinomial_NB.class_count_

array([41., 46., 37.])

In [59]:
multinomial_NB.partial_fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
Y_predict = multinomial_NB.predict(X_test)
Y_predict[0:5]

array([1, 2, 0, 2, 1])

In [61]:
multinomial_NB.score(X_train,y_train)

0.8548387096774194

In [62]:
metrics.accuracy_score(y_test,Y_predict)

0.8333333333333334

**Advantages**

* It is not only a simple approach but also a fast and accurate method for prediction.
Naive Bayes has very low computation cost.
* It can efficiently work on a large dataset.
* It performs well in case of discrete response variable compared to the continuous variable.
* It can be used with multiple class prediction problems.
* It also performs well in the case of text analytics problems.
* When the assumption of independence holds, a Naive Bayes classifier performs better compared to other models like logistic regression.

**Disadvantages**

* The assumption of independent features. In practice, it is almost impossible that model will get a set of predictors which are entirely independent.
* If there is no training tuple of a particular class, this causes zero posterior probability. In this case, the model is unable to make predictions. This problem is known as Zero Probability/Frequency Problem.

**Applications**

The Naive Bayes algorithm is used in multiple real-life scenarios such as

1. **Text classification:** It is used as a probabilistic learning method for text classification. The Naive Bayes classifier is one of the most successful known algorithms when it comes to the classification of text documents, i.e., whether a text document belongs to one or more categories (classes).
2. **Spam filtration:** It is an example of text classification. This has become a popular mechanism to distinguish spam email from legitimate email. Several modern email services imp/7/
lement Bayesian spam filtering.
Many server-side email filters, such as DSPAM, SpamBayes, SpamAssassin, Bogofilter, and ASSP, use this technique.
3. **Sentiment Analysis:** It can be used to analyze the tone of tweets, comments, and reviews—whether they are negative, positive or neutral.
4. **Recommendation System:** The Naive Bayes algorithm in combination with collaborative filtering is used to build hybrid recommendation systems which help in predicting if a user would like a given resource or not.