<a href="https://colab.research.google.com/github/Mjcherono/TrialProjects/blob/main/Python_Programming_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: Naive Bayes

## Example 1: Gaussian Naive Bayes Classifier

In [1]:
# Example 1
# ---
# This type of classifier makes the assumption of normal distribution 
# thus can be best used in cases when all our features are continuous.
# ---
# Question: Predict the species of flower using 4 different features.
# ---
# 


In [2]:
# Load libraries and datasets to be used in this example
#
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [3]:
# Loading our data from python datasets
# 
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Splitting our data into a training set and a test set
# 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6) 

In [6]:
# Training our model
# 
clf = GaussianNB()  
model = clf.fit(X_train, y_train) 

In [7]:
# Predicting our test predictors
clf = GaussianNB()  
model = clf.fit(X_train, y_train) 
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

0.9333333333333333


In [8]:
# Predicting a new observation
new_observation = [[ 10,  3,  4,  0.4]]

new_prediction = model.predict(new_observation)
new_prediction

array([1])

## Example 2: Multinomial Naive Bayes Classifier

In [9]:
# Example 2
# ---
# While working with the multinomial naive bayes classifier, the features are assumed to be multinomially distributed. 
# This would mean that this type of classifier is commonly used when we have discrete data (e.g. movie ratings 1 and 5).
# Let us see how this works.
# ----
# Question: Build a model to predict whether an sms message is spam or not.
# ---
# Dataset url = http://bit.ly/SpamCollectionDataset
# ---
# 

In [10]:
# Importing our libraries 

# Importing pandas
import pandas as pd

# Importing numpy
import numpy as np

# We will also download and import nlkt which is a tokenizer. 
# This library will help us break (messages) into individual linguistic units i.e. words.
#
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Loading and previewing our dataset
# 
df = pd.read_csv('http://bit.ly/SpamCollectionDataset', sep='\t',  header = None, names = ['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Pre-processing
# We will first emoving useless variance for our task at hand 
# 

# Converting the labels from strings to binary values for our classifier
# 
df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Converting all characters in the message to lower case
# 
df['message'] = df.message.map(lambda x: x.lower())

# Removing any punctuation
# 
df['message'] = df.message.str.replace('[^\w\s]', '')

In [13]:
# Pre-processing 
# Tokenizing the messages into into single words using nltk. 

# Applying the tokenization
# 
df['message'] = df['message'].apply(nltk.word_tokenize)

In [14]:
# Fifth, we will perform some word stemming. 
# The idea of stemming is to normalize our text for all variations of words carry the same meaning, 
# regardless of the tense. One of the most popular stemming algorithms is the Porter Stemmer:
# 
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
 
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])

In [15]:
# Finally, we will transform the data into occurrences, 
# which will be the features that we will feed into our model
# 
from sklearn.feature_extraction.text import CountVectorizer

# This converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])

In [16]:
# We could leave it as the simple word-count per message, but it is better to use Term Frequency Inverse Document Frequency, more known as tf-idf
# 
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer().fit(counts)

counts = transformer.transform(counts)

In [17]:
# Training the Model
# Now that we have performed feature extraction from our data, it is time to build our model. 
# We will start by splitting our data into training and test sets
# 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69)

In [18]:
# Fitting our model 
# Then, all that we have to do is initialize the Naive Bayes Classifier and fit the data. 
# For text classification problems, the Multinomial Naive Bayes Classifier is well-suited
# 
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

In [19]:
# Evaluating the Model
# Once we have put together our classifier, we can evaluate its performance in the testing set
# 
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

0.9480286738351255


## Example 3: Bernoulli Naive Bayes Classifier

In [20]:
# Example 3
# ---
# Question: It is rare to get a scenario where you have to use the Bernoulli Naive Bayes Classifier. 
# However, such a case would assume that all our features are binary, 
# that is they take only two values (e.g. a nominal categorical feature that has been one-hot encoded).
# In the following example we will generate a dataset to demonstrate the use of this Classifier.
# ---
# 


In [21]:
# Importing our libraries
# 
import numpy as np
from sklearn.naive_bayes import BernoulliNB

In [22]:
# Creating binary features and target data
# 
# Creating three binary features
X = np.random.randint(2, size=(100, 3))

# Creating a binary target vector
y = np.random.randint(2, size=(100, 1)).ravel()

In [23]:
# Viewing first ten observations
# 
X[0:10]

array([[0, 1, 1],
       [0, 0, 0],
       [1, 1, 0],
       [1, 1, 0],
       [0, 0, 0],
       [1, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 1, 1]])

In [24]:
# Training our Bernoulli Naive Bayes Classifier
# 
# Creating oour Bernoulli Naive Bayes object with prior probabilities of each class
clf = BernoulliNB()

# Train model
model = clf.fit(X, y)

# model score
model.score(X, y)

0.52

## <font color="green">Challenge 1</font>

In [25]:
# Challenge 1
# ---
# Question: Build a model to determine whether a mushroom is edible.
# ---
# Dataset url = http://bit.ly/MushroomDataset
# 
mushrooms = pd.read_csv('http://bit.ly/MushroomDataset')
mushrooms.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,mushroom
0,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
2,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE


In [26]:
mushrooms.columns

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat', 'mushroom'],
      dtype='object')

In [27]:
#label encode columns
from sklearn.preprocessing import LabelEncoder

label_object = {}
categorical_columns = ['cap-shape','cap-surface', 'cap-color', 'bruises', 'odor','gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring','stalk-surface-below-ring', 'stalk-color-above-ring','stalk-color-below-ring',
      'veil-type', 'veil-color', 'ring-number','ring-type', 'spore-print-color', 'population', 'habitat', 'mushroom']
for col in categorical_columns:
  labelencoder = LabelEncoder()
  labelencoder.fit(mushrooms[col])
  mushrooms[col] = labelencoder.fit_transform(mushrooms[col])
  label_object[col] = labelencoder

In [28]:
mushrooms.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,mushroom
0,2,3,8,0,0,1,1,1,10,1,1,3,3,7,7,0,2,1,4,6,4,6,0
1,2,3,8,0,0,1,1,1,10,1,1,3,3,7,7,0,2,1,4,1,4,6,0
2,2,3,8,0,0,1,1,1,7,1,1,3,3,7,7,0,2,1,4,6,4,6,0


In [29]:
#split features and labels
X = mushrooms.drop('mushroom',axis=1).values
y = mushrooms['mushroom'].values

#split to train and test
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# Creating oour Bernoulli Naive Bayes object with prior probabilities of each class
clf = BernoulliNB()

# Train model
clf.fit(X_train, y_train)

#Making predictions
y_pred = clf.predict(X_test)

#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[792 106]
 [129 657]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       898
           1       0.86      0.84      0.85       786

    accuracy                           0.86      1684
   macro avg       0.86      0.86      0.86      1684
weighted avg       0.86      0.86      0.86      1684



## <font color="green">Challenge 2</font> 

In [52]:
# Challenge 2
# ---
# Question: Given the following two datasets, build a model to determine whether a passenger survived or not.
# ---
# Train Dataset url = http://bit.ly/TitanicDatasetTrain
# Test Dataset url = http://bit.ly/TitanicDatasetTest
# ---
# 
titanic = pd.read_csv('http://bit.ly/TitanicDatasetTrain')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [57]:
#drop irrelevant columns
titanic.drop(['PassengerId','Name','Ticket','Cabin'], 1,inplace=True)

In [58]:
#function finds null value and replaces it wth avg age
def age_approx(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age

In [59]:
titanic['Age'] = titanic[['Age', 'Pclass']].apply(age_approx, axis=1)
titanic.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [60]:
gender = pd.get_dummies(titanic['Sex'],drop_first=True)
gender.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [61]:
embark_location = pd.get_dummies(titanic['Embarked'],drop_first=True)
embark_location.head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [62]:
titanic.drop(['Sex', 'Embarked'],axis=1,inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [63]:
titanic_dmy = pd.concat([titanic,gender,embark_location],axis=1)
titanic_dmy.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [64]:
# Splitting our dataset

X = titanic_dmy.drop("Survived",axis=1)
y = titanic_dmy["Survived"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=25)

In [67]:
# Creating oour Bernoulli Naive Bayes object with prior probabilities of each class
clf = GaussianNB()  
#clf = BernoulliNB()

# Train model
clf.fit(X_train, y_train)

#Making predictions
y_pred = clf.predict(X_test)

#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[138  27]
 [ 28  75]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.83       165
           1       0.74      0.73      0.73       103

    accuracy                           0.79       268
   macro avg       0.78      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268



In [32]:
# Challenge 3
# ---
# Question: Build a model to classify a type of glass given the following dataset.
# ---
# Dataset url = http://bit.ly/GlassDatasetB
# Dataset info:
# Type of glass: (class) 
# -) 1 window glass (from vehicle or building) 
# -) 2 not window glass (containers, tableware, or headlamps)
# ---
# 
glasses = pd.read_csv('http://bit.ly/GlassDatasetB')
glasses

Unnamed: 0,1,1.51824,12.87,3.48,1.29,72.95,0.6,8.43,0,0.1,1.1
0,2,1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0.00,0.00,1
1,3,1.51747,12.84,3.50,1.14,73.27,0.56,8.55,0.00,0.00,1
2,4,1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1
3,5,1.51768,12.65,3.56,1.30,73.08,0.61,8.69,0.00,0.14,1
4,6,1.51769,12.45,2.71,1.29,73.70,0.56,9.06,0.00,0.24,1
...,...,...,...,...,...,...,...,...,...,...,...
194,196,1.52315,13.44,3.34,1.23,72.38,0.60,8.83,0.00,0.00,2
195,197,1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0.00,0.32,1
196,198,1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,1
197,199,1.51905,13.60,3.62,1.11,72.64,0.14,8.76,0.00,0.00,1


## <font color="green">Challenge 4</font> 

In [38]:
# Challenge 4
# ---
# Question: Build a classifier to help determine whether future patients do or do not have heart disease.
# ---
# Dataset url = http://bit.ly/HeartDatasetNB
# 
patients = pd.read_csv('http://bit.ly/HeartDatasetNB')
patients.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [39]:
X = patients.drop('target',axis=1).values
y = patients['target'].values

In [40]:
#split to train and test
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# Creating oour Bernoulli Naive Bayes object with prior probabilities of each class
clf = GaussianNB()  

# Train model
clf.fit(X_train, y_train)

#Making predictions
y_pred = clf.predict(X_test)

#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[21  6]
 [ 3 31]]
              precision    recall  f1-score   support

           0       0.88      0.78      0.82        27
           1       0.84      0.91      0.87        34

    accuracy                           0.85        61
   macro avg       0.86      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61

