# <center> Naive Bayes Algorithm For Text Classification </center>

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB


# 1. Gaussian Naive Bayes

* Used when the features are continuous
* Assumes each feature follows a normal (Gaussian) distribution within each class

### Load the data

In [2]:
# Read the Iris dataset from a CSV file (path is relative to the working directory).
iris_data = pd.read_csv(filepath_or_buffer='Iris.csv')
# Preview the first five rows to check the available columns.
iris_data.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Target: the species label the classifier has to predict.
y = iris_data['Species']
# Features: every remaining column except the row identifier `Id`.
x = iris_data.drop(columns=['Id', 'Species'])

### Create train and test sets

In [4]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=56,
)

### Implement Gaussian Naive Bayes

In [6]:
# Gaussian Naive Bayes classifier created with scikit-learn's default settings.
# GaussianNB is imported in the notebook's top-level import cell.
naive_bayes = GaussianNB()

In [7]:
# Fit on the training split, then predict labels for the held-out rows.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = naive_bayes.fit(x_train, y_train).predict(x_test)

In [8]:
# Fraction of held-out rows whose predicted label equals the true label.
# accuracy_score is imported in the notebook's top-level import cell.
accuracy_score(y_test, predictions)

0.9473684210526315

# 2. Multinomial Naive Bayes

* Used when the features represent frequencies (e.g. word counts)
* Ignores non-occurrences of features
* Works well for text classification problems

### Load the dataset

In [9]:
# Read the labelled tweets from a CSV file (path is relative to the working directory).
tweets_data = pd.read_csv(filepath_or_buffer='tweets.csv')
# Preview the first five rows to check the available columns.
tweets_data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [10]:
# The raw tweet text is the only feature; the `label` column is the target.
x, y = tweets_data['tweet'], tweets_data['label']

### Create train and test sets

In [11]:
# Hold out 25% of the tweets for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=56,
)

### Create bag-of-words

In [12]:
# Bag-of-words vectorizer; English stop words are excluded from the vocabulary.
# CountVectorizer is imported in the notebook's top-level import cell.
count_vector = CountVectorizer(stop_words='english')

In [13]:
# Learn the vocabulary from the training tweets only and encode them as token counts.
training_data = count_vector.fit_transform(raw_documents=x_train)
# Encode the test tweets with that same vocabulary (no refitting on the test split).
testing_data = count_vector.transform(raw_documents=x_test)

In [14]:
# Show the matrix summary (shape, dtype, number of stored elements) as the cell output.
training_data

<5940x18541 sparse matrix of type '<class 'numpy.int64'>'
	with 79943 stored elements in Compressed Sparse Row format>

### Implement Multinomial Naive Bayes

In [15]:
# Multinomial Naive Bayes classifier created with scikit-learn's default settings.
# MultinomialNB is imported in the notebook's top-level import cell.
naive_bayes = MultinomialNB()

In [16]:
# Fit on the count matrix of the training tweets, then label the encoded test tweets.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = naive_bayes.fit(training_data, y_train).predict(testing_data)

In [17]:
# Fraction of held-out tweets whose predicted label equals the true label.
# accuracy_score is already in scope from an earlier cell, so it is not re-imported here.
accuracy_score(y_test, predictions)

0.8873737373737374

# 3. Bernoulli Naive Bayes

* Used when the features are binary (a term is either present or absent)
* Penalizes the non-occurrence of features

### Load the dataset

In [18]:
# Re-read the tweets CSV so this section starts again from the raw data.
tweets_data = pd.read_csv(filepath_or_buffer='tweets.csv')
# Preview the first five rows to check the available columns.
tweets_data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [19]:
# The raw tweet text is the only feature; the `label` column is the target.
x, y = tweets_data['tweet'], tweets_data['label']

### Create train and test sets

In [20]:
# Hold out 33% of the tweets for evaluation; the fixed seed keeps the split reproducible.
# NOTE: the Multinomial section splits with test_size=0.25 / random_state=56, so the
# accuracy reported here is measured on a different held-out set than that one.
# train_test_split is already imported in the notebook's first cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# binary=True records only whether a term occurs in a tweet, not how many times;
# English stop words are excluded from the vocabulary.
count_vector = CountVectorizer(binary=True, stop_words='english')

In [23]:
# Learn the vocabulary from the training tweets only and encode them as presence flags.
training_data = count_vector.fit_transform(raw_documents=x_train)
# Encode the test tweets with that same vocabulary (no refitting on the test split).
testing_data = count_vector.transform(raw_documents=x_test)

### Implement Bernoulli Naive Bayes

In [24]:
# Bernoulli Naive Bayes classifier created with scikit-learn's default settings.
# BernoulliNB is imported in the notebook's top-level import cell.
naive_bayes = BernoulliNB()

In [25]:
# Fit on the binary term matrix of the training tweets, then label the encoded test tweets.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = naive_bayes.fit(training_data, y_train).predict(testing_data)

In [26]:
# Fraction of held-out tweets whose predicted label equals the true label.
# accuracy_score is already in scope from an earlier cell, so it is not re-imported here.
accuracy_score(y_test, predictions)

0.882938026013772