# **SKLEARN**

In [71]:
import pandas as pd
msg=pd.read_csv('data.csv',names=['message','label'])
print('Total instances of the dataset:',msg.shape[0])
msg['labelnum']=msg.label.map({'pos':1,'neg':0})
X=msg.message
Y=msg.labelnum
print('The message and its label of first 5 instances are listed below')
X5, Y5 =X[0:5], msg.label[0:5]
for x, y in zip(X5,Y5):
  print(x, ',', y)

from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X, Y)
print('Dataset is split into Training and Testing samples')
print ('the total number of Training Data :',xtrain.shape[0])
print ('the total number of Test Data :',xtest.shape[0])
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)
xtest_dtm=cv.transform(xtest)
print('Total features extracted using CountVectorizer:',xtrain_dtm.shape[1])
print('Features for first 5 training instances are listed below')
df=pd.DataFrame(xtrain_dtm.toarray(),columns=cv.get_feature_names())
print(df[0:5])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)
print('Classification results of testing samples are given below')
for doc,p in zip(xtest, predicted):
  pred = 'pos' if p==1 else 'neg'
  print('%s-> %s'%(doc,pred))
from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('The value of Precision', metrics.precision_score(ytest,predicted))
print('The value of Recall', metrics.recall_score(ytest,predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))

Total instances of the dataset: 33
The message and its label of first 5 instances are listed below
This is a nice restaurant , pos
I wrote the test well , pos
I love playing , pos
This is a good painting , pos
I did a good thing , pos
Dataset is split into Training and Testing samples
the total number of Training Data : 24
the total number of Test Data : 9
Total features extracted using CountVectorizer: 40
Features for first 5 training instances are listed below
   am  amazing  an  anything  are  being  cannot  care  did  do  ...  she  \
0   0        0   0         0    0      0       0     0    0   0  ...    0   
1   1        0   0         0    0      0       0     0    0   0  ...    0   
2   0        0   0         0    0      0       0     0    1   0  ...    0   
3   0        0   0         0    0      0       0     0    0   0  ...    0   
4   0        0   1         0    1      0       0     0    0   0  ...    0   

   still  talking  thing  this  to  well  what  why  you  
0      0   

# **GAUSSIAN NAIVE BAYES**





```
Step 1: Separate By Class.
Step 2: Summarize Dataset.
Step 3: Summarize Data By Class.
Step 4: Gaussian Probability Density Function.
Step 5: Class Probabilities.

		Bayes Theorem:
										              Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											                  Predictor prior probability
				
							  			       P(x|c) * p(c)
							   P(c|x) = ------------------ 
											            P(x)
		Gaussian Naive Bayes:
							         1								
				P(x|c) = --------------------------- * exp(- (x - mean)^2 / 2*(var(x)^2)))
						   sqrt(2 * pi * var(x)^2)
```
Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)
Where P(class|data) is the probability of class given the provided data.



[Reference for scratch implementation](https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/)

In [18]:
from math import sqrt
from math import pi
from math import exp

**Step 1: Separate By Class.**

---



In [19]:
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

In [20]:
dataset = [[3.393533211,2.331273381,0],
	[3.110073483,1.781539638,0],
	[1.343808831,3.368360954,0],
	[3.582294042,4.67917911,0],
	[2.280362439,2.866990263,0],
	[7.423436942,4.696522875,1],
	[5.745051997,3.533989803,1],
	[9.172168622,2.511101045,1],
	[7.792783481,3.424088941,1],
	[7.939820817,0.791637231,1]]
separated = separate_by_class(dataset)
for label in separated:
	print(label)
	for row in separated[label]:
		print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


**Step 2: Summarize Dataset**

---



In [21]:
from math import sqrt

# Calculate the mean of a list of numbers
def mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
	del(summaries[-1])
#  remove the statistics for the class variable as we will not need these statistics
	return summaries

In [22]:
dataset = [[3.393533211,2.331273381,0],
	[3.110073483,1.781539638,0],
	[1.343808831,3.368360954,0],
	[3.582294042,4.67917911,0],
	[2.280362439,2.866990263,0],
	[7.423436942,4.696522875,1],
	[5.745051997,3.533989803,1],
	[9.172168622,2.511101045,1],
	[7.792783481,3.424088941,1],
	[7.939820817,0.791637231,1]]
summary = summarize_dataset(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


**Step 3: Summarize Data By Class.**

---

In [23]:
# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
	separated = separate_by_class(dataset)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

In [24]:
# Test summarizing by class
dataset = [[3.393533211,2.331273381,0],
	[3.110073483,1.781539638,0],
	[1.343808831,3.368360954,0],
	[3.582294042,4.67917911,0],
	[2.280362439,2.866990263,0],
	[7.423436942,4.696522875,1],
	[5.745051997,3.533989803,1],
	[9.172168622,2.511101045,1],
	[7.792783481,3.424088941,1],
	[7.939820817,0.791637231,1]]
summary = summarize_by_class(dataset)
for label in summary:
	print(label)
	for row in summary[label]:
		print(row)

0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


**Step 4: Gaussian Probability Density Function.**

---


In [25]:
def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
	return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [26]:
print(calculate_probability(1.0, 1.0, 1.0))
print(calculate_probability(2.0, 1.0, 1.0))
print(calculate_probability(0.0, 1.0, 1.0))

0.3989422804014327
0.24197072451914337
0.24197072451914337


**Step 5: Class Probabilities.**

---



P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)

In [27]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, count = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

In [28]:
dataset = [[3.393533211,2.331273381,0],
	[3.110073483,1.781539638,0],
	[1.343808831,3.368360954,0],
	[3.582294042,4.67917911,0],
	[2.280362439,2.866990263,0],
	[7.423436942,4.696522875,1],
	[5.745051997,3.533989803,1],
	[9.172168622,2.511101045,1],
	[7.792783481,3.424088941,1],
	[7.939820817,0.791637231,1]]
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)

{0: 0.05032427673372076, 1: 0.00011557718379945765}


In [29]:
import pandas as pd
from math import sqrt
from random import seed,randrange
from sklearn.metrics import classification_report,confusion_matrix

In [30]:
def str_column_to_float(dataset,column):
  for row in dataset:
    if(isinstance(row[column],str)):
      row[column]=float(row[column].strip())

In [31]:
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

In [32]:
data = pd.read_csv('iris.csv')
data=data.values.tolist()
for i in range(len(data[0])-1):
	str_column_to_float(data, i)
# convert class column to integers
str_column_to_int(data, len(data[0])-1)
print(data[0])

[4.9, 3.0, 1.4, 0.2, 0]


In [33]:
def accuracy_metric(actual,predicted):
  correct=0
  for i in range(len(actual)):
    if actual[i]==predicted[i]:
      correct+=1
  print(confusion_matrix(actual,predicted))
  print(classification_report(actual,predicted))
  return correct/float(len(actual))*100.0

In [34]:
def cross_validation_split(data, folds=3):
	dataset_split = []
	dataset_copy = data
	fold_size = int(len(data) / folds)
	for i in range(folds):
		fold = []
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split 

In [35]:

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label
 
# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

In [36]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = naive_bayes(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

In [37]:
n_folds = 5
scores = evaluate_algorithm(data,naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

[[12  0  0]
 [ 0  9  0]
 [ 0  0  8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         8

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29

[[ 9  0  0]
 [ 0 11  0]
 [ 0  0  9]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00         9

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29

[[ 9  0  0]
 [ 0 10  2]
 [ 0  2  6]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1      

In [38]:
model = summarize_by_class(dataset)
# define a new record
row = [5.7,2.9,4.2,1.3]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1


# **BINOMIAL NAIVE BAYES**

```
Bayes Theorem:
										              Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											                Predictor prior probability
				
							  			     P(x|c) * p(c)
							   P(c|x) = ------------------ 
											          P(x)

```



In [41]:

import pandas as pd
data = pd.read_csv('iris.csv')
data=data.values.tolist()
for i in range(len(data[0])-1):
	str_column_to_float(data, i)
# convert class column to integers
str_column_to_int(data, len(data[0])-1)

dataset=data

In [42]:
import numpy as np
dataset = [
           [0,0,1,0,0],
           [0,0,1,1,0],
           [1,0,1,0,1],
           [2,1,1,0,1],
           [2,2,0,0,1],
           [2,2,0,1,0],
           [1,2,0,1,1],
           [0,1,1,0,0],
           [0,2,0,0,1],
           [2,1,0,0,1],
           [0,1,0,1,1],
           [1,1,1,1,1],
           [1,0,0,0,1],
           [2,1,1,1,0]
           ]
mp = dict()
for i in range(len(dataset)):
    row = dataset[i]
    y = row[-1]
    if (y not in mp):
        mp[y] = list()
    mp[y].append(row)

test = [2,3,1,0]

probYes = 1

count = 0
total = 0
for row in dataset:
    if(row[-1] == 1):
        count+=1
    total+=1
print("Total yes: "+str(count)+" / "+str(total))
probYes = count/total
l=[]
for i in range(len(test)):
    count = 0
    total = 0
    for row in mp[1]:
        if(test[i] == row[i]):
            count += 1
    l.append(count)

    print('for feature '+str(i+1))
    print(str(count)+" / "+str(total))
  
ll=[i+1 for i in l if 0 in l]

for i in ll:
  probYes *= i/len(mp[1])

probNo = 1
count = 0
total = 0
for row in dataset:
    if(row[-1] == 0):
        count+=1
    total+=1
probNo = count/total
print("Total no: "+str(count)+" / "+str(total))
# for i in range(len(test)):
#     count = 0
#     total = 0
#     for row in mp[0]:
#         if(test[i] == row[i]):
#             count += 1
#         total += 1
#     print('for feature '+str(i+1))
#     print(str(count)+" / "+str(total))
#     probNo *= count/total

l=[]
for i in range(len(test)):
    count = 0
    total = 0
    for row in mp[0]:
        if(test[i] == row[i]):
            count += 1
    l.append(count)

    print('for feature '+str(i+1))
    print(str(count)+" / "+str(total))
  
ll=[i+1 for i in l if 0 in l]

for i in ll:
  probNo *= i/len(mp[1])

print(probYes)
print(probNo)

prob = probYes/(probYes+probNo)
print("Probability of playing golf: "+str(prob*100)+"%")

Total yes: 9 / 14
for feature 1
3 / 0
for feature 2
0 / 0
for feature 3
3 / 0
for feature 4
6 / 0
Total no: 5 / 14
for feature 1
2 / 0
for feature 2
0 / 0
for feature 3
4 / 0
for feature 4
2 / 0
0.010973936899862825
0.0024495394865765236
Probability of playing golf: 81.75182481751824%


# **BINOMIAL NAIVE BAYES**





```

		Bayes Theorem:
										              Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											                  Predictor prior probability
				
							  			       P(x|c) * p(c)
							   P(c|x) = ------------------ 
											            P(x)
```
Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)
Where P(class|data) is the probability of class given the provided data.



P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)

In [56]:
def separated_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

In [57]:
def calculate_class_probabilities(dataset,summaries,row):
  for label in summaries:
    prob=1
    prob = len(summaries[label])/len(dataset)
    labl=0
    value=0
    likehood=[]
    for i in range(len(row)-1):
      count=0
      for r in summaries[label]:
        if(r[i]==row[i]):
          count+=1
        likehood.append(count)
      # print(f'for feature {i} {count}/{len(summaries[label])}')
    likehood_update=[i+1 for i in likehood if 0 in likehood]
    if 0 in likehood:
      for i in likehood_update:
        prob*=i/(len(summaries[label]))
    else:
      for i in likehood_update:
        prob*=i/sum(likehood_update)
    if(prob>value):

      value=prob
      labl=label
  
  return labl

In [58]:
import pandas as pd
from math import sqrt
from random import seed,randrange
from sklearn.metrics import classification_report,confusion_matrix

In [59]:
def str_column_to_float(dataset,column):
  for row in dataset:
    if(isinstance(row[column],str)):
      row[column]=float(row[column].strip())

In [60]:
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

In [61]:
data=[
           [0,0,1,0,0],
           [0,0,1,1,0],
           [1,0,1,0,1],
           [2,1,1,0,1],
           [2,2,0,0,1],
           [2,2,0,1,0],
           [1,2,0,1,1],
           [0,1,1,0,0],
           [0,2,0,0,1],
           [2,1,0,0,1],
           [0,1,0,1,1],
           [1,1,1,1,1],
           [1,0,0,0,1],
           [2,1,1,1,0]
           ]


In [62]:
def accuracy_metric(actual,predicted):
  print(actual,"---------------\n",predicted)
  correct=0
  for i in range(len(actual)):
    if actual[i]==predicted[i]:
      correct+=1
  # print(confusion_matrix(actual,predicted))
  # print(classification_report(actual,predicted))
  return correct/float(len(actual))*100.0

In [63]:
def cross_validation_split(data, folds=3):
	dataset_split = []
	dataset_copy = data
	fold_size = int(len(data) / folds)
	for i in range(folds):
		fold = []
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split 

In [64]:

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = separated_by_class(train)
	predictions = list()
	for row in test:
		output = calculate_class_probabilities(train,summarize, row)
		predictions.append(output)
	return(predictions)

In [65]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	print(dataset)
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			# row_copy[-1] = None
		print(train_set)
		print(test_set)
		predicted = naive_bayes(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

In [66]:

import pandas as pd
data = pd.read_csv('iris.csv')
data=data.values.tolist()
for i in range(len(data[0])-1):
	str_column_to_float(data, i)
# convert class column to integers
str_column_to_int(data, len(data[0])-1)

dataset=data

n_folds = 5
scores = evaluate_algorithm(data,naive_bayes, n_folds)
print('Scores: %s' % scores)
# print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

[[4.9, 3.0, 1.4, 0.2, 0], [4.7, 3.2, 1.3, 0.2, 0], [4.6, 3.1, 1.5, 0.2, 0], [5.0, 3.6, 1.4, 0.2, 0], [5.4, 3.9, 1.7, 0.4, 0], [4.6, 3.4, 1.4, 0.3, 0], [5.0, 3.4, 1.5, 0.2, 0], [4.4, 2.9, 1.4, 0.2, 0], [4.9, 3.1, 1.5, 0.1, 0], [5.4, 3.7, 1.5, 0.2, 0], [4.8, 3.4, 1.6, 0.2, 0], [4.8, 3.0, 1.4, 0.1, 0], [4.3, 3.0, 1.1, 0.1, 0], [5.8, 4.0, 1.2, 0.2, 0], [5.7, 4.4, 1.5, 0.4, 0], [5.4, 3.9, 1.3, 0.4, 0], [5.1, 3.5, 1.4, 0.3, 0], [5.7, 3.8, 1.7, 0.3, 0], [5.1, 3.8, 1.5, 0.3, 0], [5.4, 3.4, 1.7, 0.2, 0], [5.1, 3.7, 1.5, 0.4, 0], [4.6, 3.6, 1.0, 0.2, 0], [5.1, 3.3, 1.7, 0.5, 0], [4.8, 3.4, 1.9, 0.2, 0], [5.0, 3.0, 1.6, 0.2, 0], [5.0, 3.4, 1.6, 0.4, 0], [5.2, 3.5, 1.5, 0.2, 0], [5.2, 3.4, 1.4, 0.2, 0], [4.7, 3.2, 1.6, 0.2, 0], [4.8, 3.1, 1.6, 0.2, 0], [5.4, 3.4, 1.5, 0.4, 0], [5.2, 4.1, 1.5, 0.1, 0], [5.5, 4.2, 1.4, 0.2, 0], [4.9, 3.1, 1.5, 0.1, 0], [5.0, 3.2, 1.2, 0.2, 0], [5.5, 3.5, 1.3, 0.2, 0], [4.9, 3.1, 1.5, 0.1, 0], [4.4, 3.0, 1.3, 0.2, 0], [5.1, 3.4, 1.5, 0.2, 0], [5.0, 3.5, 1.3, 0.3, 0],

In [70]:
# import csv
# import random
# import math
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
 
# def loadcsv(dataset):
# 	# lines = csv.reader(open(filename, "r"));
# 	# dataset = list(data)
# 	for i in range(len(dataset)):
#        #converting strings into numbers for processing
# 		dataset[i] = [float(x) for x in dataset[i]]
        
# 	return dataset
 
# def splitdataset(dataset, splitratio):
#     #67% training size
# 	trainsize = int(len(dataset) * splitratio);
# 	trainset = []
# 	copy = list(dataset);    
# 	while len(trainset) < trainsize:
# #generate indices for the dataset list randomly to pick ele for training data
# 		index = random.randrange(len(copy));       
# 		trainset.append(copy.pop(index))    
# 	return [trainset, copy]
 
# def separatebyclass(dataset):
# 	separated = {} #dictionary of classes 1 and 0 
# #creates a dictionary of classes 1 and 0 where the values are 
# #the instances belonging to each class
# 	for i in range(len(dataset)):
# 		vector = dataset[i]
# 		if (vector[-1] not in separated):
# 			separated[vector[-1]] = []
# 		separated[vector[-1]].append(vector)
# 	return separated
 
# def mean(numbers):
# 	return sum(numbers)/float(len(numbers))
 
# def stdev(numbers):
# 	avg = mean(numbers)
# 	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
# 	return math.sqrt(variance)
 
# def summarize(dataset): #creates a dictionary of classes
# 	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)];
# 	del summaries[-1] #excluding labels +ve or -ve
# 	return summaries
 
# def summarizebyclass(dataset):
# 	separated = separatebyclass(dataset); 
#     #print(separated)
# 	summaries = {}
# 	for classvalue, instances in separated.items(): 
# #for key,value in dic.items()
# #summaries is a dic of tuples(mean,std) for each class value        
# 		summaries[classvalue] = summarize(instances) #summarize is used to cal to mean and std
# 	return summaries
 
# def calculateprobability(x, mean, stdev):
# 	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
# 	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
 
# def calculateclassprobabilities(summaries, inputvector):
# 	probabilities = {} # probabilities contains the all prob of all class of test data
# 	for classvalue, classsummaries in summaries.items():#class and attribute information as mean and sd
# 		probabilities[classvalue] = 1
# 		for i in range(len(classsummaries)):
# 			mean, stdev = classsummaries[i] #take mean and sd of every attribute for class 0 and 1 seperaely
# 			x = inputvector[i] #testvector's first attribute
# 			probabilities[classvalue] *= calculateprobability(x, mean, stdev);#use normal dist
# 	return probabilities
			
# def predict(summaries, inputvector): #training and test data is passed
# 	probabilities = calculateclassprobabilities(summaries, inputvector)
# 	bestLabel, bestProb = None, -1
# 	for classvalue, probability in probabilities.items():#assigns that class which has he highest prob
# 		if bestLabel is None or probability > bestProb:
# 			bestProb = probability
# 			bestLabel = classvalue
# 	return bestLabel
 
# def getpredictions(summaries, testset):
# 	predictions = []
# 	for i in range(len(testset)):
# 		result = predict(summaries, testset[i])
# 		predictions.append(result)
# 	return predictions
 
# def getaccuracy(testset, predictions):
# 	correct = 0
# 	for i in range(len(testset)):
# 		if testset[i][-1] == predictions[i]:
# 			correct += 1
# 	return (correct/float(len(testset))) * 100.0
 
# def main():
# 		msg=pd.read_csv('data.csv',names=['message','label'])
# 		msg['labelnum']=msg.label.map({'pos':1,'neg':0})
# 		X=msg.message
# 		Y=list(msg.label)
# 		print(list(Y))
# 		vectorizer = CountVectorizer()
# 		matrix = vectorizer.fit_transform(X)
# 		filename = 'naviedata.csv'
# 		splitratio = 0.67
# 		df = pd.DataFrame(data=matrix.toarray(),columns = vectorizer.get_feature_names())
		
# 		df["result"]=Y
# 		dataset=df.values.tolist()
# 		dataset=loadcsv(dataset)
# 		trainingset, testset = splitdataset(dataset, splitratio) 
# 		print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingset), len(testset)))
# 		# prepare model
# 		summaries = summarizebyclass(trainingset);    
# 		#print(summaries)
# 			# test model
# 		predictions = getpredictions(summaries, testset) #find the predictions of test data with the training data
# 		accuracy = getaccuracy(testset, predictions)
# 		print('Accuracy of the classifier is : {0}%'.format(accuracy),"\n",dataset)
  
 
# main()