Done with pre procesing and output for all


In [32]:
# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from sklearn import preprocessing
import numpy as np

# Load a CSV file
def load_csv(filename):
	csv_reader =  pd.read_csv(filename,sep=',',header= None)
	# Label encoding if need in the data sets
	d= list(csv_reader.select_dtypes(include=['category','object']))
	label_encoder = preprocessing.LabelEncoder()
	for i in d:
		csv_reader[i] = label_encoder.fit_transform(csv_reader[i])
	csv_reader = csv_reader.astype(str)
	dataset = csv_reader.values.tolist()
	return dataset
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
	del(summaries[-1])
	return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
	separated = separate_by_class(dataset)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	try:
		exponent = exp(-((x-mean)**2 / (2 * stdev**2 ))) 
	except ZeroDivisionError:
		exponent = 0
	return ((1 / (sqrt(2 * pi) * stdev))) * exponent if exponent != 0 else 0

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, _ = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

# Test Naive Bayes on Iris Dataset
seed(1)
filename = 'C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/breast-cancer.data'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [77.19298245614034, 63.1578947368421, 80.7017543859649, 85.96491228070175, 78.94736842105263]
Mean Accuracy: 77.193%


Code with 10 folds

In [62]:
# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from sklearn import preprocessing
import numpy as np

# Load a CSV file
def load_csv(filename):
	csv_reader =  pd.read_csv(filename,sep=',',header= None)
	# Label encoding if need in the data sets
	d= list(csv_reader.select_dtypes(include=['category','object']))
	label_encoder = preprocessing.LabelEncoder()
	for i in d:
		csv_reader[i] = label_encoder.fit_transform(csv_reader[i])
	csv_reader = csv_reader.astype(str)
	dataset = csv_reader.values.tolist()
	return dataset
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
	del(summaries[-1])
	return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
	separated = separate_by_class(dataset)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	try:
		exponent = exp(-((x-mean)**2 / (2 * stdev**2 ))) 
	except ZeroDivisionError:
		exponent = 0
	return ((1 / (sqrt(2 * pi) * stdev))) * exponent if exponent != 0 else 0

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, _ = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

# Test Naive Bayes on Iris Dataset
seed(1)
filename = 'C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/breast-cancer.data'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [75.0, 78.57142857142857, 60.71428571428571, 67.85714285714286, 75.0, 92.85714285714286, 89.28571428571429, 78.57142857142857, 82.14285714285714, 75.0]
Mean Accuracy: 77.500%


Enhancement

Using Log

In [59]:
# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from sklearn import preprocessing
import numpy as np

# Load a CSV file
def load_csv(filename):
	csv_reader =  pd.read_csv(filename,sep=',',header= None)
	# Label encoding if need in the data sets
	d= list(csv_reader.select_dtypes(include=['category','object']))
	label_encoder = preprocessing.LabelEncoder()
	for i in d:
		csv_reader[i] = label_encoder.fit_transform(csv_reader[i])
	csv_reader = csv_reader.astype(str)
	dataset = csv_reader.values.tolist()
	return dataset
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
	del(summaries[-1])
	return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
	separated = separate_by_class(dataset)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	try:
		exponent = exp(-((x-mean)**2 / (2 * stdev**2 ))) 
	except ZeroDivisionError:
		exponent = 0
	return np.log(((1 / (sqrt(2 * pi) * stdev))) * exponent) if exponent != 0 else 0

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, _ = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

# Test Naive Bayes on Iris Dataset
seed(1)
filename = 'C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/car.data'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [40.69767441860465, 37.2093023255814, 42.44186046511628, 45.93023255813954, 47.674418604651166, 45.348837209302324, 43.604651162790695, 44.76744186046512, 42.44186046511628, 43.02325581395349]
Mean Accuracy: 43.314%


Using Bernoulli

Hayes better
Car better 92.22
brest No better 76.84


In [41]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
dataset = pd.read_csv('C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/breast-cancer.data', sep=',')

# dataset
d= list(dataset.select_dtypes(include=['category','object']))
label_encoder = preprocessing.LabelEncoder()
for i in d:
	dataset[i] = label_encoder.fit_transform(dataset[i])
xDataSet= dataset
# yDataSet = dataset['unacc']
yDataSet = dataset['no.1']
# yDataSet = dataset['1']

xtrain,xtest,ytrain,ytest = train_test_split(xDataSet, yDataSet, test_size=0.33, random_state=42)

model = BernoulliNB(binarize= True)
model.fit(xtrain,ytrain)
print(model)
prediction = model.predict(xtest)
print(accuracy_score(ytest,prediction))

BernoulliNB(binarize=True)
0.7684210526315789


Using Mutlinomial
brest cancer 89.94
car 91.22
Hayes Roth 43.18


In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
dataset = pd.read_csv('C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/breast-cancer.data', sep=',')


# dataset
d= list(dataset.select_dtypes(include=['category','object']))
label_encoder = preprocessing.LabelEncoder()
for i in d:
	dataset[i] = label_encoder.fit_transform(dataset[i])
xDataSet= dataset
# yDataSet = dataset['unacc']
yDataSet = dataset['no.1']
# yDataSet = dataset['1']


xtrain,xtest,ytrain,ytest = train_test_split(xDataSet, yDataSet, test_size=0.33, random_state=42)

model = MultinomialNB()
model.fit(xtrain,ytrain)
print(model)
prediction = model.predict(xtest)
print(accuracy_score(ytest,prediction))

MultinomialNB()
0.8947368421052632


In [7]:
import pandas as pd
df=pd.read_csv('C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/hayes-roth.data')
df.to_csv('C:/Users/16824/OneDrive/Desktop/Project/ML/Assignment1/hayes-roth.csv',index= False)


In [63]:
# Example of the Student's t-test
from scipy.stats import ttest_ind
#considering weka mean of valid instance weka instance mean hayes= 80.916,
#Hayes-roth
# code = [92.3076923076923, 69.23076923076923, 84.61538461538461, 76.92307692307693, 69.23076923076923, 92.3076923076923, 53.84615384615385, 76.92307692307693, 84.61538461538461, 69.23076923076923]
# weka = [80.916,80.916,80.916,80.916,80.916,80.916,80.916,80.916,80.916,80.916]
#car
# code = 	 [76.16279069767442, 72.09302325581395, 73.25581395348837, 72.67441860465115, 75.0, 75.0, 71.51162790697676, 79.65116279069767, 73.83720930232558, 74.4186046511628]
# weka = [70.005,70.005,70.005,70.005,70.005,70.005,70.005,70.005,70.005]
#Breast Cancer
code = [75.0, 78.57142857142857, 60.71428571428571, 67.85714285714286, 75.0, 92.85714285714286, 89.28571428571429, 78.57142857142857, 82.14285714285714, 75.0]
weka = [74.7368,74.7368,74.7368,74.7368,74.7368,74.7368,74.7368,74.7368,74.7368,74.7368]
stat, p = ttest_ind(code, weka)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

stat=0.931, p=0.364
Probably the same distribution
