# AI Group Project ~ Deep Learning

## Prediction of Pumpkin Seed Type



Dataset ini berisi 2.500 data dengan 13 kolom, yaitu 12 fitur dan 1 kolom kelas ('Class'). Terdapat dua jenis biji labu, yaitu 'Ürgüp Sivrisi' dan 'Çerçevelik', yang umumnya ditanam di daerah Urgup dan Karacaoren di Turki. Project ini bertujuan mengklasifikasikan jenis biji labu berdasarkan data yang tersedia. Dataset diambil dari https://www.kaggle.com/datasets/muratkokludataset/pumpkin-seeds-dataset

### Import Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import random
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import csv

### Dataset Visualization

In [None]:
# Load the pumpkin seed dataset from the Excel workbook at this path
pumpkin_data = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')
# Display the last 10 rows as a quick sanity check of the loaded table
pumpkin_data.tail(10)

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
2490,51555,934.911,401.8321,164.7038,52013,256.2067,0.9121,0.9912,0.7187,0.7412,2.4397,0.6376,Ürgüp Sivrisi
2491,69836,1010.605,396.6286,224.7918,70419,298.1911,0.8239,0.9917,0.6693,0.8593,1.7644,0.7518,Ürgüp Sivrisi
2492,84236,1274.656,456.9323,237.154,85248,327.4944,0.8548,0.9881,0.6104,0.6515,1.9267,0.7167,Ürgüp Sivrisi
2493,58987,977.41,404.0779,186.371,59518,274.0522,0.8873,0.9911,0.7327,0.7759,2.1681,0.6782,Ürgüp Sivrisi
2494,79755,1146.431,470.3888,217.8296,80649,318.6647,0.8863,0.9889,0.7175,0.7626,2.1594,0.6774,Ürgüp Sivrisi
2495,79637,1224.71,533.1513,190.4367,80381,318.4289,0.934,0.9907,0.4888,0.6672,2.7996,0.5973,Ürgüp Sivrisi
2496,69647,1084.318,462.9416,191.821,70216,297.7874,0.9101,0.9919,0.6002,0.7444,2.4134,0.6433,Ürgüp Sivrisi
2497,87994,1210.314,507.22,222.1872,88702,334.7199,0.899,0.992,0.7643,0.7549,2.2828,0.6599,Ürgüp Sivrisi
2498,80011,1182.947,501.9065,204.7531,80902,319.1758,0.913,0.989,0.7374,0.7185,2.4513,0.6359,Ürgüp Sivrisi
2499,84934,1159.933,462.8951,234.5597,85781,328.8485,0.8621,0.9901,0.736,0.7933,1.9735,0.7104,Ürgüp Sivrisi


### About Dataset

In [None]:
# Count how many rows carry each distinct value of the 'Class' column
pumpkin_data['Class'].value_counts()

Çerçevelik       1300
Ürgüp Sivrisi    1200
Name: Class, dtype: int64

In [None]:
# Count the missing (null) entries in every column
pumpkin_data.isnull().sum()

Area                 0
Perimeter            0
Major_Axis_Length    0
Minor_Axis_Length    0
Convex_Area          0
Equiv_Diameter       0
Eccentricity         0
Solidity             0
Extent               0
Roundness            0
Aspect_Ration        0
Compactness          0
Class                0
dtype: int64

In [None]:
# Count the entries equal to zero in every column.
# isna() is an alias of isnull(), so calling it here would only repeat the
# null check instead of looking for zeros.
(pumpkin_data == 0).sum()

Area                 0
Perimeter            0
Major_Axis_Length    0
Minor_Axis_Length    0
Convex_Area          0
Equiv_Diameter       0
Eccentricity         0
Solidity             0
Extent               0
Roundness            0
Aspect_Ration        0
Compactness          0
Class                0
dtype: int64

### Random Forest

In [None]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy of ``dataset``,
    so the caller's sequence is left untouched. Every fold holds
    ``int(len(dataset) / n_folds)`` rows; when the length is not an exact
    multiple of ``n_folds`` the leftover rows end up in no fold.

    Args:
        dataset: Sequence of rows to distribute.
        n_folds: Number of folds to create.

    Returns:
        A list containing ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Popping a random index removes the row from the pool, which is
        # what guarantees a row can never land in two folds.
        fold = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ]
        folds.append(fold)
    return folds

In [None]:
# Compute the accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in step with
            ``actual``; it must be at least as long as ``actual``.

    Returns:
        A float between 0.0 and 100.0. An empty ``actual`` yields 0.0
        instead of raising ``ZeroDivisionError``, so an empty fold does not
        abort a whole evaluation run.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0

In [None]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` on ``dataset`` with k-fold cross validation.

    Each fold takes one turn as the test set while the rows of all other
    folds are concatenated into the training set. The test rows handed to
    ``algorithm`` are copies whose last element (the label) is replaced by
    ``None``.

    Args:
        dataset: Sequence of rows whose last element is the class label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds passed to ``cross_validation_split``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        # Training data: every fold except the one currently held out.
        other_folds = list(folds)
        other_folds.remove(fold)
        train_set = [row for part in other_folds for row in part]

        # Test data: label-free copies, so the originals keep their labels.
        test_set = []
        for row in fold:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
# Membagi dataset berdasarkan atribute dan nilai atribute
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

In [None]:
# Compute the Gini index of a candidate split
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of ``groups``.

    Args:
        groups: Sequence of row groups (for example the left and right
            side of a split); the last element of each row is its label.
        classes: Class labels to score.

    Returns:
        The sum over all groups of ``(1 - sum(p_class ** 2))`` weighted by
        the group's share of all rows. 0.0 means every group is pure.
        Empty groups contribute nothing.
    """
    # Total number of rows across all groups at this split point.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # Skip empty groups to avoid dividing by zero.
        if size == 0:
            continue
        # Extract the labels once per group rather than once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its relative size.
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [None]:
# Select the best split point for a dataset
def get_split(dataset, n_features):
    """Find the lowest-Gini split among a random subset of features.

    ``n_features`` distinct feature columns are drawn at random. For each
    of them, every row's value in that column is tried as a threshold, and
    the first split with the lowest Gini index is kept.

    Args:
        dataset: Non-empty list of rows whose last element is the label.
        n_features: Number of distinct feature columns to consider. A
            request larger than the number of available feature columns is
            clamped to that number.

    Returns:
        A dict with keys ``'index'`` (chosen column), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` rows produced
        by ``test_split``). When no feature is examined, ``'groups'`` is
        ``None``.

    Raises:
        IndexError: If ``dataset`` is empty.
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None

    n_columns = len(dataset[0]) - 1
    # The loop below only accepts distinct columns, so asking for more
    # columns than exist would spin forever; clamp the request instead.
    if n_features > n_columns > 0:
        n_features = n_columns

    features = list()
    while len(features) < n_features:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)

    for index in features:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            # Strict '<' keeps the first of several equally good splits.
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

In [None]:
# Create a terminal node value
def to_terminal(group):
    """Return the most frequent class label among the rows of ``group``.

    Args:
        group: Non-empty list of rows whose last element is the label.

    Returns:
        The label that occurs most often.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct_labels = set(labels)
    return max(distinct_labels, key=lambda label: labels.count(label))

In [None]:
# Create child splits for a node, or make it terminal
def split(node, max_depth, min_size, n_features, depth):
    """Recursively grow the tree below ``node``, modifying it in place.

    The node's ``'groups'`` entry is consumed and replaced by ``'left'``
    and ``'right'`` entries. Each of them is either a class label
    (terminal) or another node dict produced by ``get_split``.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both children become terminals.
        min_size: A side with at most this many rows becomes a terminal.
        n_features: Number of features evaluated at each new split.
        depth: Depth of ``node``.
    """
    left, right = node['groups']
    del node['groups']

    # One side is empty, so no real split happened: a single shared terminal.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Maximum depth reached: stop growing on both sides.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Handle the left child completely before the right one.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        child = get_split(rows, n_features)
        node[side] = child
        split(child, max_depth, min_size, n_features, depth + 1)

In [None]:
# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    """Build a decision tree from the rows in ``train``.

    Args:
        train: Training rows whose last element is the label.
        max_depth: Maximum tree depth passed on to ``split``.
        min_size: Minimum group size passed on to ``split``.
        n_features: Number of features evaluated at each split.

    Returns:
        The root node dict of the grown tree.
    """
    root_depth = 1
    tree = get_split(train, n_features)
    split(tree, max_depth, min_size, n_features, root_depth)
    return tree

In [None]:
# Make a prediction with a decision tree
def predict(node, row):
    """Walk the tree from ``node`` and return the label predicted for ``row``.

    At each node the row goes left when ``row[node['index']]`` is less than
    ``node['value']`` and right otherwise. The walk stops at the first
    child that is not a dict, and that child is the prediction.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` entries.
        row: Row to classify.

    Returns:
        The terminal value reached.
    """
    current = node
    while True:
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child

In [None]:
# Create a random subsample of the dataset, with replacement
def subsample(dataset, ratio):
    """Draw a bootstrap sample from ``dataset``.

    Args:
        dataset: Sequence of rows to sample from.
        ratio: Sample size as a fraction of ``len(dataset)``; the resulting
            count is rounded with ``round``.

    Returns:
        A list of ``round(len(dataset) * ratio)`` rows. Rows are drawn
        with replacement, so the same row can appear more than once.
    """
    n_sample = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(n_sample)]

In [None]:
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """Return the label that most trees predict for ``row``.

    Args:
        trees: Non-empty list of tree root nodes.
        row: Row to classify.

    Returns:
        The most frequent prediction among all trees.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)

In [None]:
# Random forest algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a random forest on ``train`` and predict every row of ``test``.

    Args:
        train: Training rows whose last element is the label.
        test: Rows to classify.
        max_depth: Maximum depth of each tree.
        min_size: Minimum group size used while growing each tree.
        sample_size: Ratio handed to ``subsample`` for each tree's sample.
        n_trees: Number of trees to build.
        n_features: Number of features evaluated at each split.

    Returns:
        A list with one majority-vote prediction per row of ``test``.
    """
    # Every tree is grown on its own freshly drawn sample of the training rows.
    forest = [
        build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in test]

In [None]:
# Test the random forest algorithm
seed(2)

# Load and prepare the data (upload the dataset file first)
dataset = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')

# Convert the class column to integers
class_map = {'Çerçevelik': 0, 'Ürgüp Sivrisi': 1}
dataset['Class'] = dataset['Class'].replace(class_map)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Set the parameters
n_folds, max_depth, min_size, sample_size = 5, 10, 1, 1.0
# Use the integer part of sqrt(number of feature columns) per split
n_features = int(sqrt(len(dataset.columns) - 1))

# Run the algorithm with a varying number of trees
for n_trees in (1, 3, 5, 10):
    scores = evaluate_algorithm(
        dataset.values.tolist(),
        random_forest,
        n_folds,
        max_depth,
        min_size,
        sample_size,
        n_trees,
        n_features,
    )
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Trees: 1
Scores: [83.6, 85.39999999999999, 85.39999999999999, 86.2, 83.39999999999999]
Mean Accuracy: 84.800%
Trees: 3
Scores: [86.8, 89.0, 87.8, 85.0, 84.8]
Mean Accuracy: 86.680%
Trees: 5
Scores: [87.0, 86.6, 88.8, 86.2, 88.4]
Mean Accuracy: 87.400%
Trees: 10
Scores: [90.2, 87.0, 91.2, 85.6, 86.6]
Mean Accuracy: 88.120%


In [None]:
# Test the random forest algorithm
seed(2)

# Load and prepare the data (upload the dataset file first)
dataset = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')

# Convert the class column to integers
class_map = {'Çerçevelik': 0, 'Ürgüp Sivrisi': 1}
dataset['Class'] = dataset['Class'].replace(class_map)
# Feature matrix and label vector (not used by the evaluation loop below)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Set the parameters
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
n_features = int(sqrt(len(dataset.columns)-1))
n_trees = 10

# Repeat the evaluation 5 times with 10 trees.
# The loop variable is called `iteration` so the built-in iter() is not
# overwritten for the rest of the notebook.
for iteration in range(5):
    scores = evaluate_algorithm(dataset.values.tolist(), random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
    print('Iteration: %d' % (iteration + 1))
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Iteration: 1
Trees: 10
Scores: [90.60000000000001, 90.2, 87.4, 86.2, 88.2]
Mean Accuracy: 88.520%
Iteration: 2
Trees: 10
Scores: [86.8, 87.6, 91.60000000000001, 89.2, 86.8]
Mean Accuracy: 88.400%
Iteration: 3
Trees: 10
Scores: [85.6, 89.0, 90.8, 86.6, 87.8]
Mean Accuracy: 87.960%
Iteration: 4
Trees: 10
Scores: [89.4, 88.8, 88.0, 86.6, 88.0]
Mean Accuracy: 88.160%
Iteration: 5
Trees: 10
Scores: [90.0, 87.4, 86.6, 88.0, 89.2]
Mean Accuracy: 88.240%


In [None]:
# Test the random forest algorithm
seed(2)

# Load and prepare the data (upload the dataset file first)
dataset = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')

# Convert the class column to integers
class_map = {'Çerçevelik': 0, 'Ürgüp Sivrisi': 1}
dataset['Class'] = dataset['Class'].replace(class_map)
# Feature matrix and label vector (not used by the evaluation loop below)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Set the parameters
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
n_features = int(sqrt(len(dataset.columns)-1))
n_trees = 3

# Repeat the evaluation 5 times with 3 trees.
# The loop variable is called `iteration` so the built-in iter() is not
# overwritten for the rest of the notebook.
for iteration in range(5):
    scores = evaluate_algorithm(dataset.values.tolist(), random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
    print('Iteration: %d' % (iteration + 1))
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Iteration: 1
Trees: 3
Scores: [87.8, 86.4, 88.8, 86.4, 86.8]
Mean Accuracy: 87.240%
Iteration: 2
Trees: 3
Scores: [87.0, 87.2, 86.2, 87.6, 87.8]
Mean Accuracy: 87.160%
Iteration: 3
Trees: 3
Scores: [87.2, 87.2, 86.2, 84.0, 89.0]
Mean Accuracy: 86.720%
Iteration: 4
Trees: 3
Scores: [85.0, 89.2, 87.2, 85.6, 85.2]
Mean Accuracy: 86.440%
Iteration: 5
Trees: 3
Scores: [87.4, 90.0, 85.0, 84.8, 86.8]
Mean Accuracy: 86.800%
