# Dataset escolhido: https://archive.ics.uci.edu/dataset/158/poker+hand

---


*   O dataset traz várias mãos de poker e sua classificação (CLASS). Prever a classificação de novas mãos pode ser facilmente programado usando lógica determinística, com 100% de taxa de acerto. Escolhi esse dataset para testar a taxa de acerto de um algoritmo probabilístico, como o KNN, em uma tarefa como essa.

---

### Explicação do CLASS:

* 0: Nothing in hand; nenhuma mão de poker reconhecida
* 1: One pair; um par de cartas de mesmo valor entre cinco cartas
* 2: Two pairs; dois pares de cartas de mesmo valor entre cinco cartas
* 3: Three of a kind; três cartas de mesmo valor
* 4: Straight; cinco cartas em sequência, sem intervalos
* 5: Flush; cinco cartas do mesmo naipe
* 6: Full house; um par + uma trinca de valor diferente
* 7: Four of a kind; quatro cartas de mesmo valor
* 8: Straight flush; sequência + mesmo naipe
* 9: Royal flush; {Ás, Rei, Dama, Valete, Dez} + mesmo naipe



In [1]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# This part was copied from the dataset page: https://archive.ics.uci.edu/dataset/158/poker+hand

# Fetch the dataset (UCI repository id 158).
poker_hand = fetch_ucirepo(id=158)

# Features and targets (pandas DataFrames, according to the snippet on the dataset page).
X, y = poker_hand.data.features, poker_hand.data.targets

# Print the dataset metadata, followed by the per-variable information.
print(poker_hand.metadata, poker_hand.variables, sep="\n")


{'uci_id': 158, 'name': 'Poker Hand', 'repository_url': 'https://archive.ics.uci.edu/dataset/158/poker+hand', 'data_url': 'https://archive.ics.uci.edu/static/public/158/data.csv', 'abstract': 'Purpose is to predict poker hands', 'area': 'Games', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1025010, 'num_features': 10, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['CLASS'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2002, 'last_updated': 'Sat Mar 09 2024', 'dataset_doi': '10.24432/C5KW38', 'creators': ['Robert Cattral', 'Franz Oppacher'], 'intro_paper': None, 'additional_info': {'summary': 'Each record is an example of a hand consisting of five playing cards drawn from a standard deck of 52. Each card is described using two attributes (suit and rank), for a total of 10 predictive attributes. There is one Class attribute that describes the "Poker Hand". T

In [3]:
# Build a single DataFrame holding the features and the target side by side.

df = pd.concat([X, y], axis="columns")

# head() shows the first 5 rows by default.
display(df.head())

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,CLASS
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9


In [4]:
# Render floats with two decimals; without this the summary was shown in scientific notation.
pd.options.display.float_format = "{:.2f}".format

# Summary statistics for every column.
display(df.describe())

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,CLASS
count,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0,1025010.0
mean,2.5,7.0,2.5,7.01,2.5,7.0,2.5,7.0,2.5,6.99,0.62
std,1.12,3.74,1.12,3.74,1.12,3.74,1.12,3.74,1.12,3.74,0.77
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,4.0,1.0,4.0,1.0,4.0,2.0,4.0,1.0,4.0,0.0
50%,3.0,7.0,2.0,7.0,3.0,7.0,3.0,7.0,2.0,7.0,0.0
75%,3.0,10.0,4.0,10.0,4.0,10.0,3.0,10.0,4.0,10.0,1.0
max,4.0,13.0,4.0,13.0,4.0,13.0,4.0,13.0,4.0,13.0,9.0


In [5]:
# Separate the frame into the feature matrix (every column except CLASS)
# and the target column (CLASS) used by the models below.
x = df.drop(columns="CLASS")
y = df.loc[:, "CLASS"]

# Sanity check: both must have the same number of rows.
print(x.shape, y.shape)

(1025010, 10) (1025010,)


# KNN com Holdout simples (70% treino - 30% teste)

In [6]:
# Takes a long time to run, presumably because the dataset is large.
# So it does not have to be re-run, the recorded result was: Accuracy Holdout: 60.287216710080884%
# using these parameters: (x, y, test_size=0.3, random_state=42, stratify=y)

# Stratified 70/30 split: train and test keep the class proportions of y.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

# n_jobs=-1 runs the neighbor search on all CPU cores; it only affects speed,
# not the predictions.
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn.fit(x_train, y_train)

# score() returns the mean accuracy on the held-out test set.
acc_holdout = knn.score(x_test, y_test)
print(f'Accuracy Holdout: {acc_holdout*100}%')

Accuracy Holdout: 60.287216710080884%


# KNN com Holdout Repetido (70% treino - 30% teste)

In [None]:
# Repeated holdout: 10 independent 70/30 splits, one accuracy per split.
# n_jobs=-1 runs the neighbor search on all CPU cores (speed only, same predictions).
classifier_repetido = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

# Stratified splits, to match the simple holdout (which uses stratify=y):
# every train/test pair keeps the class proportions of y, so the two
# evaluations are comparable.
cv_shuffle = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

# cross_val_score fits a fresh clone of the classifier on each split.
acc_repetido = cross_val_score(classifier_repetido, x, y, cv=cv_shuffle)

print(acc_repetido)
print('------------------------------------------------------------')
print(f'Media Accuracy Holdout Repetido: {acc_repetido.mean()*100:.3f}%')

# KNN com Validação Cruzada | K-FOLD

In [None]:
# An integer `cv` is the NUMBER OF FOLDS, not a random seed, so cv=42 would
# fit and score the KNN 42 times on the full dataset. Use an explicit
# 10-fold stratified splitter instead, shuffled with seed 42 for
# reproducibility.
cv_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Reuses the 3-NN estimator defined for the repeated holdout; cross_val_score
# fits a fresh clone of it on each fold.
acc_kfold = cross_val_score(classifier_repetido, x, y, cv=cv_kfold)

print(acc_kfold)
print('------------------------------------------------------------')
print(f'Media Accuracy K-Fold: {acc_kfold.mean()*100:.3f}%')