# Exploratory Data Analysis - Tic Tac Toe
**Authors**: Gabriela Dellamora Paim, Bruno Duarte Carlan

**Version**: 14/04/2025

**Python Ver**: 3.12.9

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Force the Tk GUI backend. The original author added this as a local workaround
# (the Agg backend had suddenly stopped working on their machine) and asks that
# this line be removed before running the notebook elsewhere.
matplotlib.use('TkAgg')

# CSV locations: PATH is the dataset loaded by this notebook, NEW_PATH is where
# the balanced copy is written.
PATH, OLD_PATH, NEW_PATH = './data.csv', 'data.csv', 'data_balanced.csv'

# Numeric codes for the content of one board cell (presumably X mark, O mark,
# empty square — the board columns only contain these three values).
X, O, BLANK = 1, -1, 0

# Integer labels of the `category` target column.
O_WIN, DRAW, ONGOING, X_WIN = 0, 1, 2, 3

In [76]:
# Load the raw dataset and show summary statistics for its numeric columns
# (the nine board positions plus the `category` target).
df = pd.read_csv(filepath_or_buffer=PATH)
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,category
count,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0
mean,0.009634,0.023892,0.013873,0.01657,0.017341,0.015029,-0.006551,0.007322,0.021965,1.625048
std,0.850588,0.833595,0.853922,0.836772,0.866631,0.830791,0.851976,0.837365,0.852623,1.298266
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [77]:
# Build the ydata-profiling report for the raw frame and export it as a
# standalone HTML file next to the notebook.
report_file = 'report.html'
pf = ProfileReport(df)
pf.to_file(report_file)

100%|██████████| 10/10 [00:00<00:00, 200.00it/s]00:01,  9.26it/s, Describe variable: category]
Summarize dataset: 100%|██████████| 19/19 [00:01<00:00, 18.14it/s, Completed]                 
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.55it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 250.02it/s]


In [78]:
# Show a random sample of rows as a quick sanity check on the raw data.
# The seed is fixed so that Restart & Run All displays the same rows every time,
# and the sample size is capped at the row count because sampling without
# replacement raises when asked for more rows than the frame holds.
print("\nSample do dataset:")
df.sample(n=min(30, len(df)), random_state=42)


Sample do dataset:


Unnamed: 0,0,1,2,3,4,5,6,7,8,category
427,-1,0,1,1,-1,1,-1,0,1,3
1655,-1,1,0,-1,1,-1,0,1,0,3
1868,0,-1,0,1,1,1,0,-1,-1,3
2358,1,0,0,-1,-1,1,-1,0,1,2
1802,1,1,1,0,0,-1,0,-1,-1,3
81,1,1,-1,1,-1,0,1,-1,0,3
1654,-1,1,0,-1,1,-1,1,1,-1,3
2172,-1,0,-1,0,0,0,1,1,0,2
1921,0,0,0,0,0,1,0,0,0,2
2396,1,0,-1,1,0,1,-1,-1,1,2


In [79]:
# Report the (rows, columns) shape of the raw frame on the line after the header.
print("\nDimensões do dataset:", df.shape, sep="\n")


Dimensões do dataset:
(2595, 10)


In [80]:
# Frequency table of every column, board positions and target alike.
print("\nValores únicos por coluna:")
for column_name, column_values in df.items():
    print(f"\nColuna '{column_name}':", column_values.value_counts(), sep="\n")


Valores únicos por coluna:

Coluna '0':
0
 1    951
-1    926
 0    718
Name: count, dtype: int64

Coluna '1':
1
 1    933
-1    871
 0    791
Name: count, dtype: int64

Coluna '2':
2
 1    964
-1    928
 0    703
Name: count, dtype: int64

Coluna '3':
3
 1    930
-1    887
 0    778
Name: count, dtype: int64

Coluna '4':
4
 1    997
-1    952
 0    646
Name: count, dtype: int64

Coluna '5':
5
 1    915
-1    876
 0    804
Name: count, dtype: int64

Coluna '6':
6
-1    950
 1    933
 0    712
Name: count, dtype: int64

Coluna '7':
7
 1    919
-1    900
 0    776
Name: count, dtype: int64

Coluna '8':
8
 1    972
-1    915
 0    708
Name: count, dtype: int64

Coluna 'category':
category
3    941
0    941
2    681
1     32
Name: count, dtype: int64


In [81]:
# Relative frequency of each class in the target column.
target_share = df['category'].value_counts(normalize=True)
print("\nDistribuição da variável alvo (category):")
print(target_share)


Distribuição da variável alvo (category):
category
3    0.362620
0    0.362620
2    0.262428
1    0.012331
Name: proportion, dtype: float64


In [82]:
# Bar chart of the class counts in the target column, drawn on an explicit Axes.
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='category', hue='category', palette='viridis', ax=ax_target)
ax_target.set_title('Distribuição da variável alvo')
# Legend entries are given in ascending label order (0, 1, 2, 3).
ax_target.legend(['O Venceu', 'Empate', 'Em Progresso', 'X Venceu'])
ax_target.set_xlabel('Categoria')
ax_target.set_ylabel('Contagem')
plt.show()

In [83]:
# One count plot per board-position column (every column except the last one,
# which is the `category` target).
cols = df.columns[:-1]
num_cols = len(cols)

# Grid shape: fixed number of columns, rows rounded up so every position gets a
# panel. At least one row is requested because plt.subplots rejects zero rows.
n_cols = 3
n_rows = max(1, (num_cols + n_cols - 1) // n_cols)

# squeeze=False makes plt.subplots always return a 2-D ndarray of Axes, so
# .flatten() works for any grid shape. The previous `axes = [axes]` wrapper
# produced a plain list when n_rows == 1, and a list has no .flatten().
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
)
fig.suptitle('Distribuição dos valores em cada posição do tabuleiro', fontsize=16)
axes = axes.flatten()

for ax, col in zip(axes, cols):
    sns.countplot(x=col, hue=col, data=df, ax=ax, palette='Set2', legend=False)
    ax.set_title(f'Posição {col}')
    ax.set_xlabel('Valor')
    ax.set_ylabel('Contagem')

# Drop the panels that received no column. Slicing by num_cols avoids relying on
# a loop index that would be undefined if there were no columns to plot.
for ax in axes[num_cols:]:
    fig.delaxes(ax)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [84]:
# Contingency table: value at board position 0 (rows) versus target class (columns).
print("\nCruzamento entre a posição 0 e a variável category:")
position_0 = df['0']
cross_tab = pd.crosstab(index=position_0, columns=df['category'])
print(cross_tab)


Cruzamento entre a posição 0 e a variável category:
category    0   1    2    3
0                          
-1        433  16  174  303
 0        205   0  308  205
 1        303  16  199  433


In [85]:
# Heatmap para visualizar a correlação entre posições e categoria
plt.figure(figsize=(8, 6))
sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Heatmap - Posição 0 vs Category')
plt.ylabel('Valor na posição 0')
plt.xlabel('Category')
plt.show()

# Balanceamento dos dados

In [86]:

# Balance the target classes to the same row count and save the result.
#
# Step 1: split the frame into features (board positions) and target.
# NOTE(review): `X` rebinds the module-level constant `X = 1` from the
# configuration cell. The name is kept here so any later cell using `X`/`y`
# as features/target keeps working; consider renaming one of the two.
X = df.drop(columns=["category"])
y = df["category"]

# Number of rows every class should end up with.
TARGET_PER_CLASS = 400

# Step 2: under-sample so that no class keeps more than TARGET_PER_CLASS rows.
# Classes already at or below the target keep their current count.
under_strategy = {
    label: min(count, TARGET_PER_CLASS)
    for label, count in y.value_counts().to_dict().items()
}
rus = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
X_u, y_u = rus.fit_resample(X, y)

# Step 3: over-sample so that no class has fewer than TARGET_PER_CLASS rows.
# NOTE(review): random over-sampling repeats existing rows, so the smallest
# class will consist mostly of duplicates; if a train/test split is later drawn
# from the saved file, the same row can land on both sides — confirm this is
# acceptable for the downstream use.
over_strategy = {label: TARGET_PER_CLASS for label in y_u.unique()}
ros = RandomOverSampler(sampling_strategy=over_strategy, random_state=42)
X_res, y_res = ros.fit_resample(X_u, y_u)

# Step 4: reassemble features and target into a single frame.
df_balanced = pd.DataFrame(X_res, columns=X.columns)
df_balanced["category"] = y_res

# Report the class counts once. The header describes the whole balancing step,
# since the frame was both under- and over-sampled.
print("\nDataset após balanceamento:")
print(df_balanced["category"].value_counts())

# Persist the balanced frame. The message is built from NEW_PATH so it always
# names the file that was actually written.
df_balanced.to_csv(NEW_PATH, index=False)
print(f"Dataset balanceado salvo como '{NEW_PATH}'")

category
0    400
1    400
2    400
3    400
Name: count, dtype: int64

Dataset após undersampling:
category
0    400
1    400
2    400
3    400
Name: count, dtype: int64
Dataset com undersampling salvo como 'data_undersampled.csv'
