In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the games CSV that the notebook analyses.
GAMES_CSV_URL = 'https://raw.githubusercontent.com/MATF-istrazivanje-podataka-1/2023_Data_Mining_Chess_Dataset/main/data/games.csv'

# Load the remote CSV into the frame used by the following cells.
data = pd.read_csv(GAMES_CSV_URL)

# Nedostajuce vrednosti

In [None]:
data.isna().any().any() # check whether the frame contains any missing values

False

In [None]:
# Preview the first rows of the loaded frame.
data.head()

# Duplirane vrednosti

In [None]:
# Keep a single row per game id (drop_duplicates keeps the first occurrence by
# default). The result is rebound to `data` rather than mutating the frame in
# place, so the statement reads as a plain "new frame from old frame" step.
data = data.drop_duplicates(subset=['id'])

# Displayed as the cell output: (rows, columns) after de-duplication.
data.shape

(19113, 16)

Obrisali smo sve visetruka ponavljanja u koloni 'id' jer ne smeju postojati dve partije sa istim id-jem.

# Oslobadjanje nepotrebnih atributa

In [None]:
# Columns that are excluded from the working frame.
unused_columns = ['white_id', 'black_id', 'id', 'moves', 'created_at', 'last_move_at']

# `data` itself is left unchanged; the reduced frame gets its own name.
data_dropped = data.drop(columns=unused_columns)

In [None]:
# Also remove 'opening_name'. Rebinding with errors='ignore' keeps this cell
# re-runnable: an in-place drop raises KeyError once the column is already gone.
data_dropped = data_dropped.drop(columns=['opening_name'], errors='ignore')

In [None]:
# Preview the frame after the column drops above.
data_dropped.head()

Unnamed: 0,rated,turns,victory_status,winner,increment_code,white_rating,black_rating,opening_eco,opening_ply
0,False,13,outoftime,white,15+2,1500,1191,D10,5
1,True,16,resign,black,5+10,1322,1261,B00,4
2,True,61,mate,white,5+10,1496,1500,C20,3
3,True,61,mate,white,20+0,1439,1454,D02,3
4,True,95,mate,white,30+3,1523,1469,C41,5


In [None]:
# Print the distinct values of each listed column, one line per column, so
# unexpected entries can be spotted by eye.
for column_name in ('victory_status', 'rated', 'winner'):
  print(data_dropped[column_name].unique())

['outoftime' 'resign' 'mate' 'draw']
[False  True]
['white' 'black' 'draw']


U atributima pregledanim u celiji iznad nema nepredvidjenih vrednosti.

In [None]:
# Summarise column dtypes and non-null counts before the values are encoded.
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19113 entries, 0 to 20057
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   rated           19113 non-null  bool  
 1   turns           19113 non-null  int64 
 2   victory_status  19113 non-null  object
 3   winner          19113 non-null  object
 4   increment_code  19113 non-null  object
 5   white_rating    19113 non-null  int64 
 6   black_rating    19113 non-null  int64 
 7   opening_eco     19113 non-null  object
 8   opening_ply     19113 non-null  int64 
dtypes: bool(1), int64(4), object(4)
memory usage: 1.3+ MB


# Transformacija nepogodnih podataka

In [None]:
# Integer codes for each column that is encoded numerically.
value_codes = {
  'rated': {True: 1, False: 0},
  'winner': {'white': 1, 'black': 2, 'draw': 0},
  'victory_status': {'draw': 0, 'outoftime': 1, 'resign': 2, 'mate': 3},
}

# The result is assigned back to the column. Calling replace(inplace=True) on
# `data_dropped.<column>` is a chained in-place call on an intermediate Series:
# recent pandas warns about it and, under Copy-on-Write, the frame is not
# updated at all.
# astype('int64') pins the dtype explicitly instead of relying on replace() to
# downcast the replaced values, and fails loudly if an unmapped value remains.
for column_name, codes in value_codes.items():
  data_dropped[column_name] = data_dropped[column_name].replace(codes).astype('int64')

In [None]:
# Re-check dtypes after the encoding step.
# NOTE(review): the saved output of this cell reports 20058 rows, while the
# earlier info() output reports 19113 rows after de-duplication - presumably a
# stale, out-of-order run; confirm with Restart & Run All.
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   rated           20058 non-null  int64 
 1   turns           20058 non-null  int64 
 2   victory_status  20058 non-null  int64 
 3   winner          20058 non-null  int64 
 4   increment_code  20058 non-null  object
 5   white_rating    20058 non-null  int64 
 6   black_rating    20058 non-null  int64 
 7   opening_eco     20058 non-null  object
 8   opening_ply     20058 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 1.4+ MB


Sledi transformacija kolone 'increment_code' sa ciljem da se mnogobrojne string reprezentacije vremenskog modela pod kojim se odigrala partija organizuju u 4 kategorije.

In [None]:
def _parse_increment_code(code):
  """Split ``code`` on '+' and return its first two parts as a tuple of ints."""
  parts = code.split('+')
  return (int(parts[0]), int(parts[1]))

k = data_dropped.increment_code
# One (first, second) integer pair per row, in row order.
parovi_vremena = [_parse_increment_code(code) for code in k]

In [None]:
def add_pair(x):
  """Return the sum of the first two items of ``x``, each converted with int()."""
  first, second = int(x[0]), int(x[1])
  return first + second
def vreme(x):
  """Bucket a number into a category code from 0 to 3.

  Returns 0 when x <= 3, 1 when x <= 8, 2 when x <= 30 and 3 otherwise.
  """
  # Upper bounds are inclusive and checked in ascending order; the position of
  # the first bound that admits x is the category code.
  for code, upper_bound in enumerate((3, 8, 30)):
    if x <= upper_bound:
      return code
  return 3
# Sum each pair, then bucket the total into its category code (one per row).
x = [vreme(add_pair(pair)) for pair in parovi_vremena]

In [None]:
n = 'increment_code'

# Build `dd` from `data_dropped` with the original increment_code strings
# replaced by the category codes in `x`. Dropping the column first and adding
# it back places it last; `data_dropped` itself is not modified.
dd = data_dropped.drop(columns=n).assign(**{n: x})
dd.head()

Unnamed: 0,rated,turns,victory_status,winner,white_rating,black_rating,opening_eco,opening_ply,increment_code
0,0,13,1,1,1500,1191,D10,5,2
1,1,16,2,2,1322,1261,B00,4,2
2,1,61,3,1,1496,1500,C20,3,2
3,1,61,3,1,1439,1454,D02,3,2
4,1,95,3,1,1523,1469,C41,5,3



*   increment_code: 0 - Bullet, 1 - Blitz, 2 - Standard i 3 - Classical
*   winner: 0 - Draw, 1 - White, 2 - Black
*   rated: 1 - yes, 0 - no
*   victory_status: 0 - Draw, 1 - Out of time, 2 - resign, 3 - mate



In [None]:
# Print the smallest and largest value of each numeric column, with a blank
# line between consecutive sections.
value_ranges = (
  ("White", dd.white_rating),
  ("Black", dd.black_rating),
  ("turns", dd.turns),
)
for position, (label, values) in enumerate(value_ranges):
  if position:
    print()
  print(f"{label} min-max:")
  print(min(values), max(values))

White min-max:
784 2700

Black min-max:
789 2723

turns min-max:
1 349


Vidimo da u ovim atributima nema ekstremnih vrednosti