In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
import re

In [3]:
df = pd.read_csv("cleaner_data.csv")
print(df.size)
df.head()

88767


Unnamed: 0.1,Unnamed: 0,name,multiverse_id,color_identity,text,flavor,power
0,1,Ancestor's Chosen,,['W'],First strike (This creature deals combat damag...,"""The will of all, by my hand done.""",4.0
1,2,Angel of Mercy,129465.0,['W'],Flying\r\nWhen Angel of Mercy enters the battl...,Every tear shed is a drop of immortality.,3.0
2,4,Angelic Blessing,129711.0,['W'],Target creature gets +3/+3 and gains flying un...,Only the warrior who can admit mortal weakness...,
3,6,Angelic Chorus,129710.0,['W'],Whenever a creature enters the battlefield und...,The harmony of the glorious is a dirge to the ...,
4,7,Angelic Wall,129671.0,['W'],Defender (This creature can't attack.)\r\nFlying,"""The Ancestor protects us in ways we can't beg...",0.0


In [4]:
import re

def clean_color(text):
     # as cores são branco(W), preto(B), azul(U), vermelho(R) ou verde(G)
    pattern = r"\['(W|B|U|R|G)'\]"
    
    match = re.search(pattern, text)
    
    if match:
        return match.group(1)
    return None

df["color"] = df["color_identity"].apply(clean_color)
df.drop("color_identity", axis=1, inplace = True)

In [5]:
df["color"].value_counts()

color
W    2632
R    2575
B    2563
G    2475
U    2436
Name: count, dtype: int64

In [6]:
df["counterspell"] = (df.text.str.contains(r"[Cc]ounter\s(?:it|target|all)") & # anulações
         (df.text.str.contains(r"[wW]ard(?:\s{|—])") == False)) # Ward é uma maneira de anulação/proteção mais transversal às cores e é ignorada


df["exile"] = (df.text.str.contains(r"[eE]xile\s(?:target|each|all|the|up\sto)") & # exilar algo do tablueiro
        (df.text.str.contains(r"the\stop") == False)) # Algumas cartas "compram" cartas exilando-as do topo do deck, isso nào conta como exilar algo do tabuleiro

df["fight"] = (df.text.str.contains(r"[Ff]ights")) # Criaturas lutarem é uma mecânica característica do verde


df["mill"] = (df.text.str.contains(r"[mM]ill")) # TODO: mill é uma palavra chave recente, antes era referida por descartar a carta do >TOPO DO BATALHO<, bom adicionar este texto também


df["scry"] = (df.text.str.contains(r"[sS]cry"))


df["tap"] = (df.text.str.contains(r"(?:\st|T)ap\s(?:it|target|each|all|or\suntap)")) # cartas que viram outras 


df["untap"] = (df.text.str.contains(r"[uU]ntap\s(?:it|target|each|all)")) # find untappers

"""
df.loc[df.text.str.contains("[dD]eathtouch") | # find creatures that have deathtouch
        df.text.str.contains("deals combat damage to a creature, destroy that creature", regex = False)] # or that have "derptouch"
"""

df["double_strike"] = (df.text.str.contains(r"[dD]ouble\sstrike"))


df["first_strike"] = (df.text.str.contains(r"[fF]irst\sstrike"))


df["flash"] = (df.text.str.contains(r"(?:f|\nF|^F)lash") & # some engineering to avoid incorrectly grabbing cards with Flash in the name
        (df.text.str.contains(r"[fF]lashback") == False)) # dont' want to capture flashback


df["flying"] = (df.text.str.contains(r"[fF]lying"))


df["haste"] = (df.text.str.contains(r"[hH]aste"))


df["hexproof"] = (df.text.str.contains(r"[hH]exproof"))


df["indestructible"] = (df.text.str.contains(r"[iI]ndestructible") &
                         (df.text.str.contains(r"loses\sindestructible") == False))


df["sacrifice"] = (df.text.str.contains(r"[sS]acrifice"))


df["lifelink"] = (df.text.str.contains(r"[lL]ifelink"))


df["menace"] = (df.text.str.contains(r"[mM]enace"))


df["protection"] = (df.text.str.contains(r"[pP]rotection\sfrom"))


df["prowess"] = (df.text.str.contains(r"[pP]rowess"))


df["reach"] = (df.text.str.contains(r"(?:\sr|\nR|^R)each") &
        (df.text.str.contains(r"can't be blocked except by creatures with flying or reach", regex = False) == False)) # don't want flying reminder text


df["trample"] = (df.text.str.contains(r"[tT]rample"))


df["vigilance"] = (df.text.str.contains(r"[vV]igilance"))


df["draw"] = (df.text.str.contains(r"(?:\sd|\nD|^D)raw"))


df["discard"] = (df.text.str.contains(r"[dD]iscard"))


df["damage"] = (df.text.str.contains(r"deals\s\d\sdamage"))


df["damage_prevention"] = (df.text.str.contains(r"[pP]revent\s"))


df["life_gain"] = (df.text.str.contains(r"gain(?:\s|s\s)\d+\slife"))


df["life_loss"] = (df.text.str.contains(r"loses") & 
                   df.text.str.contains(r"(?:their|\d+)\slife")) # capture both fixed and rational values


df["tokens"] = (df.text.str.contains(r"[cC]reate"))


df["destroy"] = (df.text.str.contains(r"[dD]estroy") &
                  (df.text.str.contains(r"don't\sdestroy\sit.") == False)) # reject indestructible's reminder text


df["return"] = (df.text.str.contains(r"[rR]eturn") &
        df.text.str.contains(r"owner's\s(?:hand|library)") & # capture hand or library bounce effects
        (df.text.str.contains(r"graveyard\sto") == False)) # exclude grave recursion


df["recursion"] = (df.text.str.contains(r"\sput|return") &
        df.text.str.contains(r"graveyard")&
        df.text.str.contains(r"hand|battlefield"))

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,multiverse_id,text,flavor,power,color,counterspell,exile,fight,...,draw,discard,damage,damage_prevention,life_gain,life_loss,tokens,destroy,return,recursion
0,1,Ancestor's Chosen,,First strike (This creature deals combat damag...,"""The will of all, by my hand done.""",4.0,W,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2,Angel of Mercy,129465.0,Flying\r\nWhen Angel of Mercy enters the battl...,Every tear shed is a drop of immortality.,3.0,W,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,4,Angelic Blessing,129711.0,Target creature gets +3/+3 and gains flying un...,Only the warrior who can admit mortal weakness...,,W,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,6,Angelic Chorus,129710.0,Whenever a creature enters the battlefield und...,The harmony of the glorious is a dirge to the ...,,W,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,7,Angelic Wall,129671.0,Defender (This creature can't attack.)\r\nFlying,"""The Ancestor protects us in ways we can't beg...",0.0,W,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Exibir todas as colunas e amostras de valores únicos em texto
print("\nNomes das colunas e valores únicos:")
for coluna in df.columns:
    print(f"\nColuna: {coluna}")
    print(df[coluna].unique())


Nomes das colunas e valores únicos:

Coluna: Unnamed: 0
[    1     2     4 ... 76228 76231 76232]

Coluna: name
["Ancestor's Chosen" 'Angel of Mercy' 'Angelic Blessing' ...
 'Vastwood Fortification // Vastwood Thicket' 'Veteran Adventurer'
 'Vine Gecko']

Coluna: multiverse_id
[    nan 129465. 129711. ... 491866. 491869. 491870.]

Coluna: text
["First strike (This creature deals combat damage before creatures without first strike.)\r\nWhen Ancestor's Chosen enters the battlefield, you gain 1 life for each card in your graveyard."
 'Flying\r\nWhen Angel of Mercy enters the battlefield, you gain 3 life.'
 "Target creature gets +3/+3 and gains flying until end of turn. (It can't be blocked except by creatures with flying or reach.)"
 ... 'When Turntimber Ascetic enters the battlefield, you gain 3 life.'
 'Veteran Adventurer is also a Cleric, Rogue, Warrior, and Wizard.\r\nThis spell costs {1} less to cast for each creature in your party.\r\nVigilance'
 'The first kicked spell you cast ea

In [11]:
# Função para limpar e tokenizar o texto
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove pontuação e converte para minúsculas
    return word_tokenize(text)

# Tokenizar a coluna 'flavor'
tokenized_text = df['flavor'].apply(preprocess)

# Contar as frequências das palavras
word_freq_text = Counter(word for doc in tokenized_text for word in doc)

# Contar a coocorrência de pares de palavras
def count_coocurrences(corpus, window_size=1):
    coocurrence = Counter()
    for doc in corpus:
        for i, word in enumerate(doc):
            for j in range(i + 1, min(i + window_size + 1, len(doc))):
                coocurrence[(word, doc[j])] += 1
    return coocurrence

coocurrence_text = count_coocurrences(tokenized_text)

# Calcular o PMI para um par de palavras
def calculate_pmi(word1, word2, coocurrence, word_freq, corpus_size):
    coocurrence_count = coocurrence.get((word1, word2), 0)
    p_word1 = word_freq[word1] / corpus_size
    p_word2 = word_freq[word2] / corpus_size
    p_word1_word2 = coocurrence_count / corpus_size
    return np.log(p_word1_word2 / (p_word1 * p_word2)) if p_word1_word2 > 0 else 0

# Calculando o PMI para todos os pares de palavras
corpus_size_text = sum(word_freq_text.values())
pmi_results = {}

# Iterando por todos os pares únicos de palavras
for (word1, word2), count in coocurrence_text.items():
    pmi = calculate_pmi(word1, word2, coocurrence_text, word_freq_text, corpus_size_text)
    pmi_results[(word1, word2)] = pmi

# Exibir os resultados do PMI
for pair, pmi_value in pmi_results.items():
    print(f"PMI entre {pair[0]} e {pair[1]}: {pmi_value}")



PMI entre the e will: -1.1005841688956313
PMI entre will e of: -0.9444512403278928
PMI entre of e all: 1.1777715654176637
PMI entre all e by: 0.14653762033491635
PMI entre by e my: 1.4827154346730396
PMI entre my e hand: 2.8068400084909095
PMI entre hand e done: 3.7164417522142683
PMI entre every e tear: 3.8169829814361456
PMI entre tear e shed: 6.752090633173702
PMI entre shed e is: 1.5561467947842151
PMI entre is e a: 1.6909485665759898
PMI entre a e drop: 2.571457680241008
PMI entre drop e of: 2.524293524161231
PMI entre of e immortality: 1.8491648491039638
PMI entre only e the: 0.6681857821077251
PMI entre the e warrior: 0.3415015518526751
PMI entre warrior e who: 2.505989981790623
PMI entre who e can: 2.153985114660958
PMI entre can e admit: 3.2388582274457636
PMI entre admit e mortal: 6.041005751927089
PMI entre mortal e weakness: 5.59917299964805
PMI entre weakness e will: 2.805556910216193
PMI entre will e be: 3.1234527612644705
PMI entre be e bolstered: 5.2647436646566375
PMI 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

