In [326]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import find
import sys
import re

# Pré-processar o CSV

In [472]:
# Load the two input tables (paths are relative to the working directory):
#   cards - one row per card; the cells below clean its 'mana', 'desc', 'name',
#           'type', 'rarity' and 'url' columns.
#   urls  - crawler table that is later queried by its 'Número', 'Site' and
#           'Link' columns.
cards = pd.read_csv("./cards.csv")
urls = pd.read_csv("./CSV/CrawlerHeuristica.csv")

In [473]:
def _normalize_mana(mana):
    """Collapse a raw mana description into a compact lowercase code.

    Colour words are replaced by one-letter codes (White->w, Blue->u,
    Green->g, Red->r, Black->b), the word 'Mana' is dropped, 'or' becomes
    '/', and commas, apostrophes, spaces and square brackets are removed.
    Anything that is not a string (e.g. a missing value) becomes ''.
    """
    if not isinstance(mana, str):
        return ''
    # NOTE(review): the 'or' -> '/' replacement is a plain substring replace,
    # so it also fires inside longer words — confirm the raw data never
    # contains such words.
    shortened = (mana.replace('White', 'w').replace('Blue', 'u')
                     .replace('Green', 'g').replace('Red', 'r').replace('Black', 'b')
                     .replace('Mana', '').replace('or', '/'))
    # Raw string: '\[' and '\]' are invalid escapes in a normal string literal.
    return re.sub(r"[,' \[\]]", '', shortened).lower()


cards['mana'] = cards['mana'].apply(_normalize_mana)

In [474]:
def _stripped_or_empty(value):
    """Return ``value.strip()`` for a plain ``str``; every other type maps to ''."""
    if type(value) == type(''):
        return value.strip()
    return ''


# Apply the same whitespace clean-up to each free-text column.
for text_column in ('desc', 'name', 'type'):
    cards[text_column] = cards[text_column].apply(_stripped_or_empty)

In [475]:
def _expand_rarity(r):
    """Spell out a one-letter rarity code.

    'R', 'C', 'U' and 'M' become 'Rare', 'Common', 'Uncommon' and 'Mythic'.
    Any other plain string is returned unchanged; non-strings become ''.
    """
    if r == 'R':
        return 'Rare'
    if r == 'C':
        return 'Common'
    if r == 'U':
        return 'Uncommon'
    if r == 'M':
        return 'Mythic'
    if type(r) == type(''):
        return r
    return ''


cards['rarity'] = cards['rarity'].apply(_expand_rarity)

In [500]:
def get_link_by_tuple(site, idx, table=None):
    """Look up the crawled link for a (site, number) pair.

    Parameters
    ----------
    site : str
        Value to match against the 'Site' column.
    idx : int or str
        Page number; converted with ``int()`` and matched against 'Número'.
    table : pandas.DataFrame, optional
        Table with 'Número', 'Site' and 'Link' columns. Defaults to the
        module-level ``urls`` frame, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    The 'Link' value of the first matching row.

    Raises
    ------
    IndexError
        If no row matches (same exception type as before, now with a
        message naming the pair that was not found).
    """
    if table is None:
        table = urls
    wanted = (table['Número'] == int(idx)) & (table['Site'] == site)
    links = list(table[wanted]['Link'])
    if not links:
        raise IndexError(
            "no crawler link found for site={!r}, number={!r}".format(site, idx))
    return links[0]
def _link_for_card_url(u):
    """Translate a stored card url into the crawled link.

    The fourth '/'-separated piece of ``u`` is split on '.com': the part
    before it (plus '.com') is the site, and the part after it, up to
    '.html', is the page number handed to ``get_link_by_tuple``.
    """
    page = u.split('/')[3]
    pieces = page.split('.com')
    site = pieces[0] + '.com'
    number = pieces[1].split('.html')[0]
    return get_link_by_tuple(site, number)


# NOTE(review): this overwrites cards['url'] with the resolved links, so running
# the cell a second time would feed those links back through the parser.
cards['url'] = cards['url'].apply(_link_for_card_url)

# Start from Here

In [501]:
# Persist the cleaned table. index=False keeps the row index out of the file;
# otherwise reloading it with a plain read_csv adds an extra unnamed column.
cards.to_csv('final_cards.csv', index=False)

In [502]:
# Reload the preprocessed table from final_cards.csv, replacing the in-memory
# `cards` frame.
# NOTE(review): read_csv presumably turns the empty strings produced during
# preprocessing back into NaN — confirm before relying on the text columns.
cards = pd.read_csv("./final_cards.csv")

In [7]:
# Word-level bag-of-words vectorizer with no custom tokenizer, preprocessor or
# stop-word list.
# NOTE(review): these options look like the library defaults — confirm.
vectorizer_options = dict(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)
count_vect = CountVectorizer(**vectorizer_options)

In [51]:
# The documents to index are the card names. Names that were empty strings
# come back from the CSV round-trip as NaN, which the vectorizer cannot
# tokenize, so missing values are mapped back to ''.
docs = cards['name'].fillna('')

In [64]:
# Fit the vectorizer on the card names and build the sparse count matrix
# (one row per document in `docs`, one column per vocabulary term).
words = count_vect.fit_transform(docs)

In [65]:
# Vocabulary terms, indexed by column of `words`.
# get_feature_names() was deprecated and later removed from scikit-learn;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. list() keeps `features` a plain list.
if hasattr(count_vect, 'get_feature_names_out'):
    features = list(count_vect.get_feature_names_out())
else:
    features = count_vect.get_feature_names()

In [89]:
# Flatten the sparse count matrix into (row, column, value) triples:
# row = document index, column = vocabulary index, value = term count.
# NOTE(review): the gap-encoding helpers used later compare each triple with
# the previous one, so they presumably rely on find() returning the entries
# grouped by column with ascending rows — confirm for the installed SciPy.
row, column, values = find(words)
terms = list(zip(row, column, values))

In [94]:
# Despite its name, `idf` holds, for every vocabulary term, the sum of its
# counts over all documents; the tf-idf helpers divide by this total.
idf = dict.fromkeys(features, 0)
for _doc, col, count in terms:
    idf[features[col]] += count

In [286]:
def tf_function(lr, lf, lv, r, f, v):
    """Posting for the plain index: a one-element list ``[(row, count)]``.

    The previous-entry arguments (lr, lf, lv) and the feature ``f`` are
    accepted for signature compatibility and ignored.
    """
    posting = (r, v)
    return [posting]
def tf_idf_function(lr, lf, lv, r, f, v):
    """Posting with a weighted count: ``[(row, v / idf[f])]``.

    ``idf`` is the module-level mapping from feature to its total count.
    The previous-entry arguments (lr, lf, lv) are ignored.
    """
    weight = v / idf[f]
    return [(r, weight)]
def tf_compressed_function(lr, lf, lv, r, f, v):
    """Gap-encoded posting.

    When the previous entry belongs to the same feature (``lf == f``) the
    row is stored as the difference ``r - lr``; otherwise the absolute row
    is stored. The count ``v`` is kept as is.
    """
    if lf == f:
        return [(r - lr, v)]
    return [(r, v)]
def tf_idf_compressed_function(lr, lf, lv, r, f, v):
    """Gap-encoded posting with a weighted count.

    The row is stored as ``r - lr`` when the previous entry has the same
    feature and as ``r`` otherwise; the count is divided by ``idf[f]``
    (module-level mapping from feature to its total count).
    """
    if lf == f:
        doc = r - lr
    else:
        doc = r
    return [(doc, v / idf[f])]

In [292]:
def create_inverse_index(function):
    """Build an inverted index from the module-level ``terms`` triples.

    ``function`` is called once per (row, column, value) triple as
    ``function(prev_row, prev_feature, prev_value, row, feature, value)``,
    with the three "previous" arguments set to ``None`` for the first
    triple. Whatever it returns is concatenated with ``+`` onto the entry
    stored for that feature, so it must return a concatenable sequence
    (a list of tuples, a bytearray, ...).

    Returns a dict mapping each feature name to its concatenated postings.
    """
    postings_by_feature = {}
    previous = (None, None, None)

    for doc, col, count in terms:
        feature = features[col]
        entry = function(*previous, doc, feature, count)
        if feature not in postings_by_feature:
            postings_by_feature[feature] = entry
        else:
            postings_by_feature[feature] = postings_by_feature[feature] + entry
        previous = (doc, feature, count)

    return postings_by_feature

        
# Build one inverted index per posting format:
#   tf                 - absolute row, raw count
#   tf_idf             - absolute row, count divided by idf[feature]
#   tf_compressed      - row gap within a feature, raw count
#   tf_idf_compressed  - row gap within a feature, count divided by idf[feature]
inverse_index_tf = create_inverse_index(tf_function)
inverse_index_tf_idf = create_inverse_index(tf_idf_function)
inverse_index_tf_compressed = create_inverse_index(tf_compressed_function)
inverse_index_tf_idf_compressed = create_inverse_index(tf_idf_compressed_function)

In [293]:
# Display the gap-encoded index.
# NOTE(review): this renders the whole dictionary; consider showing only a
# handful of entries to keep the notebook output small.
inverse_index_tf_compressed

{'10th': [(901, 1), (2672, 1), (213, 1), (844, 1)],
 '2007': [(3677, 1)],
 '2008': [(3556, 1)],
 '2010': [(2965, 1), (1639, 1)],
 '2011': [(116, 1),
  (108, 1),
  (89, 1),
  (970, 1),
  (284, 1),
  (1946, 1),
  (367, 1),
  (485, 1)],
 '2012': [(916, 1),
  (67, 1),
  (1574, 1),
  (208, 1),
  (561, 1),
  (649, 1),
  (302, 1),
  (46, 1),
  (468, 1)],
 '2013': [(253, 1),
  (237, 1),
  (88, 1),
  (35, 1),
  (117, 1),
  (127, 1),
  (461, 1),
  (264, 1),
  (30, 1),
  (2, 1),
  (131, 1),
  (542, 1),
  (67, 1),
  (333, 1),
  (11, 1),
  (122, 1),
  (548, 1),
  (54, 1),
  (824, 1),
  (71, 1),
  (11, 1)],
 '2014': [(39, 1),
  (209, 1),
  (78, 1),
  (333, 1),
  (6, 1),
  (75, 1),
  (169, 1),
  (97, 1),
  (14, 1),
  (241, 1),
  (823, 1),
  (78, 1),
  (227, 1),
  (737, 1),
  (64, 1),
  (227, 1),
  (103, 1),
  (211, 1),
  (307, 1),
  (274, 1)],
 '2015': [(335, 1),
  (48, 1),
  (423, 1),
  (19, 1),
  (44, 1),
  (106, 1),
  (201, 1),
  (1528, 1),
  (57, 1),
  (713, 1),
  (10, 1),
  (446, 1),
  (333, 1)]

In [268]:
def int_to_vb_code(value):
    """Encode a non-negative integer as a variable-byte (VB) code.

    The value is split into 7-bit groups emitted most-significant first.
    Only the last byte has its high bit (0x80) set, which marks the end of
    the code; this is the layout ``vb_code_to_int`` decodes.

    The previous recursive version called an undefined name (``vb_code``)
    and, because it recursed on the remainder only, dropped 7-bit groups
    that were entirely zero (e.g. 16384 was encoded like 128). The encoder
    is now iterative and emits every group.

    Parameters
    ----------
    value : int
        Non-negative integer to encode.

    Returns
    -------
    bytearray
        The VB code, at least one byte long.

    Raises
    ------
    ValueError
        If ``value`` is negative.
    """
    if value < 0:
        raise ValueError(
            "int_to_vb_code() requires a non-negative integer, got {!r}".format(value))
    # Lowest group first (it carries the terminator bit), reversed at the end.
    groups = [(value & 0x7F) | 0x80]
    value >>= 7
    while value > 0:
        groups.append(value & 0x7F)
        value >>= 7
    groups.reverse()
    return bytearray(groups)

def vb_code_to_int(bytesarray):
    """Decode a single VB code (a sequence of byte values) into an integer.

    Every byte contributes its value shifted by 7 bits per position counted
    from the end; the terminator bit (0x80) carried by the last byte is then
    subtracted from the total.
    """
    weighted = sum(
        byte << (7 * position)
        for position, byte in enumerate(reversed(bytesarray))
    )
    return weighted - 0x80

In [301]:
def get_size(obj, seen=None):
    """Return the deep size of ``obj`` in bytes, as reported by sys.getsizeof.

    The walk descends into dict values and keys, into ``__dict__`` of
    objects that have one, and into the items of any other iterable except
    str, bytes and bytearray. ``seen`` is the set of ``id()`` values that
    were already counted; an object reached a second time contributes 0.
    """
    own_size = sys.getsizeof(obj)
    visited = set() if seen is None else seen

    marker = id(obj)
    if marker in visited:
        return 0
    # Register the object before descending so self-referential containers
    # terminate instead of recursing forever.
    visited.add(marker)

    total = own_size
    if isinstance(obj, dict):
        for item in obj.values():
            total += get_size(item, visited)
        for key in obj.keys():
            total += get_size(key, visited)
    elif hasattr(obj, '__dict__'):
        total += get_size(obj.__dict__, visited)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        for item in obj:
            total += get_size(item, visited)
    return total

In [300]:
def tf_compressed(lr, lf, lv, r, f, v):
    """Posting stored as raw bytes: the VB code of the absolute row ``r``.

    The count ``v`` and the previous-entry arguments are ignored, so the
    resulting index holds document rows only.
    """
    encoded_row = int_to_vb_code(r)
    return encoded_row


# Bytes saved by the VB-encoded index compared with the plain (row, count) index.
plain_index_size = get_size(create_inverse_index(tf_function))
packed_index_size = get_size(create_inverse_index(tf_compressed))
plain_index_size - packed_index_size

1992833

In [302]:
# Keep the VB-encoded index (feature -> concatenated VB codes of its rows).
compressed = create_inverse_index(tf_compressed)

In [322]:
def vb_code_to_int_array(bytearrays):
    """Decode a run of concatenated VB codes into a list of gaps.

    Bytes are collected until one with the terminator bit (0x80) is seen;
    the collected code is decoded with ``vb_code_to_int`` and the
    difference from the previously decoded value (0 for the first code)
    is appended to the result.
    """
    gaps = []
    previous = 0
    pending = bytearray()
    for byte in bytearrays:
        pending.append(byte)
        if byte & 0x80:
            decoded = vb_code_to_int(pending)
            gaps.append(decoded - previous)
            previous = decoded
            pending = bytearray()
    return gaps

[901, 2672, 213, 844]

In [323]:
# Decode the VB-encoded postings of the term '10th' as a spot check.
vb_code_to_int_array(compressed['10th'])

[901, 2672, 213, 844]