In [1]:
import pandas as pd
import nltk

In [2]:
# Load the news dataset from a path relative to the notebook's working directory.
# Later cells read its `titulo`, `conteudo` and `idNoticia` columns.
dados = pd.read_csv('data/noticias_estadao.csv')

# Join do conteúdo
Juntando os títulos das notícias com seus respectivos conteúdos,
para posteriormente facilitar a tokenização

In [3]:
# Glue each headline to its body (single space in between) so that every
# article is tokenized as one string.
materias = dados["titulo"] + " " + dados["conteudo"]
# Article identifiers, kept in the same row order as `materias`.
ids = dados["idNoticia"]

# Tokenizando conteúdo
Criando tokens com cada palavra do texto para que posteriormente possam ser indexadas e associadas aos respectivos ids das notícias

In [4]:
# Turn every article into a list of tokens, one list per row.
# NOTE(review): word_tokenize runs with its default language setting while the
# corpus looks Portuguese — confirm that is intended.
tokens = materias.map(nltk.word_tokenize)

# Indexando tokens
Criando índices invertidos com os tokens para poder aplicar os métodos de busca

In [5]:
# Inverted index: lower-cased token -> list of ids of the articles that contain
# it, in first-seen order and without repeats.
index = {}
# Shadow structure (token -> set of ids) used only for O(1) "already recorded?"
# checks; testing membership on the posting lists themselves is linear in their
# length and made the build accidentally quadratic.
_ids_vistos = {}

# Pair ids and token lists positionally rather than looking both up by label,
# so the build does not depend on the Series having a default 0..n-1 index.
for id_noticia, palavras in zip(ids, tokens):
    for palavra in palavras:
        palavra = palavra.lower()
        vistos = _ids_vistos.setdefault(palavra, set())
        if id_noticia not in vistos:
            vistos.add(id_noticia)
            # setdefault creates the key on the word's first occurrence, which
            # keeps the dictionary's key order identical to first-seen order.
            index.setdefault(palavra, []).append(id_noticia)

# The helper sets are only needed while building; release them.
del _ids_vistos

In [22]:
# Sanity check: how many distinct article ids are recorded for the token "debate".
print(len(index['debate']))

955


# Busca palavra mais próxima
Método que busca, entre as palavras do índice, a mais próxima do termo informado (menor distância de edição)

In [6]:
def busca_proximidade(termo):
    """Return the indexed word closest to ``termo`` by edit distance.

    The term is lower-cased first because every key of ``index`` is stored in
    lower case. When several words are equally close, the one that was added
    to the index first is returned.

    Args:
        termo: Word to look up (any casing).

    Returns:
        The key of ``index`` with the smallest ``nltk.edit_distance`` to the term.

    Raises:
        IndexError: If the index holds no words.
    """
    if not index:
        raise IndexError("the inverted index is empty")
    termo = termo.lower()
    # min() scans the vocabulary once and keeps the first minimal element,
    # which is the same element a stable sort would place at position 0.
    return min(index, key=lambda palavra: nltk.edit_distance(termo, palavra))

busca_proximidade('campine')
    

'camping'

# Busca AND
Método de busca conjuntiva onde todos os termos pesquisados devem estar presentes na notícia

In [7]:
def busca_and(*termos):
    """Conjunctive search: ids of the articles that contain every given term.

    Terms are lower-cased before the lookup, matching how the index keys are
    stored.

    Args:
        *termos: Words that must all be present in an article.

    Returns:
        A list (in no particular order) of article ids. The list is empty when
        no terms are given or when any term does not occur in the index.
    """
    if not termos:
        return []
    # A word missing from the index matches no article, so it contributes an
    # empty posting set instead of raising KeyError.
    conjuntos = [set(index.get(termo.lower(), ())) for termo in termos]
    return list(set.intersection(*conjuntos))

# Busca OR
Método de busca disjuntiva onde pelo menos um dos termos deve estar presente na notícia

In [8]:
def busca_or(*termos):
    """Disjunctive search: ids of the articles that contain at least one term.

    Terms are lower-cased before the lookup, matching how the index keys are
    stored.

    Args:
        *termos: Words of which at least one must be present in an article.

    Returns:
        A list (in no particular order) of article ids. Terms that do not
        occur in the index contribute nothing; with no terms the list is empty.
    """
    # `.get` with an empty default lets unknown words be skipped instead of
    # raising KeyError; set().union accepts any number of iterables, including none.
    return list(set().union(*(index.get(termo.lower(), ()) for termo in termos)))

# Busca genérica
Recebe uma sentença em formato de string, com os termos separados por um operador (`AND` ou `OR`), e aplica essa operação a todos os termos

In [9]:
def search(sentece):
    """Run a boolean query written as ``term OP term OP term ...``.

    Words at even positions are the search terms and words at odd positions
    are the operator. A single word is searched with ``AND``. The operator is
    matched case-insensitively, and any amount of whitespace may separate the
    words.

    Args:
        sentece: Query string, e.g. ``"Campina AND Grande"``.

    Returns:
        The list produced by ``busca_and`` or ``busca_or`` for the terms, or
        an empty list when the query contains no words.

    Raises:
        ValueError: If the query mixes different operators; only one kind of
            operator per query is supported.
        KeyError: If the operator is neither ``AND`` nor ``OR``.
    """
    # split() without an argument collapses repeated whitespace and never
    # yields empty strings, unlike split(" ").
    partes = sentece.split()
    if not partes:
        return []

    termos = partes[::2]
    operadores = {operador.upper() for operador in partes[1::2]}
    if len(operadores) > 1:
        # Applying the first operator to every term would silently return a
        # wrong answer for a mixed expression.
        raise ValueError(
            "mixed operators are not supported: " + ", ".join(sorted(operadores))
        )
    operacao = operadores.pop() if operadores else "AND"

    return {
        "AND": busca_and,
        "OR": busca_or
    }[operacao](*termos)

# Busca Campina, Grande (AND)

In [10]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search('Campina AND Grande')
assert len(resultado) == 12, len(resultado)
print(resultado)

[1952, 4802, 1987, 6694, 5382, 1770, 2763, 1068, 5870, 2777, 1370, 2779]


# Busca pela palavra 'candidatos'

In [11]:
# Single-term query: evaluate once, check the expected number of hits, then show the ids.
resultado = search("candidatos")
assert len(resultado) == 1395, len(resultado)
print(resultado)

[51, 73, 77, 84, 123, 127, 142, 143, 154, 161, 167, 189, 191, 263, 268, 276, 311, 343, 374, 375, 377, 383, 399, 417, 418, 444, 483, 484, 604, 605, 616, 623, 628, 632, 633, 640, 647, 651, 662, 688, 717, 718, 746, 747, 752, 761, 764, 777, 783, 790, 791, 792, 793, 795, 796, 801, 804, 813, 825, 827, 828, 829, 844, 859, 870, 871, 876, 890, 982, 985, 1005, 1011, 1014, 1016, 1019, 1020, 1034, 1040, 1050, 1053, 1058, 1059, 1068, 1075, 1077, 1086, 1091, 1096, 1097, 1099, 1103, 1106, 1108, 1112, 1113, 1115, 1116, 1117, 1119, 1123, 1124, 1128, 1131, 1134, 1140, 1143, 1148, 1158, 1162, 1163, 1165, 1166, 1168, 1170, 1172, 1174, 1175, 1178, 1186, 1189, 1191, 1193, 1194, 1195, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1208, 1209, 1210, 1213, 1214, 1218, 1228, 1229, 1230, 1234, 1236, 1237, 1244, 1248, 1251, 1257, 1265, 1267, 1269, 1274, 1277, 1278, 1292, 1293, 1295, 1306, 1307, 1310, 1315, 1317, 1324, 1327, 1330, 1337, 1338, 1339, 1343, 1344, 1348, 1349, 1355, 1356, 1359, 1364, 1371, 1372, 1374, 1380,

# Busca por debate, presidencial (OR)

In [12]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search("debate OR presidencial")
assert len(resultado) == 1770, len(resultado)
print(resultado)

[1, 2, 4099, 4103, 4109, 16, 4113, 4119, 24, 4126, 4130, 4131, 4134, 46, 50, 4155, 61, 4158, 4161, 4166, 4169, 4170, 79, 86, 88, 4197, 4198, 105, 107, 109, 4207, 4215, 4216, 122, 4223, 130, 4234, 140, 4236, 143, 4243, 4246, 158, 165, 166, 4261, 4265, 4267, 4274, 4276, 182, 4278, 4279, 189, 4286, 4290, 4294, 199, 201, 203, 204, 205, 209, 213, 4316, 4318, 4319, 228, 234, 4331, 235, 238, 240, 241, 244, 4341, 250, 4348, 4350, 255, 4354, 259, 260, 4355, 261, 4360, 4362, 4364, 273, 4373, 278, 4383, 295, 4395, 4399, 4407, 4408, 311, 4411, 4413, 4415, 4416, 4418, 327, 4427, 334, 336, 4433, 4437, 343, 345, 4442, 348, 353, 355, 357, 363, 4460, 4459, 368, 369, 4466, 4465, 4467, 4468, 4470, 374, 378, 4474, 4479, 4480, 383, 4484, 389, 392, 396, 399, 400, 4504, 4505, 4508, 413, 4511, 416, 4513, 417, 419, 4516, 4517, 4518, 4520, 4521, 4525, 4530, 4547, 4548, 4549, 453, 452, 4555, 461, 4558, 463, 4563, 4564, 471, 472, 475, 478, 484, 487, 4584, 491, 4589, 4593, 4606, 4611, 4615, 4617, 4619, 4620, 4625,

# Busca debate, presidencial (AND)

In [13]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search("debate AND presidencial")
assert len(resultado) == 201, len(resultado)
print(resultado)

[2054, 4615, 3592, 4619, 2574, 6670, 1038, 1043, 2068, 2069, 24, 4635, 4636, 3611, 1058, 1571, 4134, 7208, 6185, 2092, 1069, 4654, 2608, 7217, 1586, 1588, 3127, 6713, 6202, 2107, 2108, 5181, 7232, 1600, 1102, 1111, 4706, 3689, 1131, 1132, 2669, 1647, 1138, 1140, 5748, 1658, 637, 1151, 2176, 1155, 3206, 1158, 2184, 1672, 2705, 3219, 2711, 1180, 2211, 3235, 4261, 166, 4775, 3236, 3242, 1197, 2224, 5299, 3251, 3252, 6331, 5307, 703, 704, 3268, 713, 4814, 2255, 3792, 3793, 722, 2774, 4823, 2266, 5338, 3805, 4318, 3806, 3807, 3811, 1767, 6377, 234, 6892, 3830, 3322, 255, 7425, 3844, 776, 2313, 779, 6412, 1291, 6931, 1811, 4885, 3860, 1816, 1313, 2338, 1315, 3874, 805, 2853, 3367, 1320, 1325, 814, 1326, 3887, 4913, 5937, 1844, 5941, 5942, 1845, 6969, 1339, 5951, 2880, 1855, 6978, 1348, 327, 1864, 6987, 3916, 7505, 1362, 1873, 2388, 1367, 4442, 7004, 1374, 2911, 1379, 5479, 5480, 4460, 5996, 1391, 1394, 883, 1399, 6521, 3450, 6523, 1404, 3454, 1406, 6015, 6017, 4994, 3469, 2449, 1939, 4504, 4

# Busca presidenciáveis, corruptos (OR)

In [14]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search("presidenciáveis OR corruptos")
assert len(resultado) == 164, len(resultado)
print(resultado)

[5121, 1537, 3843, 2051, 5895, 777, 523, 2571, 5133, 2574, 272, 7441, 5906, 6931, 2068, 2069, 3860, 789, 3607, 538, 5659, 539, 2080, 3616, 2088, 4137, 2860, 1325, 1326, 2093, 304, 3633, 3377, 2609, 4660, 1847, 3641, 1851, 4926, 3391, 1343, 1859, 2628, 4677, 3397, 68, 841, 4170, 2123, 330, 2125, 3664, 4177, 5713, 3666, 1109, 4184, 93, 2144, 5472, 3684, 2149, 2660, 4199, 2152, 1129, 4965, 1639, 874, 2669, 3182, 2672, 2676, 5237, 3188, 3444, 3446, 3447, 375, 4219, 3962, 893, 2686, 2678, 126, 6785, 6783, 1158, 4743, 4488, 2442, 3466, 4492, 4235, 3217, 3730, 1428, 149, 3479, 5271, 6809, 2458, 6043, 6813, 1693, 160, 3489, 4258, 3747, 4260, 422, 2471, 426, 3243, 2732, 1198, 430, 3248, 2225, 176, 2740, 1461, 180, 3767, 4025, 3260, 1470, 6079, 4034, 4294, 2248, 1481, 3786, 4042, 2764, 2253, 2507, 7630, 456, 7369, 1487, 5587, 2516, 7635, 7641, 7642, 3546, 6112, 3042, 6115, 3046, 2023, 2024, 2026, 2028, 2285, 2030, 4847, 497, 6131, 5110, 5115, 2813, 6910]


# Busca presidenciáveis, corruptos (AND)

In [15]:
# Evaluate the query once; no article is expected to contain both words.
resultado = search("presidenciáveis AND corruptos")
assert len(resultado) == 0, len(resultado)
print(resultado)

[]


# Busca Belo, Horizonte (OR)

In [16]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search("Belo OR Horizonte")
assert len(resultado) == 331, len(resultado)
print(resultado)

[4100, 4, 1033, 2061, 13, 2063, 2064, 2065, 6160, 4115, 4116, 5133, 1046, 1044, 2074, 6174, 3109, 3110, 2044, 3119, 48, 2095, 2098, 2101, 3129, 1081, 1086, 3135, 1088, 1087, 3138, 1090, 3142, 3143, 1094, 74, 75, 2127, 4177, 5205, 6231, 5208, 4184, 3160, 6240, 2145, 3170, 1133, 3183, 4209, 3186, 3188, 2164, 1142, 5242, 5247, 5250, 4230, 4234, 3214, 4241, 4245, 3224, 3225, 4251, 1180, 1183, 2207, 1186, 2213, 6312, 2219, 6316, 6317, 6140, 3248, 4273, 2233, 3262, 196, 4294, 206, 1230, 6354, 211, 3284, 3288, 1242, 5341, 7396, 4333, 1264, 3316, 5364, 1270, 3326, 1280, 4357, 4361, 4372, 4375, 7451, 7453, 5407, 5410, 1315, 7460, 1318, 7463, 5417, 4398, 304, 5425, 1329, 1331, 2362, 2363, 2365, 1343, 332, 2381, 5456, 1362, 3413, 4437, 1367, 2395, 7519, 5476, 1381, 4454, 6500, 5477, 1386, 1388, 1389, 6512, 3445, 374, 7545, 1405, 1406, 7552, 7554, 6535, 5514, 7566, 1424, 7569, 4498, 1431, 1434, 5535, 1445, 5541, 3506, 2483, 5559, 6587, 2494, 6591, 3522, 7619, 5574, 3528, 3531, 3534, 4559, 5583, 55

# Busca Belo, Horizonte (AND)

In [17]:
# Evaluate the query once, check the expected number of hits, then show the ids.
resultado = search("Belo AND Horizonte")
assert len(resultado) == 242, len(resultado)
print(resultado)

[4, 1033, 3593, 13, 2061, 2063, 2064, 2065, 6160, 4115, 3604, 4116, 1046, 2074, 6174, 5665, 4646, 4649, 3626, 2044, 3119, 48, 2095, 2098, 2101, 3129, 1081, 1598, 1086, 3135, 1088, 3138, 1603, 1090, 1605, 3142, 3143, 1094, 5702, 5706, 74, 75, 2127, 4177, 3069, 5205, 6231, 5208, 4184, 3160, 6240, 2145, 4710, 1133, 3183, 624, 2672, 2673, 3698, 3186, 3188, 2164, 4720, 1142, 3700, 5242, 4723, 634, 4604, 4230, 6794, 4234, 4241, 4245, 3224, 4251, 1180, 4765, 1186, 4771, 2213, 6312, 2219, 4780, 6316, 4273, 4787, 4797, 3262, 1734, 4294, 206, 1230, 6354, 1242, 1762, 5133, 7396, 4333, 2798, 1264, 6897, 4850, 3316, 6900, 1270, 3326, 1280, 1797, 1798, 4357, 4361, 3852, 3860, 4375, 7451, 7453, 4893, 5407, 5919, 1823, 5410, 1315, 7460, 7463, 5417, 4398, 5425, 1329, 1331, 5946, 1343, 1858, 1861, 4937, 3915, 332, 4939, 2381, 4941, 1872, 1362, 3923, 1877, 3926, 3413, 1367, 5976, 5977, 4437, 1880, 2395, 2909, 1087, 7519, 1883, 4958, 4957, 1892, 1381, 4454, 1386, 1388, 1389, 5975, 6512, 1909, 3445, 7032, 

# Busca por mais de dois termos - campina, grande, candidatos (AND)

In [18]:
# Three-term conjunctive query: evaluate once, check the hit count, then show the ids.
resultado = search("campina AND grande AND candidatos")
assert len(resultado) == 2, len(resultado)
print(resultado)

[1068, 6694]


# Busca por mais de dois termos - campina, grande, candidatos (OR)

In [19]:
# Three-term disjunctive query: evaluate once, check the hit count, then show the ids.
resultado = search("campina OR grande OR candidatos")
assert len(resultado) == 2664, len(resultado)
print(resultado)

[1, 2, 4, 9, 17, 18, 22, 28, 30, 33, 37, 38, 41, 43, 46, 47, 50, 51, 56, 61, 64, 68, 73, 76, 77, 79, 80, 81, 84, 85, 90, 91, 94, 95, 98, 106, 108, 116, 123, 125, 127, 130, 138, 139, 142, 143, 145, 154, 161, 167, 170, 176, 180, 182, 187, 188, 189, 191, 195, 197, 198, 199, 200, 208, 214, 215, 218, 221, 225, 235, 240, 248, 253, 259, 263, 266, 268, 274, 276, 305, 311, 312, 313, 318, 326, 327, 328, 343, 352, 374, 375, 377, 383, 396, 399, 417, 418, 421, 425, 443, 444, 446, 450, 455, 460, 463, 467, 481, 483, 484, 489, 497, 500, 501, 511, 514, 518, 525, 527, 529, 530, 546, 549, 553, 565, 570, 578, 592, 593, 600, 604, 605, 612, 616, 622, 623, 624, 628, 632, 633, 637, 639, 640, 643, 647, 651, 658, 660, 662, 664, 676, 680, 687, 688, 696, 705, 713, 714, 717, 718, 719, 720, 722, 725, 727, 740, 746, 747, 752, 755, 757, 761, 764, 772, 777, 783, 788, 790, 791, 792, 793, 795, 796, 797, 799, 801, 802, 804, 813, 814, 819, 822, 825, 827, 828, 829, 834, 836, 838, 841, 844, 848, 850, 851, 852, 857, 859, 861

# Busca por mais de dois termos - debate, presidencial, presidenciáveis (AND)

In [20]:
# Three-term conjunctive query: evaluate once, check the hit count, then show the ids.
resultado = search("debate AND presidencial AND presidenciáveis")
assert len(resultado) == 10, len(resultado)
print(resultado)

[2026, 2669, 2574, 1325, 1326, 6931, 2068, 2069, 3860, 7641]
