# Preprocesamiento de datos

### Intento encontrar la codificacion del archivo

In [23]:
import chardet

# num_bytes = 10000
with open('biden_cash.txt', 'rb') as f:
    # como el archivo es demasiado grande se puede limitar el tamaño de lectura
    # rawdata = f.read(num_bytes)
    rawdata = f.read()
    result = chardet.detect(rawdata)
    encoding = result['encoding']

print(f'La codificación de caracteres detectada es: {encoding}')

La codificación de caracteres detectada es: MacRoman


### Convirtiendo la codificacion MacRoman a UTF-8

In [24]:
# Intentar cambiar la codificación de esta manera me genero más problemas, no recomendable
with open('biden_cash.txt', 'r', encoding='macroman') as input_file:
    text = input_file.read()

with open('biden_cash_enc.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(text)

In [245]:
# Intento de decodificación
encodings = ['ascii', 'big5', 'big5hkscs', 'cp037', 'cp424', 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857', 'iso8859_1', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16']

with open('texto.txt', 'rb') as f:
    text = f.read()

for num, encoding in enumerate(encodings):
    try:
        decoded_text = text.decode(encoding)
        print(f"{num} Decodificando con {encoding}: {decoded_text}")
    except UnicodeDecodeError:
        print(f"{num} Fallo la codification con {encoding}")

0 Fallo la codification con ascii
1 Fallo la codification con big5
2 Fallo la codification con big5hkscs
3 Decodificando con cp037: àÍÊ/>À ÍÊShªBz%ÑÁ>
4 Fallo la codification con cp424
5 Decodificando con cp437: Durand AurΓêÜ┬⌐lien
6 Decodificando con cp500: àÍÊ/>À ÍÊShªBz%ÑÁ>
7 Decodificando con cp720: Durand Aurظêأ┬رlien
8 Decodificando con cp737: Durand AurέΙγ┬σlien
9 Decodificando con cp775: Durand AurŌłÜ┬®lien
10 Decodificando con cp850: Durand AurÔêÜ┬®lien
11 Decodificando con cp852: Durand AurÔłÜ┬ęlien
12 Decodificando con cp855: Durand AurРѕџ┬Еlien
13 Fallo la codification con cp856
14 Decodificando con cp857: Durand AurÔêÜ┬®lien
15 Decodificando con iso8859_1: Durand AurâÂ©lien
16 Decodificando con iso8859_2: Durand AurâÂŠlien
17 Decodificando con iso8859_3: Durand AurâÂİlien
18 Decodificando con iso8859_4: Durand AurâÂŠlien
19 Decodificando con iso8859_5: Durand AurтТЉlien
20 Fallo la codification con iso8859_6
21 Decodificando con iso8859_7: Durand AurβΒ©lien
2

### Ahora si a preprocesar

In [41]:
import csv

csv.field_size_limit(10000000)
input_file = open('biden_cash.txt', 'r', encoding='macroman')
# considero que es mejora cambiar la codificación aquí
output_file = open('biden_cash_clean.txt', 'w', encoding='utf-8')

In [42]:
reader = csv.reader(input_file, delimiter='|')
writer = csv.writer(output_file, delimiter='|')

num_cols = 25  # Número de columnas esperado en cada fila
for row in reader:
    if len(row) < num_cols:
        # Agregar valores faltantes
        row.extend([''] * (num_cols - len(row)))
    elif len(row) > num_cols:
        # Eliminar columnas adicionales
        row = row[:num_cols]
    writer.writerow(row)

input_file.close()
output_file.close()

## Procesando los datos

In [45]:
import pandas as pd

df = pd.read_csv('biden_cash_clean.txt', sep='|', header=None, on_bad_lines='skip', dtype={0: str, 2: str, 3: str, 17: str, 18: str, 19: str, 20: str, 21: str, 22: str, 23: str})

#### Buscando caracteres raros

In [401]:
# with open('sopa.txt', 'r', encoding='utf-8') as file:
with open('biden_cash_clean.txt', 'r', encoding='utf-8') as file:
    content = file.read()
    unique_chars = set(content)
    result = ''.join(unique_chars)

In [402]:
result

'\x14∏^ARÌèï\x15ÒVƒÆË:‚åöt?%W∑≤™Öøî8ºü\x0e(ÀyP§ÈÁ\x0b√π≠êãáÚn¶\\6[p\x0cªÕÅõ\x13“5≥¥ìùT\x16}G°ÏCxΩk ÑûD®€\uf8ff÷K<ÜÔﬁó\x1aO´Ã-ä\x02à⁄›fo’\x1e£a,E$|LZßvMwh\x11±7œ≈‰d…\xa0J\x01]â)FòmÎ»H`bçS„BQ\x1bs–©Ug\'•¢∞!Är\x03∂\x08+cñí~∫\x060éØqX\x04\x18†3&.ÍYÂ=Ç∆#u\x07ô\ti‹·\x19ﬂŒ{z◊>ú¨É”N14—2\nlµj"\x1fŸ‡æ9ë_*e/ÿÊI@‘¬'

In [409]:
import re
# result = re.sub(r'[a-zA-Z0-9\u0080-\u00FF]', '', result)
# result = re.sub(r'[a-zA-Z0-9]', '', result)
valores = r'[^\w\s\x01-\x20\uf8ff\\/\()\,\.\[\]\-]'

result = re.findall(valores, result)
result = ''.join(result)

In [410]:
result

'∏^:‚?%∑≤™§√≠¶“≥¥}°®€÷<´⁄›’£$|±≈‰…»`„–©\'•¢∞!∂+~∫†&=∆#‹·{◊>¨”—"‡*@‘¬'

In [295]:
# result = result.replace(' ', '')
# resultado = """ˆïš?º¿§¶ªÅ“¥°Ï¹€®ž«Ž´Ã¯’£±œ‰…\Î»`„³¦©¢²¼+¡¸¤†Â˜‹·Œ>”¨Šµ"Ÿ‡‘¬"""
# resultado



'ˆïš?º¿§¶ªÅ“¥°Ï¹€®ž«Ž´Ã¯’£±œ‰…\\Î»`„³¦©¢²¼+¡¸¤†Â˜‹·Œ>”¨Šµ"Ÿ‡‘¬'

In [411]:
regex = '[' + re.escape(result) + ']'

In [412]:
reemplazo = {'√©': 'é', '√°': 'á', '√¥': 'ô', '¬†': ' ', '√¢': 'â', '√£': 'ã', '©ú': 'Ü', '©ñ': 'Ö', '©≠': 'í', 'Ё®': 'ó'}
columnas = [3]

for columna in columnas:
    for c, v in reemplazo:
        df[columna] = df[columna].str.replace(c, v)

In [413]:
# matches = df[df[3].str.contains('Äò', na=False)]
# matches = df[df[3].str.len() > 110]
matches = df[df[3].str.contains(regex, na=False)]
# matches = df[~df[3].str.contains('[a-zA-Z0-9]', na=True)]

In [414]:
matches

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
313,5163830067877201,09/2023,941,Rodrigo Garcú≠a,BANCO SANTANDER S.A.,MASTERCARD,PLATINUM,DEBIT,Calle Orense 28,,...,Email,,,,,,,,,
522,4915663053072469,03/2023,007,Alexander Mart&iacutenez Cuin,"BANCO MERCANTIL DEL NORTE S.A.., INSTITUCION D...",VISA,ELECTRON,DEBIT,,,...,,,,,,,,,,
858,4152313464569537,12/2023,786,Edgar Omar Acu&ntildea,"BBVA BANCOMER, S.A.",VISA,CLASSIC,DEBIT,,,...,,,,,,,,,,
934,4152313492834200,04/2023,987,Briseida Guti&eacuterrez Moreno,"BBVA BANCOMER, S.A.",VISA,CLASSIC,DEBIT,,,...,,,,,,,,,,
991,4152313391733628,04/2023,118,Diana yaneth Guzm&aacuten,"BBVA BANCOMER, S.A.",VISA,CLASSIC,DEBIT,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087982,4908452706487004,11/2023,366,???e??? G??????,"PIRAEUS BANK, S.A.",VISA,CLASSIC,CREDIT,???a?? ?e???? 11,NO,...,,6946689607,,,,,,,,
2087991,5512382330495070,12/2023,294,Ana Luisa Gonz&aacutelez Mart&iacutenez,BANCO AZTECA S.A. INSTITUCION DE BANCA MULTIPLE,MASTERCARD,STANDARD,DEBIT,,MN,...,,,,,,,,,,
2088022,4222000358886327,11/2023,705,JoúÉ†£o Paulo Gomes,"ITAU UNIBANCO, S.A.",VISA,GOLD,CREDIT,Rua Conde Maur√É¬≠cio de Nassau,NO,...,,65996808507,,,,,,,,
2088120,6375290337814116,12/2023,941,AdúÉ†£o P S Neto,,MAESTRO,PERSONAL,DEBIT,12 de Outubro 1661 Cristo Rei,49,...,,,,,,,,,,


In [318]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,6226230733425407,12/2023,695,,CHINA MINSHENG BANK,CHINA UNION PAY,CLASSIC,CREDIT,,NO,...,,,,,,,,,,
1,6221550408456006,10/2023,399,,PINGAN BANK,CHINA UNION PAY,CLASSIC,CREDIT,,NO,...,,,,,,,,,,
2,4751290214696189,10/2023,769,Ilona Urbanska,NATIONAL WESTMINSTER BANK PLC,VISA,CLASSIC,DEBIT,3 Waldgrave Place,NO,...,,07305581769,,,,,,,,
3,4127525027387386,11/2023,360,Jaco Botha,"FIRSTRAND BANK, LTD.",VISA,GOLD,DEBIT,,,...,,,,,,,,,,
4,4430450055540321,08/23,344,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088251,6070936031539213,03/2023,432,NAAZNIZEROX,PUNJAB NATIONAL BANK,RUPAY,CLASSIC,DEBIT,NarodaINDIA,,...,,,,,,,,,,
2088252,4160852007761085,10/2023,363,Mariah Martinez,"RABOBANK, N.A.",VISA,CLASSIC,DEBIT,1101 concord ave,CA,...,,8053544851,,,,,,,,
2088253,5170239219091474,03/2023,480,,BOKIS,MASTERCARD,GOLD,DEBIT,,NO,...,,,,,,,,,,
2088254,5275190073194314,03/2023,974,Gari Motino,"BANK OF AMERICA, NATIONAL ASSOCIATION",MASTERCARD,ENHANCED,DEBIT,,GA,...,garysoriano7@hotmail.com,,,,,,,,,


In [415]:
diff = pd.merge(df, matches, how='outer', indicator=True)
diff = diff[diff['_merge'] == 'left_only'].drop('_merge', axis=1)

diff

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,6226230733425407,12/2023,695,,CHINA MINSHENG BANK,CHINA UNION PAY,CLASSIC,CREDIT,,NO,...,,,,,,,,,,
1,6221550408456006,10/2023,399,,PINGAN BANK,CHINA UNION PAY,CLASSIC,CREDIT,,NO,...,,,,,,,,,,
2,4751290214696189,10/2023,769,Ilona Urbanska,NATIONAL WESTMINSTER BANK PLC,VISA,CLASSIC,DEBIT,3 Waldgrave Place,NO,...,,07305581769,,,,,,,,
3,4127525027387386,11/2023,360,Jaco Botha,"FIRSTRAND BANK, LTD.",VISA,GOLD,DEBIT,,,...,,,,,,,,,,
4,4430450055540321,08/23,344,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088251,6070936031539213,03/2023,432,NAAZNIZEROX,PUNJAB NATIONAL BANK,RUPAY,CLASSIC,DEBIT,NarodaINDIA,,...,,,,,,,,,,
2088252,4160852007761085,10/2023,363,Mariah Martinez,"RABOBANK, N.A.",VISA,CLASSIC,DEBIT,1101 concord ave,CA,...,,8053544851,,,,,,,,
2088253,5170239219091474,03/2023,480,,BOKIS,MASTERCARD,GOLD,DEBIT,,NO,...,,,,,,,,,,
2088254,5275190073194314,03/2023,974,Gari Motino,"BANK OF AMERICA, NATIONAL ASSOCIATION",MASTERCARD,ENHANCED,DEBIT,,GA,...,garysoriano7@hotmail.com,,,,,,,,,


In [375]:
# import re
#
# text = '\x14∏^ARÌèï\x15ÒVƒÆË:‚åöt?%W∑≤™Öøî8ºü\x0e(ÀyP§ÈÁ\x0b√π≠êãáÚn¶\\6[p\x0cªÕÅõ\x13“5≥¥ìùT\x16}G°ÏCxΩk ÑûD®€\uf8ff÷K<ÜÔﬁó\x1aO´Ã-ä\x02à⁄›fo’\x1e£a,E$|LZßvMwh\x11±7œ≈‰d…\xa0J\x01]â)FòmÎ»H`bçS„BQ\x1bs–©Ug\'•¢∞!Är\x03∂\x08+cñí~∫\x060éØqX\x04\x18†3&.ÍYÂ=Ç∆#u\x07ô\ti‹·\x19ﬂŒ{z◊>ú¨É”N14—2\nlµj"\x1fŸ‡æ9ë_*e/ÿÊI@‘¬'
#
# valores = '[^\w\s\x01-\x20\uf8ff\\/\()\,\.]+'
#
# coincidencia = re.findall(valores, text)
# coincidencia = ''.join(coincidencia)
# coincidencia

'∏^:‚?%∑≤™§√≠¶\\[“≥¥}°®€÷<´-⁄›’£$|±≈‰…]»`„–©\'•¢∞!∂+~∫†&=∆#‹·{◊>¨”—"‡*@‘¬'