# Preprocesamiento de datos

### Intento encontrar la codificación del archivo

In [None]:
import chardet

# Optional cap on how many bytes are sampled for detection:
# num_bytes = 10000
with open('biden_cash.txt', 'rb') as f:
    # The file is very large, so the read can be limited to a sample instead:
    # rawdata = f.read(num_bytes)
    rawdata = f.read()

# Detection only needs the bytes already read, so it runs once the file is closed.
result = chardet.detect(rawdata)
encoding = result['encoding']

print(f'La codificación de caracteres detectada es: {encoding}')

### Convirtiendo la codificación MacRoman a UTF-8

In [None]:
# NOTE: re-encoding the whole file this way caused more problems than it solved;
# not recommended.
# Step 1: load the complete file, decoding it as MacRoman.
with open('biden_cash.txt', mode='r', encoding='macroman') as src:
    text = src.read()

# Step 2: write the decoded text back out as UTF-8 under a new name.
with open('biden_cash_enc.txt', mode='w', encoding='utf-8') as dst:
    dst.write(text)

In [None]:
# Decoding attempt: try a list of candidate codecs on the raw bytes and print
# either the decoded text or a failure notice for each one.
encodings = [
    'ascii', 'big5', 'big5hkscs',
    'cp037', 'cp424', 'cp437', 'cp500', 'cp720', 'cp737', 'cp775',
    'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
    'iso8859_1', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5',
    'iso8859_6', 'iso8859_7', 'iso8859_8', 'iso8859_9', 'iso8859_10',
    'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16',
]

with open('texto.txt', 'rb') as f:
    text = f.read()

for num, encoding in enumerate(encodings):
    try:
        decoded_text = text.decode(encoding)
    except UnicodeDecodeError:
        # This codec cannot represent the byte sequence; report and move on.
        print(f"{num} Fallo la codification con {encoding}")
    else:
        print(f"{num} Decodificando con {encoding}: {decoded_text}")

### Ahora sí, a preprocesar

In [None]:
import csv

# Raise the csv module's per-field size cap to 10,000,000 characters.
csv.field_size_limit(10000000)

# Files handed to csv.reader / csv.writer must be opened with newline=''.
# Without it, newlines embedded in quoted fields are not read back faithfully,
# and on platforms that write '\r\n' line endings the writer would emit an
# extra '\r' on every row.
input_file = open('biden_cash.txt', 'r', encoding='macroman', newline='')
# The re-encoding to UTF-8 is done here, while the cleaned copy is written.
output_file = open('biden_cash_clean.txt', 'w', encoding='utf-8', newline='')

In [None]:
num_cols = 25  # Expected number of columns in every row

# Use both already-open handles as context managers so they are closed (and the
# output flushed) even if a row raises part-way through the loop. Previously an
# exception would skip the close() calls and leave both files open.
with input_file, output_file:
    reader = csv.reader(input_file, delimiter='|')
    writer = csv.writer(output_file, delimiter='|')

    for row in reader:
        if len(row) < num_cols:
            # Pad short rows with empty fields up to the expected width.
            row.extend([''] * (num_cols - len(row)))
        elif len(row) > num_cols:
            # Drop any fields beyond the expected width.
            row = row[:num_cols]
        writer.writerow(row)

## Procesando los datos

In [None]:
import pandas as pd

# Load the cleaned pipe-delimited file without a header row, skipping lines
# pandas cannot parse. The listed column positions are read as strings; the
# remaining columns keep pandas' inferred dtypes.
df = pd.read_csv(
    'biden_cash_clean.txt',
    sep='|',
    header=None,
    on_bad_lines='skip',
    dtype=dict.fromkeys([0, 2, 3, 17, 18, 19, 20, 21, 22, 23], str),
)

#### Buscando caracteres raros

In [None]:
# Collect every distinct character that occurs in the cleaned file.
# with open('sopa.txt', 'r', encoding='utf-8') as file:
with open('biden_cash_clean.txt', 'r', encoding='utf-8') as file:
    content = file.read()

unique_chars = set(content)
# A set of str iterates in an order that changes between interpreter sessions
# (string hash randomization), so sort the characters to make the displayed
# value reproducible across kernel restarts.
result = ''.join(sorted(unique_chars))

In [None]:
# Display the string of distinct characters collected from the cleaned file.
result

In [None]:
import re

# Earlier filtering attempts, kept for reference:
# result = re.sub(r'[a-zA-Z0-9\u0080-\u00FF]', '', result)
# result = re.sub(r'[a-zA-Z0-9]', '', result)

# Negated character class: matches any single character that is NOT a word
# character, whitespace, a control character in \x01-\x20, U+F8FF, a backslash,
# a slash, a parenthesis, a comma, a period, a square bracket or a hyphen.
valores = r'[^\w\s\x01-\x20\uf8ff\\/\()\,\.\[\]\-]'

# Keep only the characters of `result` that the pattern matches.
result = ''.join(re.findall(valores, result))

In [None]:
# Display the characters that remain after filtering with `valores`.
result

In [None]:
# result = result.replace(' ', '')
# resultado = """ˆïš?º¿§¶ªÅ“¥°Ï¹€®ž«Ž´Ã¯’£±œ‰…\Î»`„³¦©¢²¼+¡¸¤†Â˜‹·Œ>”¨Šµ"Ÿ‡‘¬"""
# resultado



In [None]:
# Build a character class that matches any one of the characters in `result`.
# re.escape keeps characters such as ']', '^', '-' and '\' literal inside the
# brackets. An empty `result` would yield '[]', which is not a valid pattern,
# so fall back to '(?!)' (an empty negative lookahead, which never matches).
regex = ('[' + re.escape(result) + ']') if result else r'(?!)'

In [None]:
# Garbled character sequence -> intended character. Entries such as '√©' -> 'é'
# appear to be UTF-8 bytes that were decoded as MacRoman.
# NOTE(review): the keys starting with '©' and the key 'Ё®' do not follow the
# '√…' shape of the other accented-letter entries; they may have been collected
# from data already altered by the previous key-only loop. Verify them against
# the raw column.
reemplazo = {'√©': 'é', '√°': 'á', '√¥': 'ô', '¬†': ' ', '√¢': 'â', '√£': 'ã', '©ú': 'Ü', '©ñ': 'Ö', '©≠': 'í', 'Ё®': 'ó'}
columnas = [3]

for columna in columnas:
    # Iterate over (garbled, fixed) pairs. Looping over the dict itself yields
    # only the keys, and unpacking a 2-character key into `c, v` replaced the
    # key's first character with its second instead of applying the mapping.
    for c, v in reemplazo.items():
        # regex=False: the keys are literal text, and the default for `regex`
        # differs between pandas versions.
        df[columna] = df[columna].str.replace(c, v, regex=False)

In [None]:
# Select the rows whose column 3 contains at least one character matched by
# `regex`; missing values are treated as non-matching.
# Alternative filters tried earlier:
# matches = df[df[3].str.contains('Äò', na=False)]
# matches = df[df[3].str.len() > 110]
# matches = df[~df[3].str.contains('[a-zA-Z0-9]', na=True)]
matches = df.loc[df[3].str.contains(regex, na=False)]

In [None]:
# Display the rows whose column 3 matched `regex`.
matches

In [None]:
# Display the full DataFrame for comparison with `matches`.
df

In [None]:
# Anti-join: outer-merge `df` with `matches` on all shared columns, then keep
# only the rows that came from `df` alone, i.e. the rows not in `matches`.
merged = df.merge(matches, how='outer', indicator=True)
only_in_df = merged['_merge'] == 'left_only'
diff = merged[only_in_df].drop(columns='_merge')

diff

In [None]:
# import re
#
# text = '\x14∏^ARÌèï\x15ÒVƒÆË:‚åöt?%W∑≤™Öøî8ºü\x0e(ÀyP§ÈÁ\x0b√π≠êãáÚn¶\\6[p\x0cªÕÅõ\x13“5≥¥ìùT\x16}G°ÏCxΩk ÑûD®€\uf8ff÷K<ÜÔﬁó\x1aO´Ã-ä\x02à⁄›fo’\x1e£a,E$|LZßvMwh\x11±7œ≈‰d…\xa0J\x01]â)FòmÎ»H`bçS„BQ\x1bs–©Ug\'•¢∞!Är\x03∂\x08+cñí~∫\x060éØqX\x04\x18†3&.ÍYÂ=Ç∆#u\x07ô\ti‹·\x19ﬂŒ{z◊>ú¨É”N14—2\nlµj"\x1fŸ‡æ9ë_*e/ÿÊI@‘¬'
#
# valores = '[^\w\s\x01-\x20\uf8ff\\/\()\,\.]+'
#
# coincidencia = re.findall(valores, text)
# coincidencia = ''.join(coincidencia)
# coincidencia