# Análisis de WhatsApps

## Preparación de ambiente

### Carga de módulos

In [2]:
# Data Wrangling
import numpy as np
import pandas as pd

# Data Visualization
import cufflinks as cf

# Text Mining
import re
import unicodedata
from nltk.corpus import stopwords

# Environment setup: switch cufflinks to offline mode
cf.go_offline()

### Conexión con Google Drive

In [3]:
# Colab-only step: mount Google Drive under /content/drive so files stored
# there can be opened with regular file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Funciones relevantes

In [4]:
def clean_text(text, pattern="[^a-zA-Z0-9 ]"):
    """Normalize a string for text analysis.

    Accented letters are replaced by their unaccented equivalent, every
    character matched by ``pattern`` is replaced by a space, the result is
    lowercased and runs of whitespace are collapsed into single spaces.

    Parameters
    ----------
    text : str
        Text to clean.
    pattern : str, optional
        Regular expression matching the characters to REMOVE (each match is
        replaced by a space). It is applied before lowercasing. By default
        ``[^a-zA-Z0-9 ]``, i.e. anything that is not an ASCII letter, a digit
        or a space.

    Returns
    -------
    cleaned_text : str
        Lowercase, ASCII-only text with single spaces between words and no
        leading or trailing whitespace.

    Example
    -------
    >>> clean_text('¡Feliz año nuevo, México!')
    'feliz ano nuevo mexico'
    """
    # NFD splits each accented letter into base letter + combining mark; the
    # ASCII encode with 'ignore' then drops the marks (and any other
    # non-ASCII character, such as emoji).
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    without_specials = re.sub(pattern, " ", ascii_bytes.decode("utf-8"), flags=re.UNICODE)
    # split() with no arguments discards empty pieces, which collapses
    # repeated spaces and trims both ends.
    words = without_specials.lower().split()
    return u' '.join(words)

In [5]:
# Quick check of clean_text on a sample sentence with accents and punctuation.
clean_text('¡Feliz año nuevo, México!')

'feliz ano nuevo mexico'

## Carga de datos

In [6]:
# Read the exported chat as a list of raw lines (readlines keeps the trailing
# "\n" of each line).
# The encoding is passed explicitly: the export contains accented characters
# and emoji, and without it open() decodes with the platform's locale
# encoding, which is not guaranteed to be UTF-8.
with open("/content/drive/MyDrive/_chat.txt", encoding="utf-8") as file:
    text = file.readlines()

In [7]:
# Peek at the first 10 raw lines of the export.
text[:10]

['[23/06/17 4:55:01 p.m.] Davo Acosta: Ooooo\n',
 '[23/06/17 4:55:11 p.m.] Davo Acosta: Entonces mañana llega\n',
 '[23/06/17 5:01:32 p.m.] Oscar Acosta: ¿Crees?\n',
 '[23/06/17 5:02:19 p.m.] Oscar Acosta: Yo digo que hasta el lunes\n',
 '[23/06/17 5:02:25 p.m.] Oscar Acosta: Porque se hacen bien weyes\n',
 '[23/06/17 5:02:33 p.m.] Oscar Acosta: Y creo no trabajan en fines de semana\n',
 '[23/06/17 5:02:49 p.m.] Davo Acosta: A entonces hasta el viernes\n',
 '[23/06/17 5:02:58 p.m.] Oscar Acosta: 🤦🏻\u200d♂\n',
 '[23/06/17 5:03:00 p.m.] Oscar Acosta: Ash\n',
 '[23/06/17 5:03:03 p.m.] Oscar Acosta: Me chocas\n']

In [8]:
# One row per raw line of the export; the single column is labelled 0.
df = pd.DataFrame(data=text)

In [9]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,0
0,[23/06/17 4:55:01 p.m.] Davo Acosta: Ooooo\n
1,[23/06/17 4:55:11 p.m.] Davo Acosta: Entonces ...
2,[23/06/17 5:01:32 p.m.] Oscar Acosta: ¿Crees?\n
3,[23/06/17 5:02:19 p.m.] Oscar Acosta: Yo digo ...
4,[23/06/17 5:02:25 p.m.] Oscar Acosta: Porque s...


In [20]:
# Date: the text between the first "[" and the first space that follows it.
# Lines without a "[" have no second piece and end up as NaN.
df["fecha"] = (
    df[0]
    .str.split("[").str.get(1)
    .str.split(" ").str.get(0)
)

In [21]:
# Inspect the extracted dates.
df["fecha"]

0        23/06/17
1        23/06/17
2        23/06/17
3        23/06/17
4        23/06/17
           ...   
15517    27/09/22
15518         NaN
15519    27/09/22
15520    27/09/22
15521    27/09/22
Name: fecha, Length: 15522, dtype: object

In [36]:
# Time: the 2nd and 3rd space-separated tokens of the line (clock time and
# a.m./p.m. marker), re-joined and cut at the closing "]".
df["hora"] = (
    df[0]
    .str.split(" ")
    .str.slice(1, 3)
    .str.join(" ")
    .str.split("]")
    .str.get(0)
)

In [37]:
# Inspect the extracted times.
df["hora"]

0        4:55:01 p.m.
1        4:55:11 p.m.
2        5:01:32 p.m.
3        5:02:19 p.m.
4        5:02:25 p.m.
             ...     
15517    9:16:40 a.m.
15518                
15519    9:16:55 a.m.
15520    9:17:03 a.m.
15521    9:33:02 a.m.
Name: hora, Length: 15522, dtype: object

In [44]:
# Author: the text between the first "] " (end of the timestamp) and the
# first ": " that follows it.
df["autor"] = (
    df[0]
    .str.split("] ").str.get(1)
    .str.split(": ").str.get(0)
)

In [45]:
# Inspect the extracted authors.
df["autor"]

0         Davo Acosta
1         Davo Acosta
2        Oscar Acosta
3        Oscar Acosta
4        Oscar Acosta
             ...     
15517    Oscar Acosta
15518             NaN
15519     Davo Acosta
15520     Davo Acosta
15521    Oscar Acosta
Name: autor, Length: 15522, dtype: object

In [48]:
# Message body: everything after the first ": " in the line (the separator
# that follows the author name).
# n=1 limits the split to that first separator. Without it the line is split
# at every ": ", and since only piece [1] is kept, any message that itself
# contains ": " would be cut short at that point.
df["mensaje"] = df[0].str.split(": ", n=1).str[1]

In [49]:
# Inspect the extracted message bodies.
df["mensaje"]

0                                Ooooo\n
1                Entonces mañana llega\n
2                              ¿Crees?\n
3           Yo digo que hasta el lunes\n
4           Porque se hacen bien weyes\n
                      ...               
15517    Creo que es lo que me decías:\n
15518                                NaN
15519                            Simón\n
15520                 ‎sticker omitted\n
15521                 ‎sticker omitted\n
Name: mensaje, Length: 15522, dtype: object

In [50]:
# Keep only the rows that have a non-empty message. Rows where "mensaje" is
# NaN have a NaN length, which compares as False, so they are dropped too.
df = df.loc[df["mensaje"].str.len().gt(0)]

In [56]:
# Show 5 random rows as a spot check of the parsed columns. The seed is fixed
# (same value used for the exported sample) so the displayed rows are the
# same on every run.
df.sample(5, random_state=123)

Unnamed: 0,fecha,hora,autor,mensaje
7433,20/04/19,9:39:02 p.m.,Oscar Acosta,‎sticker omitted\n
12204,04/02/20,1:43:36 p.m.,Oscar Acosta,Ya nada\n
14073,09/04/21,2:27:07 p.m.,Oscar Acosta,Pues es medio tiempo\n
588,08/08/17,4:33:01 p.m.,Oscar Acosta,¿Mucha?\n
3381,14/05/18,5:02:37 p.m.,Davo Acosta,Estaba cerrado\n


In [52]:
# Remove the raw-line column (label 0) now that its parts have been extracted.
df = df.drop(labels=0, axis="columns")

In [53]:
# Reproducible random sample of 300 messages (fixed seed).
muestra = df.sample(n=300, random_state=123)

In [54]:
# Write the sample as a semicolon-separated file, without the index column.
muestra.to_csv(path_or_buf="muestra_wa.csv", sep=";", index=False)

In [55]:
# Write the full parsed chat as a tab-separated file, without the index column.
df.to_csv(path_or_buf="whats.csv", sep="\t", index=False)