In [76]:
import dvc.api
import bson
import pandas as pd
import hvplot.pandas
import holoviews as hv
from collections import Counter

In [9]:
# Columns of the claims frame that hold the text of each sample, one per
# language suffix (es / en / fr).
TEXT_COLUMNS = [
    "text_es",
    "text_en",
    "text_fr",
]

In [2]:
# Read the raw claims dump through DVC (revision "data/v1" of the remote repo)
# and decode every BSON document it contains into a list of records.
with dvc.api.open(
    "data/raw/claims.bson",
    repo="https://github.com/Gonzalo933/newtral-interview",
    rev="data/v1",
    mode="rb",
) as claims_file:
    data = bson.decode_all(claims_file.read())

# The file handle is no longer needed once the bytes are decoded; build the
# frame from the decoded records.
df = pd.DataFrame(data)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,_id,text_es,text_en,text_fr,claim
0,5f80940cf95f926ca81a3751,Gracias.,Thank you.,Merci.,0
1,5f80940cf95f926ca81a3752,"Por ejemplo, cuando estamos hablando de un pa...","For example, when we are talking about a coun...","Par exemple, quand on parle d’un pays qui doi...",0
2,5f80940cf95f926ca81a3753,Entonces como solo creo que es como la políti...,So as I just think it's like the politics of ...,"Donc, comme je pense que c’est comme la polit...",0
3,5f80940cf95f926ca81a3754,Y pienso que el Partido Popular no ha estado ...,And I think the People's Party has not risen ...,Et je pense que le Parti populaire n’a pas ét...,0
4,5f80940cf95f926ca81a3755,Lo siguiente Nos vamos ya volando y les dejo ...,The next thing We fly and I leave you with th...,La prochaine chose que nous volons et je vous...,0


In [4]:
# Class balance: number of rows for each value of the `claim` label.
df["claim"].value_counts()

0    13288
1     1065
Name: claim, dtype: int64

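Se trata de un problema desbalanceado (solo 1065 de 14353 muestras, aproximadamente el 7,4 %, pertenecen a la clase positiva), por lo que habrá que tenerlo en cuenta a la hora de seleccionar métricas: un clasificador que prediga siempre la clase mayoritaria obtendría cerca de un 92,6 % de exactitud sin detectar ninguna afirmación.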

# Tamaño de los textos


In [28]:
# Mean number of characters per text, computed separately for each language
# column (the lambda receives one whole column at a time).
(
    df[TEXT_COLUMNS]
    .apply(lambda column: column.str.len())
    .mean(axis=0)
)

text_es    106.161778
text_en    100.598969
text_fr    110.569567
dtype: float64

In [34]:
# Mean number of whitespace-separated words per text, for each language column.
# `.str.split()` turns every text into a list of tokens; `.str.len()` then
# counts the list elements. Unlike mapping `len` by hand, it yields NaN for a
# missing text instead of raising TypeError, and `mean` skips NaN by default.
(
    df[TEXT_COLUMNS]
    .apply(lambda column: column.str.split().str.len())
    .mean(axis=0)
)

text_es    18.164217
text_en    17.868390
text_fr    18.422978
dtype: float64

In [56]:
# Text length distribution: one histogram of character counts per language
# column, laid out in a single vertical column.
plots = [
    df[col].str.len().hvplot.hist(title=f"text length dist. for {col}")
    for col in TEXT_COLUMNS
]
hv.Layout(plots).cols(1)

In [55]:
# Word count distribution: one histogram of whitespace-separated token counts
# per language column, laid out in a single vertical column.
# `.str.len()` counts the elements of each token list and propagates NaN for a
# missing text, where mapping `len` by hand would raise TypeError.
plots = [
    df[col].str.split().str.len().hvplot.hist(title=f"word count dist. for {col}")
    for col in TEXT_COLUMNS
]
hv.Layout(plots).cols(1)

# Las 50 palabras más usadas en cada idioma

In [95]:
# Bar chart of the 50 most frequent lower-cased, whitespace-separated tokens
# for each language column, laid out in a single vertical column.
plots = []
for col in TEXT_COLUMNS:
    top_words = (
        df[col]
        .str.lower()
        .str.split()
        # One row per token. This avoids the dense (rows x longest-text) frame
        # that `split(expand=True).stack()` has to build just to flatten again.
        .explode()
        # NaN entries (missing or empty texts) are dropped by value_counts.
        .value_counts()
        .head(50)
    )
    plots.append(
        # The title tells the three otherwise identical-looking charts apart.
        top_words.hvplot.bar(title=f"top 50 words for {col}").opts(
            xrotation=50, axiswise=True
        )
    )
hv.Layout(plots).cols(1)

In [66]:
# Show one Spanish text labelled as a claim (claim == 1).
# `random_state` pins the draw so re-running the notebook from a fresh kernel
# displays the same example; change the seed to inspect a different one.
df.loc[df["claim"] == 1, "text_es"].sample(random_state=42).values

array([' Es ese gran PP de los 10 millones de votos del año del año 2011.'],
      dtype=object)