# Explorando diferentes modelos de datos
<small>Adaptado de [Coursera](https://www.coursera.org/specializations/bigdata)</small>

In [None]:
!wget https://github.com/words-sdsc/coursera/raw/master/big-data-2.zip
!unzip big-data-2

In [None]:
%cd ./big-data-2/

## Modelo de datos relacional

In [None]:
import pandas as pd

# Read the census table (path relative to the current working directory), report
# its (rows, columns) shape and preview the first rows.
df = pd.read_csv('csv/census.csv')  # a comma is already read_csv's default separator
print(f'Tamaño de la tabla: {df.shape}')
df.head()

In [None]:
# List every column name of the table currently bound to `df`.
print(f'Nombre de las columnas: {df.columns.to_numpy()}')

In [None]:
# 2010 census population of the row(s) whose CTYNAME is 'Alabama'.
# NOTE(review): a CTYNAME equal to a state name presumably marks the state-wide
# summary row rather than a city — confirm against the data.
df.loc[df['CTYNAME'].eq('Alabama'), 'CENSUS2010POP']

In [None]:
# Mean 2010 census population over the rows belonging to the state of Alabama.
# Rows whose CTYNAME equals STNAME are left out: such a row presumably carries the
# state-wide total (TODO confirm against the data), and averaging it together with
# the per-area rows would inflate the result.
df.loc[
    (df['STNAME'] == 'Alabama') & (df['CTYNAME'] != df['STNAME']),
    'CENSUS2010POP',
].mean()

In [None]:
# CTYNAME of every row whose 2010 census population is below 10,000.
df.loc[df['CENSUS2010POP'].lt(10000), 'CTYNAME']

In [None]:
# Rows of the state of California with more than 1,000,000 inhabitants in the
# 2010 census, showing name and population.
# The row whose CTYNAME equals STNAME is excluded: it presumably holds the
# state-wide total (TODO confirm against the data) and would otherwise be listed
# next to the individual areas.
df.loc[
    (df['STNAME'] == 'California')
    & (df['CTYNAME'] != df['STNAME'])
    & (df['CENSUS2010POP'] > 1000000),
    ['CTYNAME', 'CENSUS2010POP'],
]

## Modelo de datos semiestructurado - JSON

In [None]:
# Show the first line of json/twitter.json as raw text, before any parsing.
!head -n 1 json/twitter.json

In [None]:
%%script python json/json_schema.py json/twitter.json
--

In [None]:
%%script python json/print_json.py json/twitter.json 99 entities/hashtags
--

In [None]:
%%script python json/print_json.py json/twitter.json 50 user/location
--

## Modelo de datos 'Array' - Imágenes

In [None]:
from PIL import Image

# Open the sample image and render it inline.
# Image.show() hands the picture to an external viewer program, so nothing appears
# in the notebook itself (and nothing at all on a headless or remote kernel).
# Leaving the Image object as the cell's last expression lets Jupyter draw it in
# the output area instead.
im = Image.open('image/Australia.jpg')
im

In [None]:
%%script python image/dimensions.py image/Australia.jpg
--

In [None]:
%%script python image/pixel.py image/Australia.jpg 100 100
--

## Modelo de datos 'Array' - Texto

Vectorización de textos usando _Term Frequency - Inverse Document Frequency (TF-IDF)_. Es una técnica que crea representaciones numéricas de documentos de texto a partir de la frecuencia de cada término dentro del documento y de la cantidad de documentos de la colección que contienen ese término. De esta forma se pueden realizar búsquedas o comparaciones entre textos con base en su contenido.

**Ejemplo [freeCodeCamp](https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/)**

Oración 1: The car is driven on the road.

Oración 2: The truck is driven on the highway.

<img src="tfidf.png" width=700 height=300 />

## Datos de sensores

In [None]:
# Show the first 10 lines of the raw sensor log.
!head -n 10 sensor/wx-data.txt

In [None]:
# Print the accompanying format file in full.
# NOTE(review): presumably it documents the field codes used in wx-data.txt — confirm.
!cat sensor/wxt-520-format.txt

In [None]:
import sys
import re
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from pytz import timezone

# Plot the 'Ta' readings found in the sensor log against the time they were taken.
#
# Each useful line is expected to look like
#     <timestamp><whitespace><comma-separated fields>
# where one field may be 'Ta=<number>...'.
# NOTE(review): the timestamp is assumed to be Unix epoch seconds — confirm
# against the sensor documentation.

pacific = timezone('US/Pacific')

# Raw string avoids invalid-escape warnings. The sign and the fractional part are
# optional so that negative or whole-number readings are not silently dropped.
ta_pattern = re.compile(r'Ta=(-?\d+(?:\.\d+)?)')

x = []  # timestamps (as floats) of the lines that carry a Ta reading
y = []  # the matching Ta values

# `with` closes the file even if a line fails to parse.
with open('sensor/wx-data.txt', 'r') as log:
    for line in log:
        parts = line.split()
        if len(parts) < 2:
            continue  # blank or timestamp-only line: nothing to extract

        for field in parts[1].split(','):
            match = ta_pattern.match(field)
            if match:
                x.append(float(parts[0]))
                y.append(float(match.group(1)))

# Convert the timestamps to timezone-aware datetimes and let matplotlib place them
# on the date axis. date2num() expects datetime-like input, so feeding it the raw
# floats and rescaling by 24*60*60 did not yield valid matplotlib date numbers.
times = [datetime.fromtimestamp(ts, tz=pacific) for ts in x]

fig, ax = plt.subplots()
ax.plot(times, y, 'o')  # 'o' markers, as plot_date drew by default

ax.set_xlabel('time')
ax.set_ylabel('Ta')

date_formatter = mdate.DateFormatter('%H:%M.%S', tz=pacific)
ax.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()

plt.show()

## Modelo de datos 'Grafos'

In [None]:
import pandas as pd

# Read the disease graph table, report its (rows, columns) shape and preview it.
# NOTE: this rebinds `df`, which held the census table earlier in the notebook.
df = pd.read_csv('graph/diseaseGraph.csv')  # a comma is already the default separator
print(f'Tamaño de la tabla: {df.shape}')
df.head()

In [None]:
# Keep the graph small enough to draw: seed it with the Source values of the first
# 15 rows, then collect every row that touches one of those nodes.
nodes = set(df['Source'].iloc[:15])
touches_seed = df['Source'].isin(nodes) | df['Target'].isin(nodes)

# Pass on only the two endpoint columns. add_edges_from() accepts (u, v) pairs or
# (u, v, attribute-dict) triples, so any further CSV column in the rows would make
# it fail.
edges = df.loc[touches_seed, ['Source', 'Target']].to_numpy()

In [None]:
import networkx as nx
# Build an undirected graph from the `nodes` and `edges` selected in the previous cell.
G = nx.Graph()
# Add the seed nodes first, then the edges; add_edges_from() also creates any
# endpoint that is not in the graph yet.
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [None]:
import matplotlib.pyplot as plt

# Draw the graph with node labels.
# nx.draw() falls back to a randomly initialised spring layout when no positions
# are given, so the picture differed on every run. Computing the layout with a
# fixed seed makes the figure reproducible.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, with_labels=True)
plt.show()