In [1]:
# -*- coding: utf-8 -*-
import sys
import os
import unicodedata
import string
import urllib2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Agrego al path la carpeta modulos
sys.path.insert(0, os.path.abspath("../../modulos"))
from LexAnalyser import LexAnalyser

In [2]:
# Download the Project Gutenberg plain-text edition of Don Quijote.
url = 'http://www.gutenberg.org/cache/epub/2000/pg2000.txt'
response = urllib2.urlopen(url)
try:
    # Whole body as a byte string; it is decoded as UTF-8 later, right
    # before the lexical analysis.
    raw = response.read()
finally:
    # urllib2 responses are not context managers in Python 2, so close the
    # handle explicitly; this releases the connection even if read() fails.
    response.close()

In [3]:
# Keep only the novel: from its title up to (not including) the Project
# Gutenberg closing notice.
start = raw.find('El ingenioso hidalgo don Quijote de la Mancha')
end = raw.rfind('End of Project Gutenberg')
if start == -1:
    # find() returns -1 when the marker is missing; using it as a slice
    # bound would silently keep only the tail of the text.
    raise ValueError('Start marker not found in the downloaded text')
if end == -1:
    if start != 0:
        raise ValueError('End marker not found in the downloaded text')
    # Title at offset 0 and no closing notice: most likely this cell already
    # ran and `raw` is already trimmed. Keep it whole; otherwise the -1 from
    # rfind() would chop one more trailing character on every re-run.
    end = len(raw)
raw = raw[start:end]
print(len(raw))

2178990


In [5]:
# Term -> number of occurrences over the whole text.
vocabulario = {}
# Empty configuration dict: per the original author's note this means no
# stemming, no stopwords, no regex and no term-length filtering
# (LexAnalyser is defined outside this notebook).
la = LexAnalyser({})
terms = la.analyse(raw.decode("UTF-8"))["terms"]
for t in terms:
    # A term seen for the first time starts from 0.
    vocabulario[t] = vocabulario.get(t, 0) + 1
# Peek at ten arbitrary entries as a sanity check.
print([{key: val} for key, val in vocabulario.items()[:10]])

[{u'consuelome': 1}, {u'canes': 3}, {u'igual': 16}, {u'indignacion': 2}, {u'hermana': 11}, {u'hermano': 106}, {u'gastara': 1}, {u'acababamos': 1}, {u'dictado': 1}, {u'compuesta': 11}]


In [6]:
# Frequency / rank table used to check Zipf's law (frequency * rank).
# The frequency column is built in a single vectorised step; filling it one
# cell at a time with .loc in a Python loop is needlessly slow for a
# vocabulary with tens of thousands of terms.
# keys() and values() of an unmodified dict are returned in matching order,
# so every term stays paired with its own count.
df = pd.DataFrame({'Frecuencia': list(vocabulario.values())},
                  index = list(vocabulario.keys()),
                  dtype = 'float64')
# Rank 1 is the most frequent term; tied terms share the lowest rank of
# their group (method='min').
df['Ranking'] = df['Frecuencia'].rank(method = 'min', ascending = False)
df['C=F*R'] = df['Frecuencia'] * df['Ranking']

df.sort_values(['Frecuencia'], ascending=False).head()


Unnamed: 0,Frecuencia,Ranking,C=F*R
que,21475.0,1.0,21475.0
de,18298.0,2.0,36596.0
y,18188.0,3.0,54564.0
la,10363.0,4.0,41452.0
a,9823.0,5.0,49115.0


In [7]:
# Summary statistics of the frequency * rank product column.
df.loc[:, 'C=F*R'].describe()

count    22478.000000
mean     17478.203043
std       7581.511869
min      11579.000000
25%      11579.000000
50%      16264.000000
75%      21080.000000
max      57687.000000
Name: C=F*R, dtype: float64

In [8]:
def zipfLoglog(data):
    """Plot log(frequency) against log(rank) together with a fitted line.

    Parameters
    ----------
    data : dict
        data['df']    -- DataFrame with 'Ranking' and 'Frecuencia' columns.
        data['title'] -- title drawn above the plot.

    Returns
    -------
    The matplotlib Axes holding the plot; callers reach the figure through
    ax.get_figure() to show or save it.
    """
    df = data['df']
    title = data['title']

    # Natural logarithms of both columns.
    # NOTE(review): assumes every rank and frequency is > 0 -- confirm.
    x = np.log(df['Ranking'])
    y = np.log(df['Frecuencia'])

    # Degree-1 least-squares fit: fit[0] is the slope, fit[1] the intercept.
    fit = np.polyfit(x, y, deg = 1)
    fitted = fit[0] * x + fit[1]

    # Create the figure and its axes together so that figsize and the colours
    # really apply to the figure being drawn. A bare plt.Figure(...) is not
    # registered with pyplot, so a following plt.subplot(111) would draw on a
    # different, default-sized figure. Lowercase 'w' is the supported
    # spelling of the single-letter colour.
    fig, ax = plt.subplots(figsize = (4,4), facecolor = 'w', edgecolor = 'w')

    ax.set_title(title)
    ax.set_xlabel('Log(Ranking)')
    ax.set_ylabel('Log(Frecuencia)')

    ax.plot(x, y, 'bo', label="Real")
    ax.plot(x,fitted,'r', label="Estimado")
    ax.legend(loc='upper center', shadow=True)

    return ax

In [9]:
def zipfLineal(data):
    """Plot frequency against rank on linear axes.

    Parameters
    ----------
    data : dict
        data['df']    -- DataFrame with 'Ranking' and 'Frecuencia' columns.
        data['title'] -- title drawn above the plot.

    Returns
    -------
    The matplotlib Axes holding the plot; callers reach the figure through
    ax.get_figure() to show or save it.
    """
    df = data['df']
    title = data['title']

    x = df['Ranking']
    y = df['Frecuencia']

    # Create the figure and its axes together so that figsize and the colours
    # really apply to the figure being drawn. A bare plt.Figure(...) is not
    # registered with pyplot, so a following plt.subplot(111) would draw on a
    # different, default-sized figure. Lowercase 'w' is the supported
    # spelling of the single-letter colour.
    fig, ax = plt.subplots(figsize = (4,4), facecolor = 'w', edgecolor = 'w')

    ax.set_title(title)
    ax.set_xlabel('Ranking')
    ax.set_ylabel('Frecuencia')

    ax.plot(x, y, 'bo')

    return ax

In [10]:
# Linear-scale Zipf plot over the full vocabulary (no pruning): show it and
# save it as a PNG.
data = {
    'df': df,
    'title': 'Ley de Zipf - Lineal\nEl ingenioso hidalgo don Quijote de la Mancha',
}
ax = zipfLineal(data)
plt.show()
ax.figure.savefig('zipf_lineal.png', bbox_inches='tight')

In [11]:
# Log-log Zipf plot over the full vocabulary (no pruning): show it and save
# it as a PNG.
data.update({
    'df': df,
    'title': 'Ley de Zipf - Loglog \nEl ingenioso hidalgo don Quijote de la Mancha',
})
ax = zipfLoglog(data)
plt.show()
ax.figure.savefig('zipf_loglog.png', bbox_inches='tight')

In [12]:
# Pruning support: add cumulative and relative frequency columns.
n = df['Frecuencia'].sum()
# Running total taken from the most frequent term downwards; the assignment
# aligns the result back to df by index label.
df['Frec. Acum.'] = df['Frecuencia'].sort_values(ascending=False).cumsum()
df['Frec. Rel.'] = df['Frecuencia'] / n
df['Frec. Rel. Acum.'] = df['Frec. Acum.'] / n

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a run from a fresh checkout fails here.
if not os.path.isdir('extras'):
    os.makedirs('extras')

# Sort once and reuse the result for both the CSV export and the preview.
df_ordenado = df.sort_values(['Frecuencia'], ascending=False)
df_ordenado.to_csv('extras/quijote2.csv')
df_ordenado.head()

Unnamed: 0,Frecuencia,Ranking,C=F*R,Frec. Acum.,Frec. Rel.,Frec. Rel. Acum.
que,21475.0,1.0,21475.0,21475.0,0.056333,0.056333
de,18298.0,2.0,36596.0,39773.0,0.047999,0.104332
y,18188.0,3.0,54564.0,57961.0,0.047711,0.152043
la,10363.0,4.0,41452.0,68324.0,0.027184,0.179227
a,9823.0,5.0,49115.0,78147.0,0.025768,0.204995


In [46]:
# 5% pruning, linear scale: keep the terms whose cumulative relative
# frequency lies within [0.025, 0.975] (both bounds included).
data.update({
    'df': df[df['Frec. Rel. Acum.'].between(0.025, 0.975)],
    'title': 'Ley de Zipf - Lineal - Poda del 5%\nEl ingenioso hidalgo don Quijote de la Mancha',
})
ax = zipfLineal(data)
plt.show()
ax.figure.savefig('zipf_lineal_poda_05.png', bbox_inches='tight')

In [47]:
# 5% pruning, log-log scale. Only the title is replaced: data['df'] is used
# as the preceding 5% linear cell left it.
data.update(title='Ley de Zipf - Loglog - Poda del 5%\nEl ingenioso hidalgo don Quijote de la Mancha')
ax = zipfLoglog(data)
plt.show()
ax.figure.savefig('zipf_loglog_poda_05.png', bbox_inches='tight')


In [48]:
# 10% pruning, linear scale: keep the terms whose cumulative relative
# frequency lies within [0.05, 0.95] (both bounds included).
data.update({
    'df': df[df['Frec. Rel. Acum.'].between(0.05, 0.95)],
    'title': 'Ley de Zipf - Lineal - Poda del 10%\nEl ingenioso hidalgo don Quijote de la Mancha',
})
ax = zipfLineal(data)
plt.show()
ax.figure.savefig('zipf_lineal_poda_10.png', bbox_inches='tight')

In [49]:
# 10% pruning, log-log scale. Only the title is replaced: data['df'] is used
# as the preceding 10% linear cell left it.
data.update(title='Ley de Zipf - Loglog - Poda del 10%\nEl ingenioso hidalgo don Quijote de la Mancha')
ax = zipfLoglog(data)
plt.show()
ax.figure.savefig('zipf_loglog_poda_10.png', bbox_inches='tight')


In [50]:
# 15% pruning, linear scale: keep the terms whose cumulative relative
# frequency lies within [0.075, 0.925] (both bounds included).
data.update({
    'df': df[df['Frec. Rel. Acum.'].between(0.075, 0.925)],
    'title': 'Ley de Zipf - Lineal - Poda del 15%\nEl ingenioso hidalgo don Quijote de la Mancha',
})
ax = zipfLineal(data)
plt.show()
ax.figure.savefig('zipf_lineal_poda_15.png', bbox_inches='tight')

In [51]:
# 15% pruning, log-log scale. Only the title is replaced: data['df'] is used
# as the preceding 15% linear cell left it.
data.update(title='Ley de Zipf - Loglog - Poda del 15%\nEl ingenioso hidalgo don Quijote de la Mancha')
ax = zipfLoglog(data)
plt.show()
ax.figure.savefig('zipf_loglog_poda_15.png', bbox_inches='tight')