In [1]:
import numpy as np
import pandas as pd
import random
import string
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
import math
import json
from nltk import tokenize
import collections
import re
import itertools
import nltk
from scipy.stats import mannwhitneyu

import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel,\
        GenericLikelihoodModelResults

from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.special import zeta
from scipy.stats import binom
from scipy.special import zeta
from scipy.misc import derivative
from scipy.stats import mannwhitneyu
from collections import Counter


import warnings
warnings.filterwarnings("ignore")
from collections import Counter
import numpy as np
import numpy.random as rand
from scipy.special import zeta
from scipy.misc import derivative
from nltk import tokenize
from keras.preprocessing.text import Tokenizer
import numpy as np
lg = np.log10

In [2]:
def remove_punctuation(text):
    text = text.lower()
    chars_to_remove = "[\n]!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    tr = str.maketrans(" ", " ", chars_to_remove)
    return text.translate(tr)


def preprocess(corpus, sent = True):
    if sent:
        corpus = tokenize.sent_tokenize(corpus)
        corpus = [remove_punctuation(sent).split() for sent in corpus]
    else:
        corpus = remove_punctuation(corpus).split()
    return corpus

def make_file(corp, multi = False, sent = True, pos = False):
    if multi:
        corpus = ''
        for subcorp in corp:
            corpus += subcorp
    else:
        corpus = corp
        
    if pos:
        corpus = part_of_speech(corpus)
    
    else:
        corpus = preprocess(corpus, sent = sent)
    
    return corpus

def subsampling(corpus, k = 1000000, m = 10, sent = True):
    n = len(corpus)
    
    sen_len = {}

    
    rank_corpera = []
    freq_corpera = []

    if sent:
        for i in range(m):
            used_rank = set()
            used_freq = set()
            rank_count = 0
            freq_count = 0
            rank_samples = []
            freq_samples = []

            while rank_count < k:
                index = np.random.randint(n)
                if index in used_rank:
                    continue

                rank_sample = corpus[index]
                len_sample = len(rank_sample)

                if len_sample == 0:
                    continue

                rank_samples += rank_sample
                rank_count += len_sample

                if len_sample not in sen_len and len_sample < 200:
                    sen_len[len_sample] = 1
                elif len_sample < 200:
                    sen_len[len_sample] += 1


                used_rank.add(index)

            while freq_count < k:
                index = np.random.randint(n)
                if index in used_freq:
                    continue
                freq_sample = corpus[index]
                len_sample = len(freq_sample)

                if len_sample == 0:
                    continue

                freq_samples += freq_sample
                freq_count += len_sample

                if len_sample not in sen_len and len_sample < 200:
                    sen_len[len_sample] = 1
                elif len_sample < 200:
                    sen_len[len_sample] += 1

                used_freq.add(index)

            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)


    else:
        for i in range(m):
            rank_samples = random.sample(corpus, k)
            freq_samples = random.sample(corpus, k)
            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)

    return rank_corpera, freq_corpera


def calculate_freqs(freq_sents):
    freq_dict = {}
    for i, corpus in enumerate(freq_sents):
        freq_dict[i] = collections.Counter(corpus)
        
    freqs_df = pd.DataFrame(freq_dict)
    freqs_df = freqs_df.fillna(0)
    
    return freqs_df


def ranks_freqs(freq_sents, rank_sents):
    freqs_df = calculate_freqs(freq_sents)
    freqs_df['Frequency'] = mean_freqs(freqs_df)
    ranks_df = calculate_ranks(rank_sents)
    ranks_df['Rank'] = mean_ranks(ranks_df)
    
 
    ranks_freqs_df = pd.concat([ranks_df, freqs_df], axis = 1)
    ranks_freqs_df = ranks_freqs_df.dropna()

    return ranks_freqs_df

def mean_freqs(freqs_df):
    return(freqs_df.mean(axis=1))

def calculate_ranks(rank_sents):
    ranks_dicts = {}
    for i, corpus in enumerate(rank_sents):
        freqs = collections.Counter(corpus)
        ranks_dicts[i] = {w: r for r, (w, c) in enumerate(freqs.most_common(), 1)}
        
    ranks_df = pd.DataFrame(ranks_dicts)
    for column in ranks_df:
        min_rank = int(np.ceil(ranks_df[column].max() + 1))
        nan_rows = ranks_df[ranks_df[column].isnull()]
        num_nans = len(nan_rows)
        nan_ranks = list(range(min_rank, min_rank+num_nans))
        random.shuffle(nan_ranks)
        ranks_df.loc[ranks_df[column].isnull(), column] = nan_ranks

    return ranks_df

def mean_ranks(ranks_df):
    return ranks_df.mean(axis=1)

class Mandelbrot(GenericLikelihoodModel):

    def __init__(self, frequencies, ranks, **kwargs):
        if not len(frequencies) == len(ranks):
            raise ValueError("NOT THE SAME NUMBER OF RANKS AND FREQS!")
        
        frequencies = np.asarray(frequencies)
        ranks = np.asarray(ranks)
        
        self.n_obs = np.sum(frequencies)
        
        super().__init__(endog=frequencies, exog=ranks, **kwargs)
        self.fit_result = None
    

    def prob(self, params, ranks=None, log=False):
        if ranks is None:
            ranks = self.exog
        
        alpha, beta = params
        if log:
            return -alpha*lg(beta+ranks) - lg(zeta(alpha, q=beta+1.))
        else:
            return ((beta + ranks)**(-alpha))/zeta(alpha, q=beta+1.)
    
    
    def loglike(self, params):
        rs = self.exog
        fs = self.endog
        alpha, beta = params

        log_probs = -alpha*lg(beta+rs) - lg(zeta(alpha, q=beta+1.))
        log_probs = log_probs.reshape(-1, )
        return np.sum(fs * log_probs) - beta**5
    
    
    def register_fit(self, fit_result, overwrite=False):
        if not self.fit_result is None and not overwrite:
            raise ValueError("A fit result is already registered and overwrite=False!")
            
        self.fit_result = fit_result
        self.optim_params = fit_result.params
        self.pseudo_r_squared = self.pseudo_r_squared(self.optim_params)
        self.SE, self.SE_relative = fit_result.bse, fit_result.bse/self.optim_params
        self.BIC, self.BIC_relative = fit_result.bic,\
                            (-2*self.null_loglike())/fit_result.bic

        return self.optim_params
    
    def print_result(self, string=False):
        if self.fit_result is None:
            raise ValueError("Register a fitting result first!")

        def format_x(x):
            return float('{0:.3g}'.format(x))


        s = "="*50
        s += "\n" + "MANDELBROT"
        s += "\n" + "  Optimal Parameters " + str(tuple(map(format_x, self.optim_params)))
        
        s += "\n" + "  Standard Error [relative]: " + str(tuple(map(format_x, self.SE))) +\
              ", [" + str(tuple(map(format_x, self.SE_relative))) + "]"
        
        s += "\n" + "  Pseudo R^2: " + str(format_x(self.pseudo_r_squared))
        
        s += "\n" + "  BIC [relative]: " + str(format_x(self.BIC)) +\
              ", [" + str(format_x(self.BIC_relative)) + "]"
        s += "\n" + "="*50
        
        if string:
            return s
        
        print(s)
    
    
    def null_loglike(self, epsilon=1e-10):
        return self.loglike((1.+epsilon, 0.0))
    
    def pseudo_r_squared(self, params):
        return 1-self.loglike(params)/self.null_loglike()
    
    
    def predict(self, params, ranks=None, freqs=True, n_obs=None, 
                correct_for_finite_domain=True):
        if ranks is None:
            ranks = self.exog
        ranks = np.asarray(ranks)
        
        if n_obs is None:
            n_obs = self.n_obs
            
        alpha, beta = params
        pred_probs = self.prob(params, ranks=ranks, log=False)
        
        if correct_for_finite_domain:
            if not freqs:
                raise NotImplementedError("Correction for "\
                                          "finite domain not implemented with probabilities!")
            return pred_probs*(n_obs/np.sum(pred_probs))
        
        if freqs:
            return n_obs*pred_probs
        
        return pred_probs

def zipfs_law(df):
    mandelbrot = Mandelbrot(df['Frequency'], df['Rank'])
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]), # [1.0, 1.0]
                                method="powell", full_output=True, disp=0)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    
    model_params = mandelbrot.optim_params
    alpha, beta =  mandelbrot.optim_params
    preds = mandelbrot.predict(model_params, df['Rank'])
    df['Estimated frequency'] = preds
    df['Rank (log)'] = np.log(df['Rank'])
    df['Frequency (log)'] = np.log(df['Frequency'])
    df['Estimated frequency (log)'] = np.log(df['Estimated frequency'])
    df['Error'] = df['Frequency (log)'] - df['Estimated frequency (log)']
    return mandelbrot, df

def zipf_entropy(alpha, dx=1e-10):
    if alpha <= 1.0:
        raise ValueError("Entropy undefined for the given parameter:\n" + 
                         str(alpha))
    return alpha*(-derivative(zeta, alpha, dx=dx))/zeta(alpha) + lg(zeta(alpha))

def mandelbrot_entropy(alpha, beta, dx=1e-10):
    if alpha <= 1.0 or beta <= 1.0:
        raise ValueError("Entropy undefined for the given parameters:\n" + 
                         str(alpha) + " and " + str(beta))
    zeta_b = lambda a: zeta(a, beta+1)
    return alpha*(-derivative(zeta_b, alpha, dx=dx))/zeta_b(alpha) + lg(zeta_b(alpha))


def neg_log_likelihood(zipf_model, ranks, freqs):
    mle_params = zipf_model.optim_params
    log_rank_probs = zipf_model.prob(params=mle_params, ranks=ranks, log=True)    
    return -freqs*log_rank_probs
    
    
def empirical_entropy(zipf_model, joint_rank_freqs):
    rs = list(joint_rank_freqs["Rank"])
    fs = list(joint_rank_freqs["Frequency"])
    ranks = np.asarray(rs)
    freqs = np.asarray(fs)
    n = np.sum(freqs)
    return (1/n)*np.sum(neg_log_likelihood(zipf_model, ranks, freqs))

def typicality(zipf_model, joint_rank_freqs):
    mle_params = zipf_model.optim_params
    return mandelbrot_entropy(*mle_params) - empirical_entropy(zipf_model, joint_rank_freqs)

k = 400000
m = 6

In [5]:
# Calculates typicality of corpus

# Change text to desired corpus
text = '1000000_0_0'

file = 'CLEAN_' + text + '.txt'
path = 'data/' + text + '/'


with open(path + file , encoding="utf8") as handle:
    test =  [l.strip() for l in handle.readlines()]
    
test1 = []
for arg in test:
    test1.append(arg + '.')
    
test = test1
sep_corps = [make_file(corpus, multi = True) for corpus in test]

sep_corps1 = []
for x in sep_corps:
    sep_corps1.append(x[0])
len(sep_corps1)

k = 400000
m = 6

rank_corpora, freq_corpora = subsampling(sep_corps1, k=k, m=m)
ranks_freqs_df = ranks_freqs(rank_corpora, freq_corpora)
mand, df = zipfs_law(ranks_freqs_df)
typ = typicality(mand, ranks_freqs_df)
print(typ)

MANDELBROT
  Optimal Parameters (1.14, 3.84)
  Standard Error [relative]: (0.000344, 0.0246), [(0.000303, 0.0064)]
  Pseudo R^2: 0.654
  BIC [relative]: 3670000.0, [2.89]
6.115576226257691


In [6]:
data = ' '.join(test)
word_tokeniser = Tokenizer()
word_tokeniser.fit_on_texts([data])
vocab_size = len(word_tokeniser.word_index)
vocab_size

226924