<a href="https://colab.research.google.com/github/WebiksInc/data-explorer/blob/main/function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries and utility functions

In [2]:
# Install missing packages on demand.
# NOTE: `!pip ...` is an IPython/Jupyter shell escape, so this cell only runs
# inside a notebook. The imports are not retried here; the import cell that
# follows imports gensim's Nmf and langdetect again after installation.
try:
    from gensim.models.nmf import Nmf
except ImportError as e:
    !pip install -U gensim

try:
    import langdetect
except ImportError as e:
    !pip install langdetect

In [7]:
# import libraries
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from scipy.special import rel_entr

from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from collections import Counter
from tqdm import tqdm

tqdm.pandas()

from langdetect import detect_langs
from operator import itemgetter
import itertools
import re
import os
import glob

import warnings

warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 500)

In [8]:
# Utility functions


def load_parquet_plot_distance(dirc: str, file: str):
    """Load one stored parquet table from the "without stopwords" results.

    The table is read from ``path_save/without stopwords/dirc/file``, where
    ``path_save`` is the notebook-level output directory.
    """
    location = os.path.join(path_save, "without stopwords", dirc, file)
    frame = pd.read_parquet(location)
    return frame


def remove_niqqud_from_string(my_string):
    """Return ``my_string`` without the characters U+05B0..U+05C7.

    Every character whose code point lies in 1456..1479 (inclusive) is
    dropped; all other characters are kept in their original order.
    """
    return "".join(ch for ch in my_string if not 1456 <= ord(ch) <= 1479)


def save_pickle(obj, file_name, dirc):
    """Pickle ``obj`` into the result folder that belongs to ``dirc``.

    The folder name is ``str(dirc)`` cut at its first "." and is resolved
    relative to the notebook-level ``path_save``. The folder must already
    exist; this function does not create it.
    """
    folder = str(dirc).partition(".")[0]
    target = os.path.join(path_save, folder, file_name)
    with open(target, "wb") as out:
        pickle.dump(obj, out, protocol=pickle.HIGHEST_PROTOCOL)


def save_parquet(obj, file_name, dirc):
    """Write ``obj`` as parquet into the result folder that belongs to ``dirc``.

    The folder name is ``str(dirc)`` cut at its first "." and is resolved
    relative to the notebook-level ``path_save``. ``obj`` must provide a
    ``to_parquet`` method.
    """
    folder = str(dirc).partition(".")[0]
    target = os.path.join(path_save, folder, file_name)
    obj.to_parquet(target)


def update_word_dict(line, n, words):
    """Count the n-grams of one line into the running counter ``words``.

    The line is split on single spaces; empty tokens and tokens that are a
    lone double quote are discarded. The remaining tokens are passed through
    ``remove_niqqud_from_string`` and every run of ``n`` consecutive tokens is
    added (as a tuple) to ``words`` in place. Nothing is returned.
    """
    tokens = [
        remove_niqqud_from_string(token)
        for token in line.split(" ")
        if token and token != '"'
    ]
    # zip over n shifted views of the token list yields the sliding windows.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    words.update(Counter(windows))


def detect_langs_fun(x):
    """Return the language code of the first ``detect_langs`` candidate.

    Wrapper around ``detect_langs`` that never propagates a detection failure,
    so it can be applied to a whole Series without one bad row aborting the
    run. On any failure the string "error" is returned instead.
    """
    try:
        return detect_langs(x)[0].lang
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        return "error"

# Main code

In [9]:
# Root directory under which all results are written and read back.
# The helper functions in this notebook read it as a global, so set it to
# your own output location before running the cells below.
path_save = ''

In [14]:
# Demo data set: 10 entries sampled from Wikipedia, published in four variants:
# raw text, lemmatized (with trankit), stop words removed, and stop words
# removed + lemmatized.

# Pick exactly one variant:
# flag = 'raw'
# flag = 'lemmatization'
flag = 'without stopwords'
# flag = 'without stopwords & lemmatization'

# Variant name -> location of the tab-separated sample file.
_SAMPLE_URLS = {
    'raw': 'https://raw.githubusercontent.com/NNLP-IL/data-explorer/main/wiki%20samples/wiki%20row.csv',
    'lemmatization': 'https://raw.githubusercontent.com/NNLP-IL/data-explorer/main/wiki%20samples/wiki%20lemmatization.csv',
    'without stopwords': 'https://raw.githubusercontent.com/NNLP-IL/data-explorer/main/wiki%20samples/wiki%20without%20stopwords.csv',
    'without stopwords & lemmatization': 'https://raw.githubusercontent.com/NNLP-IL/data-explorer/main/wiki%20samples/wiki%20lemmatization%20and%20without%20stopwords.csv',
}

# An unrecognized flag loads nothing and leaves ``df`` undefined.
if flag in _SAMPLE_URLS:
    df = pd.read_csv(_SAMPLE_URLS[flag], sep='\t')


In [15]:
# Distance-plot functions


def lng_distubtion_function(dis_word1, dis_word2, n: int = 5000) -> float:
    """Symmetric relative-entropy distance between two word-frequency tables.

    Both tables need a "word" and a "frequency" column. The comparison
    vocabulary is the union of the first ``n`` words of each table, restricted
    to words that occur in both tables. Frequencies are used as given; they
    are not re-normalised after the restriction.

    Args:
        dis_word1: first table (columns "word", "frequency").
        dis_word2: second table (columns "word", "frequency").
            Presumably both are sorted by descending frequency so that
            ``head(n)`` picks the most frequent words -- verify against the
            code that stores them.
        n: how many leading words of each table enter the vocabulary.

    Returns:
        ``sum(rel_entr(p, q)) + sum(rel_entr(q, p))`` over the shared words,
        0 when there are none.
    """
    vocabulary = set(dis_word1["word"].head(n)) | set(dis_word2["word"].head(n))
    # A word can only be compared if both tables have a frequency for it.
    shared = vocabulary & set(dis_word1["word"]) & set(dis_word2["word"])

    def aligned_frequencies(table):
        # Sorting by word puts both vectors in the same word order.
        rows = table[table["word"].isin(shared)]
        return rows.sort_values("word")["frequency"].to_numpy(dtype=float)

    p = aligned_frequencies(dis_word1)
    q = aligned_frequencies(dis_word2)
    return rel_entr(p, q).sum() + rel_entr(q, p).sum()


def distance_plot(main_path):
    """Compute the distance between every pair of stored corpora and pickle it.

    Each entry of ``path_save/without stopwords`` is treated as one corpus
    (its name is cut at the first "."). For every unordered pair the unigram
    tables ("ngram 1") are loaded, their frequencies are normalised to sum
    to 1, and the pair is compared with ``lng_distubtion_function``. Only the
    cells above the diagonal of the result matrix are filled.

    Args:
        main_path: base directory; the matrix is written to
            ``main_path/corpus/main/distance_plot.pickle``. That folder must
            already exist.
    """
    entries = sorted(os.listdir(os.path.join(path_save, "without stopwords")))
    corpus = [entry.split(".")[0] for entry in entries]  # drop file extension

    def relative_unigrams(name):
        # Unigram table of one corpus with frequencies scaled to sum to 1.
        table = load_parquet_plot_distance(name, "ngram 1")
        table["frequency"] = table["frequency"] / table["frequency"].sum()
        return table

    result = pd.DataFrame(index=corpus, columns=corpus)
    for first, second in itertools.combinations(corpus, 2):
        result.loc[first, second] = lng_distubtion_function(
            relative_unigrams(first), relative_unigrams(second)
        )

    target = os.path.join(main_path, "corpus", "main", "distance_plot.pickle")
    with open(target, "wb") as handle:
        pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
# Data-processing functions


def number_lines(df: pd.Series) -> int:
    """Return how many rows (lines) the series holds."""
    row_count, *_ = df.shape
    return row_count


def len_word_in_line(line: str) -> list:
    """Return the length of every word in ``line``.

    Periods are treated as separators, then the line is split on runs of
    whitespace. A line without any word (empty, blank, or only periods)
    yields an empty list, so it adds no zero-length "word" to later averages.
    """
    return [len(word) for word in line.replace(".", " ").split()]


def avg_len_word(df: pd.Series) -> float:
    """Return the mean word length over all lines of the series.

    Word lengths come from ``len_word_in_line`` and are pooled across all
    lines before averaging, so every word carries the same weight regardless
    of which line it is in. The result is a float (NaN when there is no word).
    """
    lengths = [size for line in df for size in len_word_in_line(line)]
    return np.mean(lengths)


def number_word_in_line(df: pd.Series) -> dict:
    """Return the mean and median number of words per line.

    Words are separated by runs of whitespace. A line without any word
    counts as 0 words.

    Returns:
        ``{"mean": mean rounded to 2 decimals, "median": median}``.
    """
    counts = df.apply(lambda text: len(text.split()))
    return {"mean": np.round(counts.mean(), 2), "median": counts.median()}


def char_freq(df: pd.Series) -> dict:
    """Return how often each character occurs in the whole series.

    All lines are concatenated and counted character by character. The
    returned dict is ordered from the rarest to the most frequent character.
    """
    corpus_text = "".join(df.tolist())
    tally = Counter(corpus_text)
    rarest_first = sorted(tally.items(), key=itemgetter(1))
    return dict(rarest_first)


def lexical_density(dirc: str):
    """Return the share of the corpus frequency mass held by listed stop words.

    The stop-word list is downloaded from the data-explorer repository
    (column "stopswords"). According to the project notes it is based on
    https://github.com/NNLP-IL/Stop-Words-Hebrew and covers the parts of
    speech DET, ADP, PRON, CCONJ and SCONJ.

    Args:
        dirc: result folder (below ``path_save``) that holds the stored
            unigram table "ngram 1".

    Returns:
        Summed frequency of the listed words divided by the summed frequency
        of all words in the unigram table.
    """
    stop_words = pd.read_csv(
        'https://raw.githubusercontent.com/NNLP-IL/data-explorer/main/Lexical_density.txt'
    )["stopswords"].to_list()

    word_freq = pd.read_parquet(os.path.join(path_save, dirc, "ngram 1"))
    listed = word_freq["word"].isin(stop_words)
    return word_freq.loc[listed, "frequency"].sum() / word_freq["frequency"].sum()


def identity_duplicate_line(df: pd.Series) -> int:
    """Count the lines that repeat an earlier line of the series."""
    repeats = df.duplicated()
    return repeats.sum()


def detect_lang_croup(df: pd.Series) -> pd.DataFrame:
    """Count how many entries of the series are assigned to each language.

    ``detect_langs_fun`` is applied to every entry of the series; entries for
    which detection fails are reported under "error". The purpose is to check
    whether the text contains languages other than Hebrew.

    NOTE: a simpler alternative would be a character-level check with a
    regular expression, but some languages share characters.

    Returns:
        A DataFrame built from the value counts of the detected language
        codes, with the index reset to an ordinary column.
    """
    languages = df.apply(detect_langs_fun)
    return languages.value_counts().to_frame().reset_index()


def Zipf_law(df: pd.Series) -> pd.DataFrame:
    """Compare the observed word distribution of the corpus with Zipf's law.

    Every character outside U+0590..U+05FE is replaced by a space before the
    words are counted with a unigram ``CountVectorizer``.

    Returns:
        One row per word, most frequent first, with the columns
        "rank" (1 = most frequent), "words", "freq" (relative frequency) and
        "Zipf" (a 1/rank curve normalised to sum to 1).
    """
    # Preparations
    hebrew_only = df.replace("[^\u0590-\u05fe]", " ", regex=True)

    # Word counts over the whole corpus
    vec = CountVectorizer(ngram_range=(1, 1)).fit(hebrew_only)
    totals = vec.transform(hebrew_only).sum(axis=0).tolist()[0]
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that do not have it yet.
    feature_names = (
        getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
    )
    dat = pd.DataFrame(list(zip(feature_names(), totals)))
    dat.columns = ["words", "freq"]
    dat = dat.sort_values("freq", ascending=False)
    # The fresh 0-based index of the sorted table becomes the rank column.
    dat = dat.reset_index(drop=True).reset_index(drop=False)
    dat = dat.rename(columns={"index": "rank"})
    dat["rank"] = dat["rank"] + 1

    # Zipf curve: proportional to 1 / rank, normalised like "freq"
    dat["freq"] = dat["freq"] / dat["freq"].sum()
    dat["Zipf"] = dat.loc[0, "freq"] / dat["rank"]
    dat["Zipf"] = dat["Zipf"] / dat["Zipf"].sum()

    return dat

In [25]:
# Extract topics
def coherence_check(corpus, index, dictionary, texts) -> float:
    """Train a gensim NMF model with ``index`` topics and score its coherence.

    Args:
        corpus: bag-of-words corpus passed to ``Nmf``.
        index: number of topics to train.
        dictionary: gensim dictionary used as ``id2word`` and for scoring.
        texts: tokenised documents used by the coherence model.

    Returns:
        The "c_v" coherence of the trained model, rounded to 5 decimals.
    """
    nmf = Nmf(
        corpus=corpus,
        num_topics=index,
        id2word=dictionary,
        chunksize=1000,
        passes=5,
        kappa=0.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42,  # fixed seed so repeated runs score the same model
    )

    cm = CoherenceModel(model=nmf, texts=texts, dictionary=dictionary, coherence="c_v")
    return round(cm.get_coherence(), 5)


def pick_best_number_topic(df: pd.DataFrame) -> int:
    """Choose the number of topics from a ranked table of coherence scores.

    Args:
        df: columns "num" (number of topics) and "score", sorted by
            descending score and carrying the default 0..k-1 index (the row
            position is compared against the index). The frame is not
            modified.

    Returns:
        Starting from the best-scoring row, the function follows the rows as
        long as each one has a smaller "num" than the row before it. Within
        that run it returns the "num" of the last row reached before the
        score of the preceding row is more than 1.1 times the row's own
        score. In short: the smallest topic count that is not clearly
        (by more than 10%) worse than the larger ones ranked above it.
    """
    ranked = df[["num", "score"]].copy()

    # Row i survives only if every step up to i went to a smaller "num",
    # i.e. the running count of such steps equals the row position.
    smaller_than_previous = ranked["num"].shift(1) > ranked["num"]
    run = ranked[ranked.index == smaller_than_previous.cumsum()].copy()

    # Ratio previous score / own score; > 1.1 marks a drop of more than 10%.
    drop_ratio = run["score"].shift(1) / run["score"]
    big_drops_so_far = (drop_ratio > 1.1).cumsum()

    before_first_big_drop = run[big_drops_so_far == big_drops_so_far.min()]
    return before_first_big_drop.tail(1)["num"].item()


def fine_tuning_number_topics(corpus, dictionary, texts) -> int:
    """Search 5, 10, ..., 45 topics and return the preferred topic count.

    Each candidate is scored with ``coherence_check``; the candidates are
    then ranked by descending score and handed to ``pick_best_number_topic``.
    """
    candidates = list(np.arange(5, 45 + 1, 5))

    scored = []
    for k in candidates:
        print(f"check {k!s} topics")
        scored.append((k, coherence_check(corpus, k, dictionary, texts)))

    best_first = sorted(scored, key=itemgetter(1), reverse=True)
    ranking = pd.DataFrame(best_first, columns=["num", "score"])
    return pick_best_number_topic(ranking)


def topic_model(df: pd.Series, n_words: int = 8):
    """Run an NMF topic analysis and return the top words of every topic.

    The number of topics is chosen by ``fine_tuning_number_topics`` on a
    gensim bag-of-words view of the texts; the final model is a scikit-learn
    ``NMF`` fitted on TF-IDF features (uni- and bigrams).

    Args:
        df: one document per entry; tokens are separated by single spaces.
        n_words: number of words reported per topic.

    Returns:
        A DataFrame with one row per topic and ``n_words`` columns holding the
        highest-weighted terms of the topic in ascending weight order (the
        strongest term is in the last column).

    Raises:
        ValueError: if ``n_words`` is smaller than 1.
    """
    if n_words < 1:
        raise ValueError("n_words must be at least 1")

    texts = df.apply(lambda x: x.split(" "))

    vector = TfidfVectorizer(
        min_df=3,
        max_df=0.85,
        max_features=5000,
        ngram_range=(1, 2),
        preprocessor=" ".join,  # documents arrive as token lists
    )
    tfidf = vector.fit_transform(texts)
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=3, no_above=0.85, keep_n=5000)
    corpus = [dictionary.doc2bow(text) for text in texts]

    k_topic = fine_tuning_number_topics(corpus, dictionary, texts)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that do not have it yet.
    feature_names = (
        getattr(vector, "get_feature_names_out", None) or vector.get_feature_names
    )
    terms = feature_names()

    nmf = NMF(n_components=k_topic)
    nmf.fit(tfidf)

    # Sort once for all topics; the last n_words indices of each row are the
    # highest-weighted terms, including the single strongest one.
    order = nmf.components_.argsort(axis=1)
    result = [[terms[j] for j in row[-n_words:]] for row in order]
    return pd.DataFrame(result)

In [20]:
# N-gram functions
def calculation_gini_index(dat: pd.Series) -> float:
    """Return the Gini index of the values in ``dat``.

    0 means all values are equal; values close to 1 mean the total is
    concentrated in few entries.

    Returns:
        The Gini index as a float, or NaN when ``dat`` is empty or its values
        sum to zero (the index is undefined in both cases).
    """
    sorted_x = np.sort(dat)
    n = len(sorted_x)
    if n == 0:
        return float("nan")

    cumx = np.cumsum(sorted_x, dtype=float)
    total = cumx[-1]
    if total == 0:
        return float("nan")

    return (n + 1 - 2 * np.sum(cumx) / total) / n


def ngram_fun(df: pd.Series, ngram: int, name: str) -> dict:
    """Count the n-grams of a corpus line by line and summarise them.

    Counting goes through one line at a time (``update_word_dict``) instead of
    vectorising the whole corpus, which keeps memory use manageable on large
    corpora. N-grams that occur only once are counted but not kept in the
    returned table: most of them are noise and they take a lot of memory.

    Args:
        df: one line of text per entry. Every character other than
            U+0590..U+05FE and the double quote is replaced by a space.
        ngram: size of the n-grams.
        name: label stored under "name" in the result.

    Returns:
        A dict with the keys "name", "shape unique words", "shape appeared
        once", "Percent appeared once", "top unqiue words" (DataFrame of the
        n-grams seen at least twice, most frequent first), "type-token
        ratio", "Number Of Words" and "gini". Ratios that are undefined
        (no n-gram at all, or no repeated n-gram for the Gini index) are NaN.
    """
    # Preparations
    hebrew_only = df.replace('[^\u0590-\u05fe"]', " ", regex=True)

    # Fill the counter; progress_apply is used only for its side effect on
    # ``words`` (and the progress bar), its return value is not needed.
    words = Counter()
    hebrew_only.astype(str).progress_apply(
        lambda line: update_word_dict(line, ngram, words)
    )

    repeated = {" ".join(gram): count for gram, count in words.items() if count != 1}
    appeared_once = len(words) - len(repeated)

    ngram_df = pd.DataFrame(list(repeated.items()), columns=["word", "frequency"])
    ngram_df = ngram_df.sort_values("frequency", ascending=False)

    unique_total = appeared_once + ngram_df.shape[0]
    token_total = appeared_once + ngram_df["frequency"].sum()
    undefined = float("nan")

    # The Gini index needs at least one repeated n-gram to be defined.
    if ngram_df.empty:
        gini = undefined
    else:
        gini = calculation_gini_index(ngram_df["frequency"])

    return {
        "name": name,
        "shape unique words": unique_total,
        "shape appeared once": appeared_once,
        "Percent appeared once": appeared_once / unique_total
        if unique_total
        else undefined,
        "top unqiue words": ngram_df,
        "type-token ratio": unique_total / token_total if token_total else undefined,
        "Number Of Words": token_total,
        "gini": gini,
    }

In [None]:
# Runs all the above functions, and saves the results


def loop_on_function(df: pd.DataFrame, dirc: str):
    """Run every analysis on ``df["line"]`` and store the results for ``dirc``.

    Rows whose "line" is null are dropped and the remaining lines are cast to
    ``str``. Results are written with ``save_pickle`` / ``save_parquet`` into
    the folder named after the part of ``dirc`` before its first ".", below
    the notebook-level ``path_save``; the folder is created when missing.

    The lexical-density step reads the "ngram 1" table back from disk, so it
    has to run after the n-gram step.

    Args:
        df: corpus with a "line" column. The caller's frame is not modified.
        dirc: corpus name / result folder.
    """
    # Copy the filtered rows so the column assignment below acts on an
    # independent frame instead of a slice of the caller's data.
    df = df[df["line"].notnull()].copy()
    df["line"] = df["line"].astype("str")
    lines = df["line"]

    # The save helpers write to the part of ``dirc`` before the first ".",
    # so the folder has to be created under exactly that name.
    corpus_name = str(dirc).split(".")[0]
    os.makedirs(os.path.join(path_save, corpus_name), exist_ok=True)

    # len_word
    print("avg_len_word")
    save_pickle(avg_len_word(lines), "len words.pickle", dirc)
    # Zipf_law
    print("Zipf_law")
    save_pickle(Zipf_law(lines), "Zipf law.pickle", dirc)
    # number lines
    print("number lines")
    save_pickle(number_lines(lines), "number lines.pickle", dirc)
    # Character distribution
    print("Character distribution")
    save_pickle(char_freq(lines), "Character distribution.pickle", dirc)
    # ngram: tables for 1- to 5-grams; summary statistics only for unigrams
    print("ngram")
    for size in range(1, 6):
        stats = ngram_fun(lines, size, corpus_name)
        save_parquet(stats.pop("top unqiue words"), f"ngram {size}", dirc)
        if size == 1:
            save_pickle(stats, "stat.pickle", dirc)
    # identity duplicate line
    print("identity duplicate")
    save_pickle(identity_duplicate_line(lines), "identity duplicate line.pickle", dirc)
    # topic model
    print("topic model")
    save_pickle(topic_model(lines), "topic model.pickle", dirc)
    # number word in line
    print("number word in line")
    save_pickle(number_word_in_line(lines), "number word in line.pickle", dirc)
    # Language recognition in corpus
    print("Language recognition in corpus")
    save_pickle(
        detect_lang_croup(lines), "Language recognition in corpus.pickle", dirc
    )
    # lexical_density (needs the "ngram 1" table written above)
    print("lexical_density")
    save_pickle(lexical_density(corpus_name), "lexical_density.pickle", dirc)


# Run the whole pipeline on the loaded sample; ``flag`` (the chosen data
# variant) is also used as the name of the result folder.
loop_on_function(df,flag)