<a href="https://colab.research.google.com/github/WebiksInc/data-explorer/blob/add-stop-word/function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pickle

pd.set_option('display.max_rows', 500)

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import IPython.display
from IPython.display import display, clear_output
import os

import warnings
warnings.filterwarnings('ignore')

# Root of the mounted Google Drive; every corpus / display path below is built from it.
# main_path = r'C:/Users/user/My Drive (roei_shlezinger@webiks.com)'
main_path = r'/content/drive/MyDrive'
# Alternative pair of folders for the variant that keeps stop words:
# path_corp = main_path + r'/corpus/with stopwords'
# path_main = main_path + r'/corpus/for display/with stopwords'

# path_corp: folder the raw corpus files are read from.
# path_main: folder under which one sub-folder of pickled statistics per corpus is written/read.
path_corp = main_path + r'/corpus/without stopwords'
path_main = main_path + r'/corpus/for display/without stopwords'

# Read by plot_shape_all_data(): True recomputes the word counts, False loads the cached CSV.
process_data = False

!pip install langdetect


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# NLP IL Hebrew comparisons Dataset Explorer

## competitive analysis

In [3]:
def number_words(df:pd.Series) -> int:
    """Return the total number of whitespace-separated tokens over all non-null entries."""
    present = df.dropna()
    tokens_per_entry = present.astype("string").str.split()
    return tokens_per_entry.apply(len).sum()

def set_cd(dirc):
    """Make `path_main/<dirc>` the working directory, creating the folder first if needed."""
    target = path_main + '/' + dirc
    create_folder(target)
    os.chdir(target)

def create_folder(directory):
    """Create `directory` (including missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window between "check" and "create" in which another process could
    create the folder and make ``makedirs`` fail.
    """
    os.makedirs(directory, exist_ok=True)
        
def plot_shape_all_data():
    """Plot the number of words per corpus as a horizontal bar chart (log x-axis).

    When the module-level flag ``process_data`` is true, every ``.csv``/``.tsv``
    file found under ``path_corp`` is read (tab separated), its ``line`` column
    (or ``text``, which is renamed to ``line``) is counted with ``number_words``
    and the table is cached to ``<main_path>/corpus/for display/main/result.csv``.
    Otherwise the cached table is loaded. The chart is drawn in both modes.
    """
    result_path = main_path + '/corpus/for display/main/result.csv'
    if process_data:
        rows = []
        for root, dirs, files in os.walk(path_corp):
            for file in files:
                if file.endswith(('.csv', '.tsv')):
                    print(file)
                    # os.walk descends into sub-folders, so the file has to be
                    # opened relative to `root`, not to the top-level folder.
                    df = pd.read_csv(os.path.join(root, file), sep='\t')
                    df = df.rename(columns={'text': 'line'})
                    rows.append((file.split('.')[0], number_words(df['line'])))
        # Building from records keeps both columns even when no file matched.
        result = pd.DataFrame(rows, columns=['Name Database', 'Number of Words'])
        result.to_csv(result_path)
    else:
        result = pd.read_csv(result_path)

    fig = go.Figure(go.Bar(x=result['Number of Words'], y=result['Name Database'], orientation='h'))
    fig.update_xaxes(type="log")
    fig.update_layout(showlegend=False,
                      title={'text': "Number of Words (log)"})
    fig.show()
# Flip the flag to True to recompute the per-corpus word counts instead of loading the cached CSV.
process_data = False
# process_data = True
plot_shape_all_data()

In [4]:
# function

def number_lines(df:pd.Series) -> int:
    """Return how many entries (lines) the series holds."""
    return df.shape[0]

def Connecting_words(x):
    """Return the length of every word in `x`.

    Periods are treated as separators and runs of whitespace are collapsed
    before splitting; an empty/blank input yields ``[0]``.
    """
    without_periods = x.replace(".", " ")
    collapsed = ' '.join(without_periods.split())
    return list(map(len, collapsed.split(' ')))

def len_word(df:pd.Series):
    """Return the mean word length over all lines, using `Connecting_words` per line."""
    all_lengths = []
    for line_lengths in df.apply(Connecting_words):
        all_lengths.extend(line_lengths)
    return np.mean(all_lengths)


def number_word_in_line(df:pd.Series) -> dict:
    """Return the rounded mean and the median of the per-line word counts.

    A line is split on single spaces, so consecutive spaces produce empty
    "words" that are counted as well.
    """
    def _count(line):
        return len(line.split(' '))

    per_line = df.apply(_count)
    summary = {}
    summary['mean'] = np.round(per_line.mean())
    summary['median'] = per_line.median()
    return summary


def char_freq(df:pd.Series,plot = False) -> dict:
    """Count how often every character occurs across all lines.

    Parameters
    ----------
    df : pd.Series
        Series of strings.
    plot : bool
        Accepted for backward compatibility and ignored. The original body
        returned the dictionary before ever looking at this flag, and the
        character-distribution plots in this notebook call ``.items()`` on
        the saved result, so the dictionary return is kept.

    Returns
    -------
    dict
        Character -> count, ordered by ascending count. The right-to-left
        mark (U+200F) is dropped.
    """
    from collections import Counter
    counts = Counter(''.join(df.tolist()))
    counts.pop('\u200f', None)  # TODO: it's ok to delete this char?
    return dict(sorted(counts.items(), key=lambda item: item[1]))


def ngram(df:pd.Series, ngram:int) -> dict:
    """Compute n-gram frequency statistics for a series of text lines.

    Every character outside the Hebrew block, the space and ASCII letters is
    replaced by a space before the n-grams of size `ngram` are counted.

    Returns a dict with the number of distinct n-grams, how many occur exactly
    once (absolute and as a fraction), the full frequency table sorted by
    descending frequency, the type-token ratio, the total number of n-gram
    occurrences and the Gini coefficient of the frequencies.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    cleaned = df.replace('[^\u0590-\u05fe a-zA-Z]',' ',regex=True)

    # Count every n-gram of the requested size over the whole series.
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram)).fit(cleaned)
    totals = vectorizer.transform(cleaned).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    table = pd.DataFrame(pairs, columns=['word','frequency']).sort_values('frequency',ascending = False)
    singletons = table[table['frequency'] == 1]

    # Gini coefficient computed from the cumulative sum of the ascending frequencies.
    ascending = np.sort(table['frequency'])
    count = len(ascending)
    running = np.cumsum(ascending, dtype=float)
    gini = ((count + 1 - 2 * np.sum(running) / running[-1]) / count)

    n_types = table.shape[0]
    n_once = singletons.shape[0]
    n_tokens = table['frequency'].sum()
    return {
        'shape unique words' : n_types,
        'shape appeared once' : n_once,
        'Percent appeared once' : n_once/n_types,
        'top unqiue words' : table,
        'type-token ratio' : n_types/n_tokens,
        'Number Of Words' : n_tokens,
        'gini' : gini,
    }



def identity_duplicate_line(df:pd.Series) -> int:
    """Return how many lines repeat an earlier line (first occurrences are not counted)."""
    repeated = df.duplicated()
    return repeated.sum()

def topic_model(df:pd.Series):
    """Pick a number of topics by coherence score and return the terms of each topic.

    Steps, as implemented below:
    1. Split every line on single spaces and build a TF-IDF matrix (uni- and bigrams).
    2. Fit a gensim ``Nmf`` model for 5, 10, ..., 45 topics and record the ``c_v``
       coherence of each.
    3. Choose one topic count from the ranked scores (see the comments further down).
    4. Fit a scikit-learn ``NMF`` with that many components on the TF-IDF matrix.

    Returns a DataFrame with one row per topic and eight terms per row.
    Progress (the topic count currently being fitted) is printed.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    texts = df.apply(lambda x : x.split(' '))
    # The input is already tokenised, so the "preprocessor" joins the tokens back
    # into a string that the vectorizer then tokenises itself.
    vector = TfidfVectorizer(
        min_df=3,
        max_df=0.85,
        max_features=5000,
        ngram_range=(1, 2),
        preprocessor=' '.join
    )
    tfidf = vector.fit_transform(texts)
    
    from gensim.corpora import Dictionary
    from sklearn.decomposition import NMF
    from gensim.models.nmf import Nmf
    from gensim.models.coherencemodel import CoherenceModel

    dictionary = Dictionary(texts)


    # Same thresholds as the TF-IDF vectorizer above (min 3 docs, max 85% of docs, 5000 terms).
    dictionary.filter_extremes(
        no_below=3,
        no_above=0.85,
        keep_n=5000
    )
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create a list of the topic numbers we want to try
    topic_nums = list(np.arange(5, 45 + 1, 5))

    # Run the nmf model and calculate the coherence score
    # for each number of topics
    coherence_scores = []

    for num in topic_nums:
        print(num)
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=2000,
            passes=5,
            kappa=.1,
            minimum_probability=0.01,
            w_max_iter=300,
            w_stop_condition=0.0001,
            h_max_iter=100,
            h_stop_condition=0.001,
            eval_every=10,
            normalize=True,
            random_state=42
        )

        # Run the coherence model to get the score
        cm = CoherenceModel(
            model=nmf,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )

        coherence_scores.append(round(cm.get_coherence(), 5))

    from operator import itemgetter

    # Column 0 = number of topics, column 1 = coherence; rows ordered best score first.
    scores = list(zip(topic_nums, coherence_scores))
    temp = pd.DataFrame(sorted(scores, key=itemgetter(1), reverse=True))
    # Keep only the leading run of rows in which each row has fewer topics than the
    # (better-scoring) row before it: the running count of such decreases must equal
    # the row position.
    temp['shift'] = temp[0].shift(1)
    temp['shift'] = temp['shift'] > temp[0]
    temp['shift'] = temp['shift'].cumsum()
    temp = temp[temp.index == temp['shift']]

    # Ratio of the previous row's coherence to this row's; a ratio above 1.1 marks a
    # drop of more than 10%. The last row before the first such drop is selected.
    temp['shift'] = temp[1].shift(1)
    temp['shift'] = temp['shift']/temp[1]
    temp['improve 10%'] = temp['shift'] > 1.1
    temp['temp'] = temp['improve 10%'].cumsum()
    best_num_topics = temp[temp['temp'] == temp['temp'].min()].tail(1)[0].item()
    
    # NOTE(review): `get_feature_names` is presumably unavailable in recent scikit-learn
    # releases (replaced by `get_feature_names_out`) - confirm against the installed version.
    terms = vector.get_feature_names()
    result = []
    k = best_num_topics
    # NOTE(review): no random_state is passed here, unlike the gensim models above.
    nmf  = NMF(n_components = k)
    nmf.fit(tfidf)
    for i in range(0,k):
        word_list=[]
        # print("Topic%d:"% i)
        # NOTE(review): the slice -9:-1 yields eight indices and leaves out the last
        # argsort position (the largest weight) - confirm this is intended.
        for j in nmf.components_.argsort()[i,-9:-1]:
            word_list.append(terms[j])
        result.append(word_list)
    return pd.DataFrame(result)

def topic_model_old(df:pd.Series,k = 4) -> list:
    """Fit an NMF model with `k` components on TF-IDF features of the lines.

    Returns one list of terms per topic, taken from argsort positions
    -16 up to (but excluding) -1 of the topic's component weights.

    Based on:
    https://github.com/ashishsalunkhe/Topic-Modeling-using-LDA-and-K-Means-Clustering/blob/master/newsgroup.ipynb
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(df)
    vocabulary = vectorizer.get_feature_names()

    from sklearn.decomposition import NMF
    model = NMF(n_components = k)
    model.fit(weights)

    topics = []
    for topic in range(0, k):
        term_ids = model.components_.argsort()[topic, -16:-1]
        topics.append([vocabulary[term_id] for term_id in term_ids])
    return topics

def co_occurrence(df:pd.Series,dirc: str) -> pd.DataFrame:
    """Build per-word co-occurrence tables for the five most frequent unigrams of `dirc`.

    Lines are split on periods; two words co-occur when they appear in the same
    fragment. The value actually returned is a dict mapping each of the five
    words to a ten-row DataFrame. In that DataFrame the column labelled
    'number' holds the co-occurring terms and the column labelled 'word' holds
    the counts (the frame is sorted by that count column).
    """
    from sklearn.feature_extraction.text import CountVectorizer

    fragments = df.apply(lambda x: x.split('.')).explode()
    counter = CountVectorizer(ngram_range=(1,1))
    doc_term = counter.fit_transform(fragments)
    pair_counts = (doc_term.T * doc_term)
    pair_counts.setdiag(0)  # a word does not co-occur with itself
    matrix = pd.DataFrame(pair_counts.todense(),
                          columns=counter.get_feature_names(),
                          index=counter.get_feature_names())

    top_words = load_pickle(dirc,'ngram 1.pickle')['top unqiue words']['word'].head().to_list()
    tables = {}
    for word in top_words:
        neighbours = matrix[word].nlargest(10).to_frame().reset_index()
        neighbours.columns = ['number','word']
        tables[word] = neighbours.sort_values('word')
    return tables


def detect_lang_croup(df:pd.Series) -> None:
    """Detect the most likely language of every line and count lines per language.

    Punctuation, digits and the right-to-left mark are stripped first; entries
    that end up blank or equal to '[]' are discarded. Despite the annotation,
    the value returned is the ``value_counts`` table converted to a DataFrame
    with the index reset.
    """
    from langdetect import detect_langs

    cleaned = (
        df.str.replace('[.,\/#!?$%\^&\*;:{}=\-_`~()"0-9]','',regex = True)
          .str.replace('[\u200f]','',regex = True)
          .replace(r'^\s*$', np.nan, regex=True)  # cells that contain only spaces
          .replace('[]',np.nan)                   # literal '[]' cells
    )
    cleaned = cleaned[cleaned.notnull()]
    languages = cleaned.apply(lambda x: detect_langs(x)[0].lang)
    return languages.value_counts().to_frame().reset_index()


def Zipf_law(df:pd.Series):
    """Return a rank/frequency table of unigrams next to the matching Zipf curve.

    Columns: 'rank' (1 = most frequent), 'words', 'freq' (share of all
    occurrences) and 'Zipf' (top count divided by rank, normalised to sum to 1).
    """
    from sklearn.feature_extraction.text import CountVectorizer

    cleaned = df.replace('[^\u0590-\u05fe a-zA-Z]',' ',regex=True)
    vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(cleaned)
    doc_term = vectorizer.transform(cleaned)
    counts = doc_term.sum(axis=0).tolist()[0]

    table = pd.DataFrame(zip(vectorizer.get_feature_names(), counts))
    table.columns = ['words','freq']
    table = (
        table.sort_values('freq',ascending= False)
             .reset_index(drop = True)
             .reset_index(drop = False)
             .rename(columns = {'index':'rank'})
    )
    table['rank'] = table['rank'] + 1

    # Ideal curve: the top raw count divided by the rank (taken before 'freq' is normalised).
    top_count = table.loc[0,'freq']
    table['Zipf'] = (top_count/table['rank'])

    for column in ('freq', 'Zipf'):
        table[column] = table[column]/table[column].sum()

    return table


def lexical_density(dirc:str):
    """Return the share of unigram occurrences in `dirc` whose word is in the 'stopswords' list.

    The word list is read from ``Lexical_density.csv`` and the unigram table
    from the ``ngram 1.pickle`` saved under ``path_main/<dirc>``.
    """
    listed_words = pd.read_csv(main_path + '/Colab Notebooks/Lexical_density.csv')['stopswords'].to_list()

    pickle_path = path_main + '/' + dirc + '/ngram ' + str(1) + '.pickle'
    with open(pickle_path, 'rb') as handle:
        unigrams = pickle.load(handle)['top unqiue words'].sort_values('frequency')

    listed = unigrams[unigrams['word'].isin(listed_words)]
    return listed['frequency'].sum()/unigrams['frequency'].sum()

In [None]:
# save the data

def database_comparison(df,df_altr,fun,titl):
    """Compare one statistic between the main and an alternative dataset.

    Parameters
    ----------
    df, df_altr : pd.DataFrame
        Frames with a 'line' column; `df_altr` may be empty.
    fun : callable
        Statistic computed on the 'line' series.
    titl : str
        Title of the pie chart.

    Returns
    -------
    A plotly pie figure with both values when `df_altr` has rows, otherwise the
    statistic of the main dataset alone.
    """
    if df_altr.shape[0] > 0:
        dat = pd.DataFrame({'shape' : [fun(df['line']), fun(df_altr['line'])],'name' : ['main df','alternative df']})
        fig = px.pie(dat, values='shape', names='name',title = titl)
        fig.update_traces(hoverinfo='label+percent', textinfo='value')
        return fig
    # Apply `fun` to the 'line' series, exactly as in the comparison branch;
    # previously the whole frame was passed here.
    return fun(df['line'])

def save_pickle(obj,file_name):
    """Serialise `obj` to `file_name` using the highest pickle protocol available."""
    import pickle
    protocol = pickle.HIGHEST_PROTOCOL
    with open(file_name, 'wb') as sink:
        pickle.dump(obj, sink, protocol=protocol)
            
def save_function(df : pd.DataFrame,dirc:str):
    """Compute every corpus statistic for `df` and pickle each one into the current directory.

    df   : frame with a 'line' column; null lines are dropped and the rest cast to str.
    dirc : file name of the corpus; the part before the first '.' is used as its folder name
           for the lexical-density lookup.

    The pickles are written with relative file names, so the caller is expected to have
    changed into the corpus folder first (load_database does this through set_cd).
    """
    df = df[df['line'].notnull()]
    df['line'] = df['line'].astype('str')
    
    #len_word
    temp = len_word(df['line'])
    save_pickle(temp,'len words.pickle')
    #Zipf_law
    temp = Zipf_law(df['line'])
    save_pickle(temp,'Zipf law.pickle')
    #number lines
    temp = number_lines(df['line'])
    save_pickle(temp,'number lines.pickle')
    #Character distribution
    temp = char_freq(df['line'],True)
    save_pickle(temp,'Character distribution.pickle')
    #ngram
    # NOTE(review): a later cell of this notebook assigns an integer to the module-level
    # name `ngram`; once that cell has run, this call no longer reaches the function.
    for x in range(1,6):
        temp = ngram(df['line'],x)
        save_pickle(temp,'ngram ' + str(x) + '.pickle')
    # #top 5 words and another stat
    # temp = ngram(df['line'],1)
    # save_pickle(temp,'another stat.pickle')
    #identity duplicate line
    temp = identity_duplicate_line(df['line'])
    save_pickle(temp,'identity duplicate line.pickle')
    #topic model
    # The topic model is timed; the elapsed seconds are printed.
    import time
    start = time.time()
    temp = topic_model(df['line'])
    end = time.time()
    print(end - start)  
    save_pickle(temp,'topic model.pickle')
    #number word in line
    temp = number_word_in_line(df['line'])
    save_pickle(temp,'number word in line.pickle')
    #Language recognition in corpus
    temp = detect_lang_croup(df['line'])
    save_pickle(temp,'Language recognition in corpus.pickle')
    #lexical_density
    # Must run after the n-gram step: lexical_density reads the corpus' 'ngram 1.pickle'.
    temp = lexical_density(dirc.split('.')[0])
    save_pickle(temp,'lexical_density.pickle')

    

    
def load_database(start: int = 12):
    """Compute and save the statistics of every corpus file in `path_corp`.

    Parameters
    ----------
    start : int
        Number of entries, in sorted directory order, to skip before
        processing. The default of 12 reproduces the previously hard-coded
        offset; pass 0 to process every file.

    For each remaining ``.csv``/``.tsv`` file the working directory is switched
    to the corpus' own folder (`set_cd`), the file name is printed, the file is
    read as tab-separated text and handed to `save_function`.
    """
    files = np.sort(os.listdir(path_corp))[start:]
    for file in files:
        if file.endswith(('.csv', '.tsv')):
            set_cd(file.split('.')[0])
            print(file)
            df = pd.read_csv(path_corp + '/' + file, sep='\t')
            save_function(df, file)
# Runs the whole statistics pipeline; the output printed below shows one timing per corpus file.
load_database()

bible.csv
5
10
15
20
25
30
35
40
45
36.988131284713745
doctors.csv
5
10
15
20
25
30
35
40
45
118.83844590187073
foodWalla.csv
5
10
15
20
25
30
35
40
45
196.85246562957764
foodpage.csv
5
10
15
20
25
30
35
40
45
57.03791332244873
haaretz.csv
5
10
15
20
25
30
35
40
45
2721.2194876670837
infomad.csv
5
10
15
20
25
30
35
40
45
91.06147885322571
knesset.csv
5
10
15
20
25
30
35
40
45
1553.6615071296692
paraShoot.csv
5
10
15
20
25
30
35
40
45
33.79060411453247
sentiment_Data.csv
5
10
15
20
25
30
35
40
45
69.12683367729187
sport5.csv
5
10
15
20
25


# Dataset information

In [None]:
# Initial selection shown by the dashboard: main corpus, n-gram size, secondary corpus.
# NOTE(review): this rebinds the module-level name `ngram`, which an earlier cell defines as a
# function (called by save_function) - re-running that pipeline after this cell would fail.
dirc1,ngram = 'ben yehuda',1
dirc2 = 'think IL'

def load_pickle(dirc:str,file: str):
    """Load the pickled statistic `file` of corpus `dirc`.

    The 'Remove Stop Words' checkbox decides whether the file is read from the
    'without stopwords' or the 'with stopwords' display folder.
    """
    variant = 'without stopwords' if stop_words_box.value else 'with stopwords'
    location = main_path + '/corpus/for display' + '/'+ variant + '/' + dirc + '/' + file
    with open(location, 'rb') as source:
        return pickle.load(source)


#data
# Corpus names offered in both dropdowns; load_pickle uses the selected name as a folder name.
lst_data = ['ben yehuda','think IL','paraShoot','Hebrew-Sentiment','HeBERT Emotion Recognition','wikipedia'
            ,'Arutz 7 (mila)','Doctors (mila)','Foodpage Corpus (mila)','haaretz (mila)','HaKnesset (mila)',
            'Hebrew Dotted Text (mila)','Infomed (mila)','Learning Man (mila)','Nature of Healing (mila)',
            'Spoken Israeli Hebrew (mila)','Sport5 Corpus (mila)','Tapuz People Forum (mila)','TheMarker (mila)',
            'To Be Healthy (mila)','Walla Food Corpus (mila)']
data_widget = widgets.Dropdown(
    options = lst_data,
    description='main dataset',
    value = 'ben yehuda'
)

data_widget2 = widgets.Dropdown(
    options = lst_data,
    description='secondary dataset',
    value = 'think IL'
)

#ngram
# Sizes 1-5 match the 'ngram <n>.pickle' files written by save_function.
ngram_slider = widgets.IntSlider(
    description = 'n-gram',
    min=1,
    max=5,
    step=1,
    value=1
)

#stop words
# Must exist before the first load_pickle call below, which reads its value.
stop_words_box = widgets.Checkbox(True, description='Remove Stop Words')


# co-occurrence
# Options are the five most frequent unigrams of the initial main corpus.
word_list = load_pickle(dirc1,'ngram 1.pickle')['top unqiue words']['word'].head().to_list()
co_widget = widgets.Dropdown(
    options=word_list,
    description='select word'
    )

# Free-text box for looking up the frequency of an arbitrary word or phrase.
# NOTE(review): the placeholder text looks unrelated to this dashboard - confirm.
custom_widget = widgets.Text(
    value='דוגמא',
    placeholder='Paste ticket description here!',
    description='enter word:',
    disabled=False
)


In [None]:
from plotly.subplots import make_subplots

# Bar colours: index 0 = main dataset, index 1 = secondary dataset.
colors = ['indianred','lightblue']

# Most frequent unigram of each initially selected corpus.
word_co = load_pickle(dirc1,'ngram 1.pickle')['top unqiue words']['word'].head().to_list()[0]
word_co2 = load_pickle(dirc2,'ngram 1.pickle')['top unqiue words']['word'].head().to_list()[0]

#Character distribution
def chr_freq_fun(dirc:str,color):
    """Return a horizontal bar trace with the relative frequency of each Hebrew letter in `dirc`.

    `color` is the index into the module-level `colors` list.
    """
    counts = load_pickle(dirc,'Character distribution.pickle')
    table = pd.DataFrame(counts.items(),columns = ['char','freq'])
    letters = table[table['char'].isin(list('אבגדהוזחטיכךלמםנןסעפףצץקרשת'))]
    # Normalise so that the letter shares sum to one.
    letters = letters.assign(freq=letters['freq']/letters['freq'].sum())
    return go.Bar(marker_color=colors[color],
                  y=letters['char'],
                  x=letters['freq'],
                  orientation='h')

# Character-distribution figure: one trace per dataset, wrapped in a FigureWidget so that
# the callbacks can update it in place.
chr_freq_plt = go.Figure(data=[chr_freq_fun(dirc1,0),chr_freq_fun(dirc2,1)])
chr_freq_plt.update_layout(showlegend=False)
chr_freq_plt = go.FigureWidget(chr_freq_plt)
chr_freq_plt.layout.title.text = 'Character Distribution'
chr_freq_plt.layout.title.x = 0.5
chr_freq_plt.layout.title.y = 0.85


# "Noise" = number of distinct characters outside the Hebrew letter list, per dataset.
chr_freq1 = load_pickle(dirc1,'Character distribution.pickle')
chr_freq_row1 = pd.DataFrame(chr_freq1.items(),columns = ['char','freq'])
chr_freq2 = load_pickle(dirc2,'Character distribution.pickle')
chr_freq_row2 = pd.DataFrame(chr_freq2.items(),columns = ['char','freq'])
chr_freq_not1 = chr_freq_row1[~chr_freq_row1['char'].isin(list('אבגדהוזחטיכךלמםנןסעפףצץקרשת'))].shape[0]
chr_freq_not2 = chr_freq_row2[~chr_freq_row2['char'].isin(list('אבגדהוזחטיכךלמםנןסעפףצץקרשת'))].shape[0]
# The y labels are one and two spaces so that the two bars stay distinct without visible text.
unuse_char_plt = go.Bar(
    y=[' ','  '],
    x=[chr_freq_not1,chr_freq_not2],
    orientation='h',
    marker_color=colors)
unuse_char_plt = go.Figure(unuse_char_plt).update_xaxes(type="log")
unuse_char_plt = go.FigureWidget(unuse_char_plt)
unuse_char_plt.layout.title.text = 'Number of characters that are noise'
unuse_char_plt.layout.title.y = 0.85


#ngram
def ngram_plt_fun(dirc:str,ngram:int,number_return,clr):
    """Return a horizontal bar trace of the `number_return` most frequent n-grams of `dirc`.

    Bar length is the n-gram's share of all occurrences; the absolute count is
    attached as customdata for the hover text. `clr` selects the colour and is
    also used as the trace name.
    """
    pickle_path = path_main + '/' + dirc + '/ngram ' + str(ngram) + '.pickle'
    with open(pickle_path, 'rb') as handle:
        table = pickle.load(handle)['top unqiue words'].sort_values('frequency')
    table['precent'] = table['frequency']/table['frequency'].sum()
    most_frequent = table.sort_values('frequency',ascending = False).head(number_return)
    # Ascending order, so the largest bar is drawn last.
    shown = most_frequent.sort_values('frequency')
    return go.Bar(name = clr,
                  y=shown['word'],
                  x=shown['precent'],
                  customdata = shown['frequency'],
                  orientation='h',
                  hovertemplate="<b>%{y}</b> <b>Absolute Number : %{customdata}</b>",
                  marker_color=colors[clr]
                  )

# Top-10 n-gram figure with one trace per dataset; `ngram` is the integer set with the defaults.
ngram_plot = go.Figure([ngram_plt_fun(dirc1,ngram,10,0),ngram_plt_fun(dirc2,ngram,10,1)]) 
ngram_plot.update_layout(showlegend=False)
ngram_plot = go.FigureWidget(ngram_plot)
ngram_plot.layout.title.text = 'ngram'
ngram_plot.layout.title.x = 0.5
ngram_plot.layout.title.y = 0.928

#Word frequency search
def ngram_custom_fun(dirc1:str,dric2:str,word:str):
    """Return a bar trace comparing how frequent `word` is in two corpora.

    Parameters
    ----------
    dirc1 : str
        Folder name of the main corpus.
    dric2 : str
        Folder name of the secondary corpus (parameter name kept as is for
        compatibility).
    word : str
        N-gram to look up; a corpus that does not contain it contributes 0.

    The n-gram size is taken from the module-level `ngram` value. Bar length is
    the word's share of all n-gram occurrences; the absolute count is attached
    as customdata for the hover text.
    """
    def _lookup(dirc):
        # Returns (share, absolute count) of `word` in one corpus.
        path = path_main + '/' + dirc + '/ngram ' + str(ngram) + '.pickle'
        with open(path, 'rb') as handle:
            table = pickle.load(handle)['top unqiue words']
        match = table[table['word'] == word]
        if match.shape[0] == 0:
            # Word absent from this corpus: show an empty bar instead of failing.
            return 0, 0
        count = match['frequency'].item()
        return float(count/table['frequency'].sum()), count

    # Use the function's own second argument rather than the global `dirc2`.
    share1, count1 = _lookup(dirc1)
    share2, count2 = _lookup(dric2)

    ngram_plot = go.Bar(y=[dirc1,dric2],
                        x=[share1,share2],
                        customdata = [count1,count2],
                        hovertemplate="<b>Absolute Number : %{customdata}</b>",
                        orientation='h',
                        marker_color=colors)
    return ngram_plot
# Custom word-frequency figure, initialised with the same word as the text box default.
custom_plot = go.Figure(ngram_custom_fun(dirc1,dirc2,'דוגמא'))
custom_plot.update_layout(showlegend=False)
custom_plot = go.FigureWidget(custom_plot)
custom_plot.layout.title.text = 'custom ngram:'
custom_plot.layout.title.x = 0.5
custom_plot.layout.title.y = 0.85


def stat_fun(dirc):
    """Collect the scalar statistics of corpus `dirc` into one dict.

    Starts from 'another stat.pickle' (without its frequency table), adds the
    average word length, the number of lines and the mean/median words per
    line, and rounds the ratio-like values to two decimals.
    """
    stat = load_pickle(dirc,'another stat.pickle')
    stat.pop('top unqiue words',None)

    stat['Average len words'] = np.round(load_pickle(dirc,'len words.pickle'),2)
    stat['number lines'] = load_pickle(dirc,'number lines.pickle')
    for key in ('Percent appeared once', 'type-token ratio', 'gini'):
        stat[key] = np.round(stat[key],2)

    # Per-line word counts (mean and median).
    per_line = load_pickle(dirc,'number word in line.pickle')
    stat['mean line'] = per_line['mean']
    stat['median line'] = per_line['median']
    return stat

# Statistics of the two initially selected corpora, used to build the bar traces below.
stat1 = stat_fun(dirc1)
stat2 = stat_fun(dirc2)

#stat
def fun_plt_stat(stat1,stat2,txt):
    """Return a two-bar horizontal trace comparing one statistic of both datasets.

    `txt` is accepted but not used; the bars are labelled with one and two
    spaces and coloured with the module-level `colors`.
    """
    return go.Bar(x=[stat1,stat2],
                  y=[' ','  '],
                  orientation='h',
                  marker_color=colors)

# One comparison trace per statistic; the third argument only names the statistic for the reader.
unq_plt = fun_plt_stat(stat1['shape unique words'],stat2['shape unique words'],'Number Unique Words')
apr_plt = fun_plt_stat(stat1['shape appeared once'],stat2['shape appeared once'],'Number Appeares Once')
prcnt_plt = fun_plt_stat(stat1['Percent appeared once'],stat2['Percent appeared once'],'Percent Appeared Once')
nmbr_plt = fun_plt_stat(stat1['Number Of Words'],stat2['Number Of Words'],'Number Of Words')
typ_tkn_plt = fun_plt_stat(stat1['type-token ratio'],stat2['type-token ratio'],'Type-Token Ratio')
gini_plt = fun_plt_stat(stat1['gini'],stat2['gini'],'Gini')
len_words_plt = fun_plt_stat(stat1['Average len words'],stat2['Average len words'],'Average len words')
nmbr_line_plt = fun_plt_stat(stat1['number lines'],stat2['number lines'],'Number Lines')
mean_line_plt = fun_plt_stat(stat1['mean line'],stat2['mean line'],'Mean Line')
mdian_line_plt = fun_plt_stat(stat1['median line'],stat2['median line'],'Median Line')


#Language recognition in corpus
def prcnt_not_hebrew(df: pd.DataFrame):
    """Return the share of lines whose detected language is not Hebrew.

    `df` holds the language code in column 'index' and the number of lines in
    column 'line'. The frame passed in is left unmodified.
    """
    share = df['line']/df['line'].sum()
    return share[df['index'] != 'he'].sum()
# Share of non-Hebrew lines in each initially selected corpus, as one comparison trace.
temp = load_pickle(dirc1,'Language recognition in corpus.pickle')
a = prcnt_not_hebrew(temp)
temp = load_pickle(dirc2,'Language recognition in corpus.pickle')
b = prcnt_not_hebrew(temp)
lang_plot = fun_plt_stat(a,b,'Language recognition in corpus')



#Zipf law
# Zipf figure: observed frequency of the 20 top-ranked words of each dataset plus a reference curve.
Zipf_plt = go.Figure()
#main
temp = load_pickle(dirc1,'Zipf law.pickle').head(20)
Zipf_plt.add_trace(go.Scatter(
                        x = temp['rank'],
                        y = temp['freq'],
                        name = 'freq ' + dirc1,
                        text = temp['words'],
                        marker_color=colors[0]),
)
#secondary
temp = load_pickle(dirc2,'Zipf law.pickle').head(20)
Zipf_plt.add_trace(go.Scatter(
                        x = temp['rank'],
                        y = temp['freq'],
                        name = 'freq ' + dirc2,
                        text = temp['words'],
                        marker_color=colors[1]),
)

# Reference curve; `temp` still holds the secondary dataset's table at this point.
Zipf_plt.add_trace(go.Scatter(
                        x = temp['rank'],
                        y = temp['Zipf'],
                        name = 'Zipf Law',
                        marker_color='gold'),
)
Zipf_plt.update_layout(showlegend=False)
Zipf_plt = go.FigureWidget(Zipf_plt)    
Zipf_plt.layout.title.text = 'Zipf Law'
Zipf_plt.layout.title.x = 0.5
Zipf_plt.layout.title.y = 0.85

#topic
# Topic table of the main dataset only.
# NOTE(review): the header lists four topics, while topic_model chooses the number of
# topics itself - confirm that the saved table matches this header.
temp = load_pickle(dirc1,'topic model.pickle')
topic_plt = go.Table(header=dict(values=['Topic 1','Topic 2','Topic 3','Topic 4']),
                 cells=dict(values=temp))
topic_plt = go.FigureWidget(topic_plt) 
topic_plt.layout.title.text = 'Topics'
topic_plt.layout.title.x = 0.5
topic_plt.layout.title.y = 0.85



# Empty widget whose batch_update() context the callbacks wrap their updates in.
fig = go.FigureWidget()

def response_data(change):
    """Refresh every figure after a dataset, stop-word or custom-word change.

    `change` is the event object passed by the widget framework and is not
    inspected. The n-gram figure is refreshed first (through `response_ngram`),
    then the statistics grid, the character figures, the topic table, the Zipf
    curves and the custom word-frequency bars are reloaded for the datasets
    currently selected in the two dropdowns.
    """
    hebrew_letters = list('אבגדהוזחטיכךלמםנןסעפףצץקרשת')
    # Order matches the traces added to `fig_stat` (indices 0-9).
    stat_keys = ['shape unique words', 'shape appeared once', 'Percent appeared once',
                 'Number Of Words', 'type-token ratio', 'gini', 'Average len words',
                 'number lines', 'mean line', 'median line']
    bar_colors = ['indianred','lightblue']

    dirc1 = data_widget.value
    dirc2 = data_widget2.value
    word_co = co_widget.value
    new_word = custom_widget.value
    selected = (dirc1, dirc2)

    response_ngram(' ')
    if word_co == 'Select':
        return

    def _non_hebrew_share(dirc):
        # Share of lines detected as a language other than Hebrew.
        return prcnt_not_hebrew(load_pickle(dirc,'Language recognition in corpus.pickle'))

    def _custom_lookup(dirc):
        # Returns (share, absolute count) of the typed word in one corpus; (0, 0) if absent.
        ngram_int = len(new_word.split(' '))
        # Pickles exist for n-gram sizes 1-5; anything longer falls back to the unigram table.
        if ngram_int > 5:
            ngram_int = 1
        table = load_pickle(dirc,'ngram ' + str(ngram_int) + '.pickle')['top unqiue words']
        match = table[table['word'] == new_word]
        if match.shape[0] == 0:
            return 0, 0
        count = match['frequency'].item()
        return float(count/table['frequency'].sum()), count

    with fig.batch_update():
        #stat - reload both datasets so that neither side shows stale values
        stat1 = stat_fun(dirc1)
        stat2 = stat_fun(dirc2)
        for position, key in enumerate(stat_keys):
            fig_stat.data[position].x = [stat1[key], stat2[key]]
        fig_stat.data[10].x = [_non_hebrew_share(dirc1), _non_hebrew_share(dirc2)]

        #Character distribution and number of noise characters
        noise_counts = []
        for position, dirc in enumerate(selected):
            chr_freq = load_pickle(dirc,'Character distribution.pickle')
            table = pd.DataFrame(chr_freq.items(),columns = ['char','freq'])
            is_letter = table['char'].isin(hebrew_letters)
            letters = table[is_letter]
            chr_freq_plt.data[position].x = letters['freq']/letters['freq'].sum()
            chr_freq_plt.data[position].y = letters['char']
            chr_freq_plt.data[position].name = dirc
            # Distinct characters outside the Hebrew letter list, for the selected dataset.
            noise_counts.append(int((~is_letter).sum()))
        unuse_char_plt.data[0].x = noise_counts

        #topic model (main dataset only)
        topic_plt.data[0].cells = dict(values=load_pickle(dirc1,'topic model.pickle'))

        #Zipf law - observed curves only; the reference curve (trace 2) is left as built
        for position, dirc in enumerate(selected):
            zipf = load_pickle(dirc,'Zipf law.pickle').head(20)
            Zipf_plt.data[position].x = zipf['rank']
            Zipf_plt.data[position].y = zipf['freq']
            Zipf_plt.data[position].text = zipf['words']

        #new word custom frequency
        # Reset any highlight left on the n-gram bars by a previous click.
        ngram_plot.data[0].marker.color = bar_colors[0]
        ngram_plot.data[1].marker.color = bar_colors[1]
        share1, count1 = _custom_lookup(dirc1)
        share2, count2 = _custom_lookup(dirc2)
        custom_plot.data[0].x = [share1,share2]
        custom_plot.data[0].y = [dirc1,dirc2]
        custom_plot.data[0].customdata = [count1,count2]
            

        

def response_ngram(change):
    """Reload the top-10 n-gram bars of both datasets for the current slider value.

    `change` is the widget event and is not inspected. The stop-word checkbox
    selects which display folder the n-gram pickles are read from.
    """
    selected = ((0, data_widget.value), (1, data_widget2.value))
    ngram_int = ngram_slider.value

    with fig.batch_update():
        if stop_words_box.value:
            stop_words_path = 'without stopwords'
        else:
            stop_words_path = 'with stopwords'

        # Trace 0 is the main dataset, trace 1 the secondary one.
        for position, dirc in selected:
            pickle_path = main_path + '/corpus/for display' + '/'+ stop_words_path + '/' + dirc + '/ngram ' + str(ngram_int) + '.pickle'
            with open(pickle_path, 'rb') as handle:
                table = pickle.load(handle)['top unqiue words'].sort_values('frequency')
                table['precent'] = table['frequency']/table['frequency'].sum()
                top_ten = table.sort_values('frequency',ascending = False).head(10).sort_values('frequency')
                trace = ngram_plot.data[position]
                trace.x = top_ten['precent']
                trace.y = top_ten['word']
                trace.customdata = top_ten['frequency']

        
        
# Wire the widgets to the callbacks: the slider refreshes only the n-gram figure, every
# other control refreshes all figures.
ngram_slider.observe(response_ngram, names="value")
data_widget.observe(response_data, names="value")
data_widget2.observe(response_data, names="value")
stop_words_box.observe(response_data, names="value")
# NOTE(review): `on_submit` is presumably deprecated in newer ipywidgets releases - confirm.
custom_widget.on_submit(response_data)

#ngram click

def update_point(trace, points, selector):
    """Click handler for the n-gram bars.

    Copies the clicked word into ``custom_widget``, refreshes the custom plot
    through ``response_data`` and recolours the bars so that only the clicked
    bar is highlighted.

    Parameters
    ----------
    trace :
        The clicked bar trace; its ``name`` is parsed as the trace's position
        in ``ngram_plot.data``.
    points :
        Click description; ``points.ys`` holds the clicked bar label(s).
    selector :
        Supplied by plotly; not used.
    """
    colors = ['indianred', 'lightblue']
    if len(points.ys) == 0:
        return

    word = points.ys[0]
    graph = int(trace.name)
    # Materialise the labels as a list so the lookup below works whether
    # the trace stores them as a NumPy array or as a plain tuple.
    lst_word = list(trace.y)

    # update custom plot
    custom_widget.value = word
    response_data(' ')

    # change bar color: every trace goes back to its base colour, then the
    # clicked bar of the clicked trace is highlighted.
    for position, base_color in enumerate(colors):
        bar_trace = ngram_plot.data[position]
        if position == graph:
            bar_colors = [base_color] * len(lst_word)
            if word in lst_word:
                bar_colors[lst_word.index(word)] = 'lightslategray'
        else:
            other_labels = bar_trace.y
            if other_labels is None:
                # nothing drawn on this trace, so nothing to recolour
                continue
            bar_colors = [base_color] * len(other_labels)
        bar_trace.marker.color = bar_colors

# Route clicks on either n-gram bar trace to update_point.
nargm_click1, nargm_click2 = ngram_plot.data[0], ngram_plot.data[1]
for _clickable_trace in (nargm_click1, nargm_click2):
    _clickable_trace.on_click(update_point)



# Grid of summary-statistic bar charts, one subplot per statistic.
title = ['Number Unique Words','Number Appeares Once','Percent Appeared Once','Number Of Words','Type-Token Ratio','Gini','Average len words'
,'Number Lines','Mean Line','Median Line','Language recognition in corpus']
lst = [unq_plt,apr_plt,prcnt_plt,nmbr_plt,typ_tkn_plt,gini_plt,len_words_plt,nmbr_line_plt,mean_line_plt,mdian_line_plt,lang_plot]

stat_cols = 4
# Ceiling division: as many rows as are needed to hold every trace.
stat_rows = max(1, -(-len(lst) // stat_cols))
fig_stat = make_subplots(specs=[[{"type": "bar"}]*stat_cols]*stat_rows,
            rows=stat_rows, cols=stat_cols,
            subplot_titles = title)
# Fill the grid row by row; the cell is derived from the trace's position, so
# the loop visits exactly len(lst) cells and needs no early exit.
for index, stat_trace in enumerate(lst):
    fig_stat.add_trace(stat_trace, row=index // stat_cols + 1, col=index % stat_cols + 1)
fig_stat.update_layout(height=600,width = 1000, showlegend=False)
fig_stat = go.FigureWidget(fig_stat)

# Fixed heights so the n-gram chart and the custom chart line up side by side.
ngram_plot.layout.height = 800
custom_plot.layout.height = 400

# ngram + custom box
custom_plot_container = widgets.VBox(children=(custom_widget, custom_plot))
ngram_container = widgets.VBox(children=(ngram_slider, ngram_plot))
ngram_custom_box = widgets.HBox(children=(ngram_container, custom_plot_container))

# Dataset pickers and the stop-word toggle sit in one row at the top.
top_widget = widgets.HBox(children=(data_widget, data_widget2, stop_words_box))

# Bottom row: character-frequency, unused-character and Zipf figures.
box = widgets.HBox(children=(chr_freq_plt, unuse_char_plt, Zipf_plt))

# Whole dashboard, top to bottom; the bare name renders it as the cell output.
grid = widgets.VBox(children=(top_widget, fig_stat, ngram_custom_box, topic_plt, box))
grid


In [None]:
# Distribution of words: symmetric relative-entropy score for every pair of datasets
import itertools


def lng_distubtion_function(a, b, random_state=None):
    """Symmetric relative entropy between two frequency-share series.

    The longer series is randomly sub-sampled to the length of the shorter one
    (when the lengths are equal, ``b`` is re-drawn, i.e. shuffled), then
    ``sum(rel_entr(a, b)) + sum(rel_entr(b, a))`` is returned.

    Values are paired by position, not by pandas index label, so a series that
    carries a non-default index cannot introduce NaN through index alignment.

    NOTE(review): entries are paired after a random draw, not by word, and the
    sub-sampled shares no longer sum to 1 - confirm this is the intended
    comparison.

    Parameters
    ----------
    a, b : pd.Series
        Relative frequencies.
    random_state : optional
        Forwarded to ``Series.sample``; pass a value to make the score
        reproducible. The default keeps the draw unseeded.
    """
    from scipy.special import rel_entr
    if a.shape[0] > b.shape[0]:
        a = a.sample(b.shape[0], random_state=random_state)
    else:
        b = b.sample(a.shape[0], random_state=random_state)
    shares_a = a.to_numpy()
    shares_b = b.to_numpy()
    return rel_entr(shares_a, shares_b).sum() + rel_entr(shares_b, shares_a).sum()


# One score per unordered dataset pair, keyed by the (first, second) names,
# so the results are kept instead of being computed and thrown away.
lng_distribution_scores = {}
for index in itertools.combinations(lst_data, 2):
    dis_word1 = load_pickle(index[0],'ngram 1.pickle')['top unqiue words']
    dis_word1 = dis_word1['frequency']/dis_word1['frequency'].sum()
    dis_word2 = load_pickle(index[1],'ngram 1.pickle')['top unqiue words']
    dis_word2 = dis_word2['frequency']/dis_word2['frequency'].sum()
    lng_distribution_scores[index] = lng_distubtion_function(dis_word1, dis_word2)

lng_distribution_scores

In [None]:
# Empty dataset-by-dataset table: one row and one column per entry of lst_data.
pd.DataFrame(columns=lst_data, index=lst_data)

# Focused look at a dataset

In [None]:
def over_view(df: pd.DataFrame,col : str, n : int,plot_text : str,plot = 'none'):
    """Count the values of ``df[col]`` and fold the long tail into one row.

    Parameters
    ----------
    df : pd.DataFrame
        Source table.
    col : str
        Column whose values are counted.
    n, plot_text, plot :
        Accepted for backward compatibility; not used.

    Returns
    -------
    pd.DataFrame
        Two columns: ``'index'`` (the value) and ``col`` (its count), most
        frequent first. Only the leading rows whose cumulative share of all
        counts stays below 0.90 are kept; the remainder is summed into a final
        row labelled ``'others'``. Runs of whitespace inside a value are
        collapsed to one space, and a value that is only whitespace is shown
        as ``'לא ידוע'``.
    """
    # value_counts() already orders by descending count. The frame is built
    # explicitly because the column names produced by
    # value_counts().reset_index() differ between pandas versions.
    counts = df[col].value_counts()
    temp = pd.DataFrame({'index': counts.index.to_numpy(), col: counts.to_numpy()})
    temp = temp.replace(r'\s+', ' ', regex=True)
    temp.loc[temp['index'] == ' ','index'] = 'לא ידוע'
    sum_all = temp[col].sum()
    temp = temp[(temp[col]/sum_all).cumsum() < 0.90]
    # DataFrame.append no longer exists in pandas 2.x; concat adds the same row.
    others_row = pd.DataFrame({'index': ['others'], col: [sum_all - temp[col].sum()]})
    return pd.concat([temp, others_row], ignore_index=True)
        
def remove_stop_word(df:pd.Series, stop_words_path: str = r'C:\corpus\heb_stopwords.txt') -> pd.Series:
    """Strip stop words from every text in ``df``.

    Parameters
    ----------
    df : pd.Series
        Texts; each one is split on whitespace.
    stop_words_path : str
        Space-separated, header-less stop-word file. Defaults to the location
        that used to be hard-coded here.

    Returns
    -------
    pd.Series
        The texts with every stop word dropped and the remaining words joined
        by single spaces.
    """
    stop = pd.read_csv(stop_words_path, sep=" ", header=None)
    # `word in <DataFrame>` tests the column labels, not the cell values, so the
    # words are collected into a set (which also makes each lookup O(1)).
    stop_words = {str(word) for word in stop.to_numpy().ravel() if pd.notna(word)}
    return df.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# def sample(df,df_altr):
#     n=5
#     df = df.sample(n)
#     if df_altr.shape[0] > 100:
#         df_altr = df_altr.sample(n)
#     return df, df_altr
    

# Output area, displayed ahead of the focused-view widgets created below.
# NOTE(review): nothing later in this cell writes to `out` - confirm it is still needed.
out = widgets.Output()
display(out)
def focused_eventhandler(change):
    """Draw the overview figure for one dataset.

    Parameters
    ----------
    change : str or None
        Dataset name: ``'ben yehuda'``, ``'think il'``, ``'ParaShoot'``,
        ``'Hebrew-Sentiment-Data'`` or ``'HeBERT Emotion Recognition'``.
        Any other value (including ``None``) draws nothing.

    Side effects
    ------------
    Changes the working directory, assigns the module-level ``main`` or ``df``
    frame for the chosen dataset, and shows a plotly figure.

    The signature is left as a single parameter on purpose: the caller hands
    this function to ``widgets.interactive``.
    """
    # NOTE(review): this and the other C:\ paths below are Windows locations,
    # while the notebook's `main_path` points at a Colab drive mount - confirm
    # where these files are expected to live.
    os.chdir(r'C:\corpus\for display')
    global df, main
    if (change == 'ben yehuda'):
        main = pd.read_csv(r'over_view/ben yehuda.csv')

        fig = make_subplots(
            rows=2, cols=2,
            specs=[[{"type": "bar"}, {"type": "bar"}],
                    [{"type": "pie"}, {"type": "pie"}]],
            subplot_titles = ('authors','translators','Distribution of genres','Distribution of translated languages')
            )

        temp = over_view(main,'authors',10,' ','bar')
        fig.add_trace(go.Bar(y=temp['index'],
                                 x=temp['authors'],
                                orientation='h'),
                          row=1, col=1)

        # only rows that actually name a translator are counted
        temp = over_view(main[main['translators'].notnull()],'translators',10,'Translators','bar')
        fig.add_trace(go.Bar(y=temp['index'],
                                 x=temp['translators'],
                                orientation='h'),
                          row=1, col=2)

        temp = over_view(main,'genre',10,'Distribution of genres','pie')
        fig.add_trace(go.Pie(values=temp['genre'],
                            labels=temp['index']),
                        row=2, col=1)

        temp = over_view(main,'original_language',10,'Distribution of translated languages','pie')
        fig.add_trace(go.Pie(values=temp['original_language'],
                            labels=temp['index']),
                        row=2, col=2)

        fig.update_layout(height=700, showlegend=False)

        fig.show()
    elif (change == 'think il'):
        main = pd.read_csv(r'over_view/think il.csv')
        temp = over_view(main,'author',10,'Distribution of authors','bar')
        fig = go.Figure(
            data=[go.Bar(x=temp['author'], y=temp['index'], orientation='h')],
            layout=go.Layout(
                title=go.layout.Title(text="Distribution of authors")
            )
        )
        fig.show()

    elif (change == 'ParaShoot'):
        # raw string: same path as before, without invalid "\c", "\P", "\d" escapes
        df = pd.read_csv(r'C:\corpus\ParaShoot\df.csv',sep = '\t')
        temp = df[['title']].value_counts().to_frame().reset_index()
        temp.columns = ['title','freq']
        fig = go.Figure(
            data=[go.Bar(y=temp['title'], x=temp['freq'], orientation='h')],
            layout=go.Layout(
                title=go.layout.Title(text="Distribution of article")
            )
        )
        fig.show()

    elif (change == 'Hebrew-Sentiment-Data'):
        # raw string: same path as before, without invalid "\c", "\H", "\d" escapes
        df = pd.read_csv(r'C:\corpus\Hebrew-Sentiment-Data\dev.tsv',sep = '\t')
        df = df['label'].value_counts().to_frame().reset_index()
        df.columns = ['lable','count']
        # NOTE(review): the slice names are positional - they are matched to the
        # counts in descending-frequency order, not to the values in 'lable'.
        # Confirm that order really is positive, negative, neutral.
        fig = go.Figure(go.Pie(values=df['count'],
                                    labels=['Positive','negative','neutral']))
        fig.show()

    elif (change == 'HeBERT Emotion Recognition'):
        df = pd.read_csv(r'C:\corpus\HeBERT Emotion Recognition\test_insample_tagged_raw_to_publish.csv')
        df = df.rename(columns = {'talkbacks':'line'})

        fig = make_subplots(
                            rows=1, cols=3,
                            specs=[[{"type": "pie"}, {"type": "pie"}, {"type": "pie"}]],
                    subplot_titles = ('Emotion Distribution','Sentiment Distribution','Source of response')
                    )
        # One pie per column, left to right. Labels and sizes are taken straight
        # from the value_counts() result, so this does not depend on the column
        # names that reset_index() produces in a given pandas version.
        for pie_col, column in enumerate(('emotion_en', 'value', 'source'), start=1):
            counts = df[column].value_counts()
            fig.add_trace(go.Pie(values=counts.to_numpy(),
                                 labels=counts.index.to_numpy()),
                          row=1, col=pie_col)

        fig.update_layout(height=700, showlegend=False)
        fig.show()


def main_load_dataset(dirc: list):
    """Write the CSV extracts under ``over_view/`` for two datasets.

    Creates the ``over_view`` folder relative to the current working
    directory, then saves the selected Ben Yehuda catalogue columns and the
    Think IL ``author`` column into it.

    Parameters
    ----------
    dirc : list
        Not used.
    """
    create_folder('over_view')

    # ben yehuda: keep only the catalogue columns listed here
    ben_yehuda_columns = ['authors','genre','original_language','translators']
    ben_yehuda = pd.read_csv(r'C:\corpus\ben yehuda\pseudocatalogue.csv')
    ben_yehuda[ben_yehuda_columns].to_csv(r'over_view/ben yehuda.csv')

    # think il: normalise the column names, then keep the author column
    think_il = pd.read_csv(r'C:\corpus\think il\df think IL token.tsv',sep = '\t')
    think_il = think_il.rename(columns={'text':'line','auther':'author'})
    think_il['author'].to_csv(r'over_view/think il.csv')

from plotly.subplots import make_subplots
# Dataset picker for the focused view; starts with nothing selected.
focused_widget = widgets.Dropdown(options = ['ben yehuda','think il','ParaShoot','Hebrew-Sentiment-Data','HeBERT Emotion Recognition'],
                                  value = None,
                                  description='Chose data')
# widgets.interactive() already observes the widgets passed as arguments and
# re-runs the handler whenever their value changes, so no extra observer is
# registered here - a second one would make every selection load the data and
# draw the figure twice.
focused_view = widgets.interactive(focused_eventhandler, change=focused_widget)
display(focused_view)