In [140]:
from IPython.display import display, Markdown, Latex
import subprocess
import re
import os
import pandas as pd

#In this variable is markdown from all jupyternotebook
global_markdown={}

def save_markdown_files(text: str, dir):
    for key, markdown in text.items():
        file_name = key + '.md'
        file = open(dir + file_name, 'w')
        
        file.write(markdown)
        file.close()

def add_extra_backslash_to_escapes_codes(t):
    ''' Automatically add extra \ before python escape characters before saving to markdown'''
  
    t=t.replace('\r', '\\r')
    t=t.replace('\b', '\\b')
    t=t.replace('\t', '\\t')
    t=t.replace('\t', '\\t')
    # t=t.replace('\n', '\\\\n')
    t=t.replace('\a', '\\a')
    t=t.replace('\f', '\\f')
    return t


def print_stats(t):
    print()
    total=count_markdown_words(t)['pure_total_words']
    
    print('Pure total words: ', total)
    print('Total time to read [minutes] : ', round(total/130, 2) )
    

    print("headline_words",  count_markdown_words(t)['headline_words'])
    print("subheadline_words", count_markdown_words(t)['subheadline_words'])
    print("table_words", count_markdown_words(t)['table_words'])
    print("figures_words", count_markdown_words(t)['figures_words'])

def count_words(words):
    count = 0
    words=add_extra_backslash_to_escapes_codes(words)
    
    #words=words
    for word in words.split():
        if not re.match("\\\\", word):
            count += 1
    return count
    
def count_markdown_words(text: str) ->dict:
    # Split the text into lines
    lines = text.split("\n")

    # Initialize variables to keep track of the number of words in headlines and subheadlines
    headline_words = 0
    subheadline_words = 0

    # Initialize a variable to keep track of the total number of words in tables
    table_words = 0

    # Initialize a variable to keep track of the total number of words in the text
    total_words = 0
    figures_words = 0
    dontskiplines=True

    # Loop through each line in the text
    for line in lines:
        # Check if the line is a headline
        #regex=re.findall(r"[^\\]\w+", line)
        #regex=count_words(line)
      
        if line.startswith("\begin{equation}"):
            dontskiplines=False
            
        if line.startswith("\end{equation}"):
            dontskiplines=True
          
        if dontskiplines:
            if line.startswith("# "):
                # Split the line into words and add the number of words to the headline_words variable
                headline_words += count_words(line)
            # Check if the line is a subheadline
            elif line.startswith("## "):
                # Split the line into words and add the number of words to the subheadline_words variable
                subheadline_words += count_words(line)
            # Check if the line is part of a table
            elif line.startswith("|"):
                # Split the line into words and add the number of words to the table_words variable
                table_words += count_words(line)
            elif line.startswith("!["):
                # Split the line into words and add the number of words to the figures_caption var
                figures_words += count_words(line)
            # If the line is not a headline, subheadline, or part of a table, add the number of words to the total_words variabl   
        
            else:
                total_words += count_words(line) #len(re.findall(r"\w+", line))

    # Return the total number of words in headlines, subheadlines, and tables, as well as the total number of words
    return {
        # total words are without tables, headlines, ans subheadlines
        "pure_total_words": total_words  ,
        "headline_words": headline_words - 1,
        "subheadline_words": subheadline_words - 1,
        "table_words": table_words,
        "figures_words": figures_words
    }


def count_markdown_words_dict(markdown: dict) -> dict:

    word_counts = {
        'pure_total_words': 0,
        'headline_words': 0,
        'subheadline_words': 0,
        'table_words': 0,
        'figures_words': 0
    }
    
    for key, value in markdown.items():
        counts = count_markdown_words(value)
        for key, value in counts.items():
            word_counts[key] += value
            
    return word_counts


def split_markdown_into_dict(text: str) -> dict:
    
    ''' You give markdown as a str and it returns markdown in dict'''
    
    # split the text into lines
    lines = text.split("\n")
    
    # initialize a dictionary to store the paragraphs and sub-paragraphs
    result = {}
    
    # initialize the current key and value
    current_key = None
    current_value = []
    
    # loop over the lines
    for line in lines:
        # if the line starts with "#", it indicates the start of a new paragraph
        # or sub-paragraph. update the current key and value accordingly.
        if line.startswith("#"):
            if current_key is not None:
                result[current_key] = "\n".join(current_value)
            current_key = line
            current_value = []
        
        # if the line does not start with "#", it is part of the current paragraph
        # or sub-paragraph. append it to the current value.
        else:
            current_value.append(line)
    
    # add the final paragraph or sub-paragraph to the result
    result[current_key] = "\n".join(current_value)
    
    return result

def split_markdown_into_dict_level_one(markdown: dict) -> dict:
    ''' Order dict that all the text from headline level 1 
    will be in each key '''

    first_level = {}

    for key, value in markdown.items():
        if key.startswith("# "):
            first_level[key] = value
            current_key = key
        else:
            first_level[current_key]= first_level[current_key]+ "\n" + key + "\n" + value 

    return first_level



def run_subprocess(command, pdftex):
    subprocess.call(command, cwd=pdftex, stdout=open(os.devnull, 'wb'))
    
def add_wordcount_to_markdown(markdown: dict) -> str:
    
    ''' Takes markdown and insert word count for every paragraph and subparagraph '''
    
    # initialize the result string
    result = ""
    
    # loop over the keys and values in the dictionary
    for title, text in markdown.items():
        # count the number of words in the text
        words = count_markdown_words(text)['pure_total_words']
        
        
        time_to_read_str= str(words) + " words, around " + str(round(words/130, 2)) + " minutes of reading "
        time_on_right=""
        if words > 0:
            time_on_right='''
\\begin{flushright}
        %s
\end{flushright}
    ''' % time_to_read_str
        
        # add the title, word count, and estimated reading time to the result
        result += title + "\n"
        result += str(time_on_right)
        result += str(text) + "\n"
    
    return result

    
# Main function:
    
def saveText(markdown: str, headline: str):
    # set directory to save md files
    dir = 'latex/md/'
    # create one if it's missing
    if not os.path.exists(dir):
        os.makedirs(dir)
    
    
    markdown = add_extra_backslash_to_escapes_codes(markdown)
    
    #markdown_from_cell = { headline: markdown }
    markdown = split_markdown_into_dict(markdown)
    
    global global_markdown 
    global_markdown = global_markdown | markdown

   # markdown = split_markdown_into_dict(markdown)
    text = {headline: add_wordcount_to_markdown(markdown)}
    
    save_markdown_files(text, dir)
    file = open(dir + 'global_markdown.md', 'w')    
    # Count total words by 
    file.write(
                add_wordcount_to_markdown(
                split_markdown_into_dict_level_one(
                global_markdown
                                                  )
                                         )
               )
    
    
    print_stats(t)

    
    #run_subprocess(command, pdftex))


# Test 1

In [141]:

t="""
# Hello World

This is an example Markdown text.

## Subheadline 

This is a subheadline.

| Column 1 | Column 2 |
|----------|----------|
| Cell 1   | Cell 2   |
| Cell 3   | Cell 4   |

![ figure](Here are some figures)

"""

saveText(t, "helloworld")


Pure total words:  10
Total time to read [minutes] :  0.08
headline_words 2
subheadline_words 1
table_words 22
figures_words 5


In [114]:
print(global_markdown)

{'# Hello World': '\nThis is an example Markdown text.\n', '## Subheadline ': '\nThis is a subheadline.\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Cell 1   | Cell 2   |\n| Cell 3   | Cell 4   |\n\n![ figure](Here are some figures)\n\n'}


In [115]:
# Test 2 

In [132]:
t=""" 

# This one is testing escaping sequences:

## asdf

\big 
\tag
\newline
\\newline
\equation
\text
\autocite

\begin{equation}
E(w,b) = \frac{1}{n}\sum_{i=1}^{n} L(y_i, f(x_i)) + \alpha R(w)
\end{equation}

"""
saveText(t, 'test2')


Pure total words:  1
Total time to read [minutes] :  0.01
headline_words 7
subheadline_words 2
table_words 0
figures_words 0


In [160]:
def dict_markdown_to_str(global_markdown: dict) -> str:
    markdown_string = ""
    for key, value in global_markdown.items():
        markdown_string += key + value
    return markdown_string

In [153]:
convert_to_markdown(global_markdown)

'# Hello World\nThis is an example Markdown text.\n## Subheadline \nThis is a subheadline.\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Cell 1   | Cell 2   |\n| Cell 3   | Cell 4   |\n\n![ figure](Here are some figures)\n\n'

In [117]:
def count_words(words):
    count = 0
    words=add_extra_backslash_to_escapes_codes(words)
    
    #words=words
    for word in words.split():
        if not re.match("\\\\", word):
            count += 1
    return count

In [145]:
line="""  
# This one is testing escaping sequences:

\big 
\tag
\newline
\\newline
\equation
\text
\autocite


\begin{equation}
E(w,b) = \frac{1}{n}\sum_{i=1}^{n} L(y_i, f(x_i)) + \alpha R(w)
\end{equation} """

count_words(line)


# Output: [ 'two', one']

14

In [119]:
# Add wordcount by each # level 1 headline

In [120]:
markdown=split_markdown_into_dict_level_one(global_markdown)

In [121]:
print(add_wordcount_to_markdown(markdown))

# Hello World

\begin{flushright}
        10 words, around 0.08 minutes of reading 
\end{flushright}
    
This is an example Markdown text.

## Subheadline 

This is a subheadline.

| Column 1 | Column 2 |
|----------|----------|
| Cell 1   | Cell 2   |
| Cell 3   | Cell 4   |

![ figure](Here are some figures)


# This one is testing escaping sequences:

\begin{flushright}
        7 words, around 0.05 minutes of reading 
\end{flushright}
    
\big 
\tag

ewline
\newline
\equation
\text
\autocite

\begin{equation}
E(w,b) = \frac{1}{n}\sum_{i=1}^{n} L(y_i, f(x_i)) + \alpha R(w)
\end{equation}





In [147]:
global_markdown

{'# Hello World': '\nThis is an example Markdown text.\n',
 '## Subheadline ': '\nThis is a subheadline.\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Cell 1   | Cell 2   |\n| Cell 3   | Cell 4   |\n\n![ figure](Here are some figures)\n\n'}

In [146]:
count_markdown_words_dict(global_markdown)

{'pure_total_words': 10,
 'headline_words': -2,
 'subheadline_words': -2,
 'table_words': 22,
 'figures_words': 5}

In [123]:
# Test 3

In [124]:
t=""" 
# Hello World asd

This paragraph have exactly 6 words

## Subheadline 

This is a subheadline with the table that we do not count so
this paragraph have 18 words

| Column 1 | Column 2 |
|----------|----------|
| Cell 1   | Cell 2   |
| Cell 3   | Cell 4   |

## subhedline 2

6 words only in this headline

# Headline 2

here is another text with the picture included that should not be counted 15 words

![ alt text ]( )

"""

saveText(t, "hello_world2")


Pure total words:  45
Total time to read [minutes] :  0.35
headline_words 7
subheadline_words 5
table_words 22
figures_words 5


In [125]:
count_markdown_words(t)

{'pure_total_words': 45,
 'headline_words': 7,
 'subheadline_words': 5,
 'table_words': 22,
 'figures_words': 5}

In [16]:
markdown=split_markdown_into_dict(t)

In [17]:
markdown

{'# Hello World asd': '\nThis paragraph have exactly 6 words\n',
 '## Subheadline ': '\nThis is a subheadline with the table that we do not count so\nthis paragraph have 18 words\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Cell 1   | Cell 2   |\n| Cell 3   | Cell 4   |\n',
 '## subhedline 2': '\n6 words only in this headline\n',
 '# Headline 2': '\nhere is another text with the picture included that should not be counted 15 words\n\n![ alt text ]( )\n\n'}

In [18]:
for key, values in global_markdown.items():
    print(values)


This is an example Markdown text.


This is a subheadline with the table that we do not count so
this paragraph have 18 words

| Column 1 | Column 2 |
|----------|----------|
| Cell 1   | Cell 2   |
| Cell 3   | Cell 4   |


\big 
\tag

ewline
\newline
\equation
\text
\autocite

\begin{equation}
E(w,b) = \frac{1}{n}\sum_{i=1}^{n} L(y_i, f(x_i)) + \alpha R(w)
\end{equation}



This paragraph have exactly 6 words


6 words only in this headline


here is another text with the picture included that should not be counted 15 words

![ alt text ]( )




In [19]:
#ADD WORD COUNT FOR EACH PARAGRAPH

print(add_wordcount_to_markdown(global_markdown))

# Hello World

\begin{flushright}
        6 words, around 0.05 minutes of reading 
\end{flushright}
    
This is an example Markdown text.

## Subheadline 

\begin{flushright}
        18 words, around 0.14 minutes of reading 
\end{flushright}
    
This is a subheadline with the table that we do not count so
this paragraph have 18 words

| Column 1 | Column 2 |
|----------|----------|
| Cell 1   | Cell 2   |
| Cell 3   | Cell 4   |

# This one is testing escaping sequences:

\begin{flushright}
        28 words, around 0.22 minutes of reading 
\end{flushright}
    
\big 
\tag

ewline
\newline
\equation
\text
\autocite

\begin{equation}
E(w,b) = \frac{1}{n}\sum_{i=1}^{n} L(y_i, f(x_i)) + \alpha R(w)
\end{equation}


# Hello World asd

\begin{flushright}
        6 words, around 0.05 minutes of reading 
\end{flushright}
    
This paragraph have exactly 6 words

## subhedline 2

\begin{flushright}
        6 words, around 0.05 minutes of reading 
\end{flushright}
    
6 words only in this hea