In [1]:
import re
import random
import pandas as pd
import hashlib

In [None]:
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()
    
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)
# Use re.DOTALL to allow . to match newline characters
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)
cards = [(match.group(1), match.group(2)) for cstring in all_cards if (match := re.search(card_pattern, cstring))]

# removing the cards that have no content and trimming
cards = [(subject, stripped_content) for subject, content in cards if len(stripped_content := content.strip()) > 5]

def hash_string_md5(s):
    """
    Hashes a string using MD5 and returns a truncated hash for efficiency.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str: The truncated hexadecimal hash string.
    """
    if pd.isnull(s):
        return None  # Handle NaN values gracefully
    return hashlib.md5(s.encode('utf-8')).hexdigest()  # Truncate to first 12 characters


def remake_card_document(existing_cards: pd.DataFrame, filename: str='certcards2.txt'):
    with open(filename, mode='w', encoding='utf8') as f:
        i = 1
        for _, row in existing_cards.iterrows():
            print(i)
            f.write('\n'*6)
            f.write(str(i)+'\n')  
            f.write(row['head']+'\n')
            f.write(row['body'])
            i+=1
            # print(F"{row['head']}: {row['age']:.4f}")


existing_cards = pd.DataFrame(cards, columns=['head', 'body'])


# existing_cards['age'] = [random.random() for _ in existing_cards.index]
existing_cards['hash'] = existing_cards['body'].apply(hash_string_md5)
existing_cards

card_ages = pd.read_json('card_ages.json')
# found_cards = pd.DataFrame(cards, columns=['head', 'body'])
# found_cards['hash'] = found_cards['body'].apply(hash_string_md5)

cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    left_on='hash', right_on='hash',
    how='left'
)

cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.1
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

cards_to_age = cards_to_age.sort_values('age', ascending=False)
cards_to_age.drop_duplicates(subset=['hash'], keep='first')
cards_to_age.to_json('card_ages.json', indent=2)

existing_cards = cards_to_age

In [3]:
existing_cards

Unnamed: 0,head,body,hash,age
4,General,Hotpatching: A feature that allows for install...,7680fe9397828db0362832720191b16b,15.485871
5,Power BI,Transforming API Data into Usable Tables: API ...,ba8e3f6648e157e77bf5317f376e3052,15.457587
2,Azure AI Search,Binary Quantization: Binary Quantization conve...,fba7139fd331ebbf61b0c7edd9ded0be,15.388807
0,Site Scraping,I had to use the os.path abspath function to g...,15bc37738621b4881dfed6c5ea5135b1,15.321921
3,Python,The first bit of information that you can gath...,7da14bed0fb71ea27640010a678ff008,15.136930
...,...,...,...,...
358,Search Engine Optimization,Link equity refers to the value or authority p...,ce2ca16db51c1a8517bb9479cb67c9ff,0.089593
361,Search Engine Optimization,A meta description is an HTML attribute that p...,baf9e83e655c74696d5cb936f13888d5,0.076090
356,Search Engine Optimization,Google Search Console is a free service that h...,1667ce091c210a0f94e3c4e544335be2,0.067037
347,Search Engine Optimization,A backlink is an incoming hyperlink from one w...,fb4dd01097043043ba9450188b8830f9,0.041661


# Completely Random Shuffle

In [4]:
# rows, cols = existing_cards.shape

# existing_cards = existing_cards.sample(frac=1)
# remake_card_document(filename='certcards2.txt', existing_cards = existing_cards)



# Age Shuffle

In [5]:
# remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

# Headers with fewest notes first

In [6]:
frequency = existing_cards['head'].value_counts(ascending=True)
print(frequency)

existing_cards = pd.merge(
    left=existing_cards,
    right=frequency.rename('frequency'),
    left_on='head', right_index=True,
    how='left'
)
existing_cards.sort_values(['frequency', 'head'], ascending=True, inplace=True)

remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

Site Scraping                      2
Power BI Desktop                   2
Power Platform administration      2
MS Data Analyst                    3
Power Platform                     3
RAG                                3
Maths                              3
OAuth2.0                           3
Azure Functions Quickstart         4
Kaggle                             4
numpy                              4
Developer Mode                     4
OData Requests                     5
Azure OpenAI                       5
Azure Storage                      5
UHero Requests                     5
Diffusers Documentation            6
Dataverse                          6
OData                              6
Git                                6
Workera.ai                         6
Azure VDI Project                  6
Conditional Access                 7
Dataverse Plugins                  7
Dataverse Queries                  7
AKS                                8
General                            8
p

In [7]:
existing_cards['head'].value_counts()

Python                            35
Diffusers Library                 31
Search Engine Optimization        29
Azure Kubernetes Learning Path    19
Power BI                          14
MS Identity Platform              13
sklearn                           12
DNS                               12
Azure AI Search                   11
Kali Linux                        11
sklearn                           11
PP365                             10
Diffusers from Hugging Face       10
Azure Functions                   10
Azure Functions                    9
pandas                             8
General                            8
AKS                                8
Conditional Access                 7
Dataverse Queries                  7
Dataverse Plugins                  7
Workera.ai                         6
Git                                6
Diffusers Documentation            6
Dataverse                          6
Azure VDI Project                  6
OData                              6
A