In [8]:
import re
import random
import pandas as pd
import hashlib

In [9]:
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()
    
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)
# Use re.DOTALL to allow . to match newline characters
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)
cards = [(match.group(1), match.group(2)) for cstring in all_cards if (match := re.search(card_pattern, cstring))]

# removing the cards that have no content and trimming
cards = [(subject, stripped_content) for subject, content in cards if len(stripped_content := content.strip()) > 5]

def hash_string_md5(s):
    """
    Hashes a string using MD5 and returns a truncated hash for efficiency.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str: The truncated hexadecimal hash string.
    """
    if pd.isnull(s):
        return None  # Handle NaN values gracefully
    return hashlib.md5(s.encode('utf-8')).hexdigest()  # Truncate to first 12 characters


def remake_card_document(existing_cards: pd.DataFrame, filename: str='certcards2.txt'):
    with open(filename, mode='w', encoding='utf8') as f:
        i = 1
        for _, row in existing_cards.iterrows():
            print(i)
            f.write('\n'*6)
            f.write(str(i)+'\n')  
            f.write(row['head']+'\n')
            f.write(row['body'])
            i+=1
            # print(F"{row['head']}: {row['age']:.4f}")


existing_cards = pd.DataFrame(cards, columns=['head', 'body'])


# existing_cards['age'] = [random.random() for _ in existing_cards.index]
existing_cards['hash'] = existing_cards['body'].apply(hash_string_md5)
existing_cards

card_ages = pd.read_json('card_ages.json')
# found_cards = pd.DataFrame(cards, columns=['head', 'body'])
# found_cards['hash'] = found_cards['body'].apply(hash_string_md5)

cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    left_on='hash', right_on='hash',
    how='left'
)

cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.1
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

cards_to_age = cards_to_age.sort_values('age', ascending=False)
cards_to_age.drop_duplicates(subset=['hash'], keep='first')
cards_to_age.to_json('card_ages.json', indent=2)

existing_cards = cards_to_age

In [15]:
existing_cards.head(20)

Unnamed: 0,head,body,hash,age,frequency
0,Power BI Desktop,"Storage mode: With storage mode, you can now s...",c932079940e7ddbd2d7efd59d0d80a9b,12.329861,2
1,Power BI Desktop,The Dual storage mode is a performance optimiz...,553f54e5508c3acca602c0c13b5c0764,11.182125,2
2,Power Platform administration,You use the Microsoft 365 admin center to crea...,a410743b2503502074099ec64d42df94,14.712707,2
3,Power Platform administration,When you create a user and assign a license in...,2b727e0910a39a004deac65ef7073e47,11.838533,2
4,Site Scraping,I had to use the os.path abspath function to g...,15bc37738621b4881dfed6c5ea5135b1,17.151208,2
5,Site Scraping,Use Selenium to navigate to desirable data - t...,c4cf03455db0320a1a29385027d9895e,11.129518,2
6,MS Data Analyst,At report design time in Microsoft Power BI De...,4818085ea2a0800c7cbb1f92fe3e55d6,16.048021,3
7,MS Data Analyst,To create a mobile-optimized version of your r...,4d8fd602d8bfa654c315ac269f52d86a,15.195872,3
8,MS Data Analyst,"To learn how to run a network port test, see A...",22998b26acb3875cbf06fea095224d99,5.198237,3
9,Maths,Compositional data refers to vectors of positi...,2e743191dcafd6d466b6b1fc07e02796,16.308553,3


# Completely Random Shuffle

In [11]:
# rows, cols = existing_cards.shape

# existing_cards = existing_cards.sample(frac=1)
# remake_card_document(filename='certcards2.txt', existing_cards = existing_cards)



# Age Shuffle

In [12]:
# remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

# Headers with fewest notes first

In [13]:
frequency = existing_cards['head'].value_counts(ascending=True)
print(frequency)

existing_cards = pd.merge(
    left=existing_cards,
    right=frequency.rename('frequency'),
    left_on='head', right_index=True,
    how='left'
)
existing_cards.sort_values(['frequency', 'head'], ascending=True, inplace=True)

remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

Power BI Desktop                   2
Site Scraping                      2
Power Platform administration      2
OAuth2.0                           3
MS Data Analyst                    3
Maths                              3
Power Platform                     3
RAG                                3
Azure Functions Quickstart         4
numpy                              4
Developer Mode                     4
Kaggle                             4
Azure Storage                      5
OData Requests                     5
Azure OpenAI                       5
UHero Requests                     5
Dataverse                          6
Workera.ai                         6
Azure VDI Project                  6
Diffusers Documentation            6
OData                              6
Git                                6
Conditional Access                 7
Dataverse Queries                  7
Dataverse Plugins                  7
General                            8
pandas                             8
A

In [14]:
existing_cards['head'].value_counts()

Python                            35
Diffusers Library                 31
Search Engine Optimization        29
Azure Kubernetes Learning Path    19
Power BI                          14
MS Identity Platform              13
sklearn                           12
DNS                               12
Azure AI Search                   11
Kali Linux                        11
sklearn                           11
PP365                             10
Diffusers from Hugging Face       10
Azure Functions                   10
Azure Functions                    9
pandas                             8
General                            8
AKS                                8
Conditional Access                 7
Dataverse Queries                  7
Dataverse Plugins                  7
Workera.ai                         6
Git                                6
Diffusers Documentation            6
Dataverse                          6
Azure VDI Project                  6
OData                              6
A