In [11]:
import re
import random
import pandas as pd
import hashlib

In [12]:
# Read the whole card deck from disk as a single string.
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()

# Cards are delimited by three newlines, a run of digits, and one more newline.
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)

# Group 1: the shortest leading run of characters that is followed by a newline.
# Group 2: everything after that newline.
# re.DOTALL lets `.` in group 1 match newline characters as well.
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)

cards = []
for cstring in all_cards:
    match = card_pattern.search(cstring)
    if match is None:
        # Chunk does not contain a header/body pair at all.
        continue
    subject, content = match.group(1), match.group(2)
    stripped_content = content.strip()
    # Keep only cards whose trimmed body is longer than 5 characters.
    if len(stripped_content) > 5:
        cards.append((subject, stripped_content))

def hash_string_md5(s):
    """
    Return the MD5 digest of a string as a hexadecimal string.

    The string is encoded as UTF-8 before hashing. The full 32-character
    hex digest is returned; nothing is truncated.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str | None: The hexadecimal MD5 digest, or None when `s` is null
      according to `pd.isnull` (for example None or NaN).
    """
    # Null values have no digest; pass them through as None.
    if pd.isnull(s):
        return None

    digest = hashlib.md5()
    digest.update(s.encode('utf-8'))
    return digest.hexdigest()


def remake_card_document(existing_cards: pd.DataFrame, filename: str='certcards2.txt'):
    """
    Overwrite the card file with the cards in `existing_cards`, in row order.

    Cards are numbered from 1. Each card is written as six newlines, the card
    number on its own line, the 'head' value on its own line, and then the
    'body' value (no newline is appended after the body).

    The complete document is assembled in memory before the file is opened.
    Opening in 'w' mode truncates the file, so building the text first means
    an error while formatting a row leaves an existing file untouched.

    Parameters:
    - existing_cards (pd.DataFrame): Frame with string columns 'head' and 'body'.
    - filename (str): Path of the file to overwrite.

    Raises:
    - KeyError: If the 'head' or 'body' column is missing.
    - TypeError: If a 'head' or 'body' value is not a string.
    """
    pieces = []
    card_rows = zip(existing_cards['head'], existing_cards['body'])
    for number, (head, body) in enumerate(card_rows, start=1):
        pieces.append('\n' * 6 + str(number) + '\n' + head + '\n' + body)
    document = ''.join(pieces)

    # Only touch the file once the full text is known to be valid.
    with open(filename, mode='w', encoding='utf8') as f:
        f.write(document)


# One row per parsed card, keyed by the MD5 hex digest of its body.
existing_cards = pd.DataFrame(cards, columns=['head', 'body'])
existing_cards['hash'] = existing_cards['body'].apply(hash_string_md5)

# Ages saved by the previous run; only the 'hash' and 'age' columns are used.
card_ages = pd.read_json('card_ages.json')

# Left join: every card currently in the deck is kept, and cards with no
# stored age come through with NaN in 'age'.
cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    on='hash',
    how='left'
)

cards_to_age['head'] = cards_to_age['head'].str.strip()

# Cards without a stored age start at 0; every card is then scaled by 1.05
# and bumped by a random amount in [0.0, 1.0).
cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.05
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

# drop_duplicates returns a new frame, so the result has to be assigned.
# Keeping one row per hash stops repeated hashes in card_ages.json from
# multiplying rows in the merge above on the next run.
cards_to_age = (
    cards_to_age
    .drop_duplicates(subset=['hash'], keep='first')
    .reset_index(drop=True)
)
cards_to_age.to_json('card_ages.json', indent=2)

existing_cards = cards_to_age

In [13]:
# Preview the first 20 rows of the aged card table.
existing_cards.head(n=20)

Unnamed: 0,head,body,hash,age
0,MS Data Analyst,To create a mobile-optimized version of your r...,4d8fd602d8bfa654c315ac269f52d86a,29.975348
1,MS Data Analyst,"To learn how to run a network port test, see A...",22998b26acb3875cbf06fea095224d99,12.125328
2,OAuth2.0,Access tokens are short lived. Refresh them af...,4f6d811c33c9f4c0a5649a91ec857826,32.021287
3,OAuth2.0,Use the auth code flow paired with Proof Key f...,846f1057be1d337dd6100431f86d5514,24.414151
4,OAuth2.0,To learn who the user is before redeeming an a...,2c5d7b6815993b5298c2f0839f1a0ab6,22.358803
5,Power Platform,UI Component: View\nViews define how a list of...,28bc71b2be4ea10648297b445ba49b56,29.042883
6,Power Platform,UI Component: Custom page (preview)\n\tA canva...,6c7c93d8ebef2cda39d6918d72d18fe1,28.596746
7,Power Platform,Types of Logic Components:\n Business process...,23482d3e7b7c9454ffbf4365ed47fbbc,28.111373
8,RAG,One prevalent challenge when implementing lang...,40001516b0643b57f49ff5baf453b4b1,28.03978
9,RAG,You can use Azure AI Studio to build a custom ...,c3315222d20603b0d72145e7cebfc596,27.834755


# Completely Random Shuffle

In [14]:
# rows, cols = existing_cards.shape

# existing_cards = existing_cards.sample(frac=1)
# remake_card_document(filename='certcards2.txt', existing_cards = existing_cards)



# Age Shuffle

In [15]:
# existing_cards = existing_cards.sort_values('age', ascending=False)
# remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

In [16]:
# Show only the header and age of the first 10 cards.
existing_cards.loc[:, ['head', 'age']].head(n=10)

Unnamed: 0,head,age
0,MS Data Analyst,29.975348
1,MS Data Analyst,12.125328
2,OAuth2.0,32.021287
3,OAuth2.0,24.414151
4,OAuth2.0,22.358803
5,Power Platform,29.042883
6,Power Platform,28.596746
7,Power Platform,28.111373
8,RAG,28.03978
9,RAG,27.834755


# Headers with fewest notes first

In [17]:
# Number of cards under each header, rarest headers first.
frequency = existing_cards['head'].value_counts(ascending=True)
print(frequency)

# Attach each card's header count by mapping 'head' through the counts.
# Assigning the column overwrites any 'frequency' column already present,
# so this cell can be re-run; merging the counts in again would instead
# create suffixed duplicate columns and break the sort below.
existing_cards = (
    existing_cards
    .assign(frequency=existing_cards['head'].map(frequency))
    .sort_values(['frequency', 'head'], ascending=True)
)

remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

MS Data Analyst                    2
OAuth2.0                           3
Power Platform                     3
RAG                                3
numpy                              3
Azure Functions Quickstart         4
Developer Mode                     4
Kaggle                             4
OData Requests                     5
Azure Storage                      5
Azure OpenAI                       6
Azure VDI Project                  6
Dataverse                          6
Diffusers Documentation            6
Git                                6
Workera.ai                         7
General                            7
OData                              7
Conditional Access                 7
Dataverse Plugins                  7
AKS                                8
Dataverse Queries                  8
pandas                             8
Diffusers from Hugging Face       10
Jane Eyre                         10
PP365                             10
Kali Linux                        11
A

In [18]:
# Cards per header, most common headers first.
existing_cards['head'].value_counts(ascending=False)

DNS                               37
Python                            34
Azure Kubernetes Learning Path    34
Diffusers Library                 30
Search Engine Optimization        29
sklearn                           21
Azure Functions                   19
MS Identity Platform              13
Power BI                          12
Kali Linux                        11
Azure AI Search                   11
PP365                             10
Jane Eyre                         10
Diffusers from Hugging Face       10
pandas                             8
Dataverse Queries                  8
AKS                                8
Workera.ai                         7
OData                              7
Dataverse Plugins                  7
Conditional Access                 7
General                            7
Git                                6
Diffusers Documentation            6
Dataverse                          6
Azure VDI Project                  6
Azure OpenAI                       6
O

# Focus on one header

In [19]:
# heads = existing_cards['head'].value_counts()
# heads = heads[heads > 5].index.tolist()
# one_header = random.sample(heads, 1)[0]
# existing_cards['pick_head'] = existing_cards['head'].apply(
#     lambda h: 0 if h == one_header else 1 
# )

# remake_card_document(existing_cards=existing_cards.sort_values('pick_head'))

In [20]:
# NOTE(review): standalone scratch arithmetic. It uses no names from the cells
# above, and the meaning of these constants is not documented in this notebook
# — TODO confirm what it computes or remove the cell.
833e3/54.33/100


153.32228971102523