In [14]:
import re
import random
import pandas as pd
import hashlib

In [15]:
# Read the whole card document into memory in one go.
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()

# A card boundary is three newlines followed by a line holding only digits
# (the card number); splitting on it leaves one chunk of text per card.
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)

# Group 1 is the text up to the first newline that can be followed by a
# non-empty remainder (the heading); group 2 is everything after it (the body).
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)

# Keep only chunks that parse as heading + body, trim the body, and discard
# cards whose trimmed body is 5 characters or shorter.
cards = []
for chunk in all_cards:
    found = card_pattern.search(chunk)
    if found is None:
        continue
    heading = found.group(1)
    trimmed_body = found.group(2).strip()
    if len(trimmed_body) > 5:
        cards.append((heading, trimmed_body))

def hash_string_md5(s):
    """
    Hashes a string using MD5 and returns a truncated hash for efficiency.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str: The truncated hexadecimal hash string.
    """
    if pd.isnull(s):
        return None  # Handle NaN values gracefully
    return hashlib.md5(s.encode('utf-8')).hexdigest()  # Truncate to first 12 characters


def remake_card_document(existing_cards: pd.DataFrame, filename: str='certcards2.txt'):
    """
    Write every card in `existing_cards` to `filename`, in row order.

    Each card is written as five newlines, the card's 1-based position on its
    own line, the card's 'head' value on the next line, and then its 'body'.
    The running card number is also printed to stdout as each card is
    formatted.

    The complete document is assembled in memory before the file is opened.
    Opening with mode 'w' truncates the file immediately, so formatting the
    rows first guarantees that a bad row (a non-string 'head' or 'body', or a
    missing column) raises while the existing file is still intact.

    Parameters:
    - existing_cards (pd.DataFrame): Cards to write; rows must provide string
      'head' and 'body' values.
    - filename (str): Path of the file to overwrite.

    Returns:
    - None

    Raises:
    - KeyError: If a row has no 'head' or 'body' entry.
    - TypeError: If a 'head' or 'body' value is not a string.
    """
    sections = []
    for number, (_, row) in enumerate(existing_cards.iterrows(), start=1):
        print(number)
        sections.append('\n'*5 + str(number) + '\n' + row['head'] + '\n' + row['body'])
    document = ''.join(sections)

    # Only touch the file once every card has been formatted successfully.
    with open(filename, mode='w', encoding='utf8') as f:
        f.write(document)


# Tabulate the parsed (head, body) pairs and add an MD5 fingerprint of each
# body as a 'hash' column.
existing_cards = (
    pd.DataFrame(cards, columns=['head', 'body'])
    .assign(hash=lambda frame: frame['body'].apply(hash_string_md5))
)
existing_cards

Unnamed: 0,head,body,hash
0,Power BI,API Request Configuration in Power BI: When in...,daaba655c66451f525495d600524a4e7
1,sklearn,The k-means algorithm divides a set of N sampl...,a8317c40061ece734990d7b60a96683e
2,MS Data Analyst,At report design time in Microsoft Power BI De...,4818085ea2a0800c7cbb1f92fe3e55d6
3,Diffusers Library,There are three main components of the library...,f53be0b3e5aa87711e1e0d298df56461
4,Azure AI Search,Scalar Quantization: Scalar Quantization reduc...,fdd381f92069b85aa00c179ec7dbf3d5
...,...,...,...
377,Azure Kubernetes Learning Path,We use the docker build command to build Docke...,5020dc832013d992a587a48328a8af7f
378,Azure Kubernetes Learning Path,A single image can have multiple tags assigned...,def5c4b3e13a222dd1a600d23b18d739
379,Azure Kubernetes Learning Path,The Docker software automatically configures a...,91566b39245a8c703c0531ac23d3baf9
380,Azure Kubernetes Learning Path,You can remove an image from the local docker ...,cbb0e85d81e0481135b30b265960a141


# Completely Random Shuffle

In [16]:
# rows, cols = existing_cards.shape

# existing_cards = existing_cards.sample(frac=1)
# remake_card_document(existing_cards, filename='certcards2.txt')

# # print(f"{i-1} cards found")

# Age Shuffle

In [17]:
# card_ages = pd.read_json('card_ages.json')
# # found_cards = pd.DataFrame(cards, columns=['head', 'body'])
# # found_cards['hash'] = found_cards['body'].apply(hash_string_md5)

# cards_to_age = pd.merge(
#     left=existing_cards,
#     right=card_ages[['hash', 'age']],
#     left_on='hash', right_on='hash',
#     how='left'
# )



# cards_to_age['age'] = cards_to_age['age'].fillna(0)
# cards_to_age['age'] = cards_to_age['age'] * 1.1
# cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

# cards_to_age = cards_to_age.sort_values('age', ascending=False)
# cards_to_age = cards_to_age.drop_duplicates(subset=['hash'], keep='first')
# cards_to_age.to_json('card_ages.json', indent=2)


# with open('certcards2.txt', mode='w', encoding='utf8') as f:
#     i = 1
#     for _, row in cards_to_age.iterrows():
#         print(i)
#         f.write('\n'*5)
#         f.write(str(i)+'\n')  
#         f.write(row['head']+'\n')
#         f.write(row['body'])
#         i+=1
#         print(F"{row['head']}: {row['age']:.4f}")


# Headers with fewest notes first

In [18]:
# Count the cards under each heading; printed with the rarest headings first.
frequency = existing_cards['head'].value_counts(ascending=True)
print(frequency)

# Look each card's heading count up with map() instead of pd.merge. Assigning
# the column overwrites it when this cell is re-run, whereas a repeated merge
# would produce frequency_x/frequency_y and make the sort below raise KeyError.
existing_cards = (
    existing_cards
    .assign(frequency=existing_cards['head'].map(frequency))
    .sort_values('frequency', ascending=True)
)

# Rewrite the deck so cards under the least-populated headings come first.
remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

Power Apps                         1
Tax stuff                          1
DAX Studio                         1
Child Services AWS Review          1
MS Dataverse                       1
Terraform                          1
Windows                            1
Azure VM Image Builder             1
Power BI Desktop                   2
GitHub codespaces                  2
Power Platform administration      2
Site Scraping                      2
OAuth2.0                           3
Power Platform                     3
RAG                                3
Maths                              3
HuggingFace diffusers              3
numpy                              4
Developer Mode                     4
MS Data Analyst                    4
Kaggle                             4
Azure Functions Quickstart         5
OData Requests                     5
Azure OpenAI                       6
Dataverse                          6
Diffusers Documentation            6
UHero Requests                     6
A

In [19]:
# Display the current contents of existing_cards.
existing_cards

Unnamed: 0,head,body,hash,frequency
250,Azure VM Image Builder,Azure Resource Providers are essentially servi...,550568a4f64d549466b8a21610b07580,1
211,Power Apps,"Power Platform services are built on Azure, Mi...",d90f4516a480b9ddf9d11b3eebe24f3c,1
155,Terraform,"The graph output uses the DOT language, which ...",d473df111c543108aa3d0c12a4d332a1,1
70,Tax stuff,"""TA-2"" refers to the Transient Accommodations ...",7367393b770c9b331cfdefe3c30f01c5,1
63,Windows,You can increase or decrease the size of icons...,3a0af754d5909a32b6633c9407900dad,1
...,...,...,...,...
177,Python,Sessions are used to persist parameters across...,b0159957e08e2a7a5dfa8507183ef3dc,39
23,Python,The first bit of information that you can gath...,7da14bed0fb71ea27640010a678ff008,39
179,Python,When you try to create an instance of a class ...,4b35a67a5e246065cd3e4e94a3433734,39
35,Python,Python sequences have three common characteris...,a33c75ddca92427de4659e0be21f3a45,39
