In [1]:
import re
import random
import pandas as pd
import hashlib

In [2]:
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()

# Cards are separated by three newlines followed by a line holding only the card number.
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)

# Within a chunk: everything up to the first line break that follows at least
# one character is the header, the remainder is the body (re.DOTALL lets `.` span newlines).
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)

# Keep only chunks that parse and whose trimmed body is longer than 5 characters.
cards = []
for cstring in all_cards:
    match = card_pattern.search(cstring)
    if match is None:
        continue
    stripped_content = match.group(2).strip()
    if len(stripped_content) > 5:
        cards.append((match.group(1), stripped_content))

def hash_string_md5(s):
    """
    Hashes a string using MD5 and returns a truncated hash for efficiency.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str: The truncated hexadecimal hash string.
    """
    if pd.isnull(s):
        return None  # Handle NaN values gracefully
    return hashlib.md5(s.encode('utf-8')).hexdigest()  # Truncate to first 12 characters


def remake_card_document(existing_cards: pd.DataFrame, filename: str='certcards2.txt'):
    """
    Writes the cards to `filename`, numbering them 1..N in the frame's row order.

    Each card is written as six newlines, the card number on its own line,
    the header on its own line, then the body.

    The document is first written to a temporary sibling file and then swapped
    into place, so a failure part-way through (for example a non-string header)
    leaves the existing document untouched instead of truncated.

    Parameters:
    - existing_cards (pd.DataFrame): Must contain string columns 'head' and 'body'.
    - filename (str): Path of the card document to (re)write.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    # Resolve the columns up front: a missing column fails before any file is touched.
    rows = zip(existing_cards['head'], existing_cards['body'])
    tmp_filename = f'{filename}.tmp'
    try:
        with open(tmp_filename, mode='w', encoding='utf8') as f:
            for i, (head, body) in enumerate(rows, start=1):
                f.write('\n'*6)
                f.write(str(i)+'\n')
                f.write(head+'\n')
                f.write(body)
        # Atomic swap: readers see either the old document or the complete new one.
        os.replace(tmp_filename, filename)
    except BaseException:
        # Leave no partial temp file behind; the original document is unchanged.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise


existing_cards = pd.DataFrame(cards, columns=['head', 'body'])

# The MD5 of the body is the key that links a card to its stored age across runs.
existing_cards['hash'] = existing_cards['body'].apply(hash_string_md5)

# Ages saved by the previous run. Keep one row per hash so the left merge below
# cannot multiply cards when the stored file already contains duplicate hashes.
card_ages = pd.read_json('card_ages.json').drop_duplicates(subset=['hash'], keep='first')

cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    left_on='hash', right_on='hash',
    how='left'
)

cards_to_age['head'] = cards_to_age['head'].str.strip()

# Cards without a stored age start at 0; every card then ages by 5% plus a
# random increment in [0, 1).
cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.05
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

# cards_to_age = cards_to_age.sort_values('age', ascending=False)
# drop_duplicates returns a new frame, so the result must be assigned back;
# the index is rebuilt so the saved JSON keys stay contiguous.
cards_to_age = cards_to_age.drop_duplicates(subset=['hash'], keep='first').reset_index(drop=True)
cards_to_age.to_json('card_ages.json', indent=2)

existing_cards = cards_to_age

In [3]:
# Preview the first 20 cards together with their hash and updated age.
existing_cards.head(20)

Unnamed: 0,head,body,hash,age
0,Diffusers Library,Gaussian Process\nA probabilistic model for de...,12a87b029b5bb8ac8951e6cf3ffd144d,31.50623
1,DNS,• DNS Rebinding:\nA technique used by attacker...,e42095222ce6146ad127a28cd3470e65,30.753693
2,Python,The first bit of information that you can gath...,7da14bed0fb71ea27640010a678ff008,30.850676
3,Diffusers Documentation,"Sometimes, old environment paths can linger in...",b7cd2d381a70974dd5c4efb35540e8b5,30.835544
4,Diffusers Library,Bayesian Inference\nThe process of updating pr...,efdb373cb8627f97e3f95fb18eddb4a6,30.319806
5,Diffusers Documentation,"On Windows, pip relies on a small launcher exe...",72502694afdcd59ba18edb412613112c,30.498373
6,OAuth2.0,Access tokens are short lived. Refresh them af...,4f6d811c33c9f4c0a5649a91ec857826,30.261094
7,MS Identity Platform,The redirect URIs to use in a desktop applicat...,eed9f81cb9d69a3dce61a716e3fa3cf9,30.704449
8,Dataverse,Confidential Client:\n\nDefinition: An applica...,caf1d454763ac691ca4a2493885c9659,30.982542
9,Python,The purpose of the .__init__() method in a Pyt...,dfeadf2dc34f5c52473326752587fcd4,30.68009


# Completely Random Shuffle

In [4]:
# rows, cols = existing_cards.shape

# existing_cards = existing_cards.sample(frac=1)
# remake_card_document(filename='certcards2.txt', existing_cards = existing_cards)



# Age Shuffle

In [5]:
# existing_cards = existing_cards.sort_values('age', ascending=False)
# remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

In [6]:
# Spot-check the header and age of the first ten cards in the current order.
existing_cards[['head', 'age']].head(n = 10)

Unnamed: 0,head,age
0,Diffusers Library,31.50623
1,DNS,30.753693
2,Python,30.850676
3,Diffusers Documentation,30.835544
4,Diffusers Library,30.319806
5,Diffusers Documentation,30.498373
6,OAuth2.0,30.261094
7,MS Identity Platform,30.704449
8,Dataverse,30.982542
9,Python,30.68009


# Headers with fewest notes first

In [7]:
# Count cards per header; ascending so the rarest headers are listed first.
frequency = existing_cards['head'].value_counts(ascending=True)
print(frequency)

# Attach each card's header count. Assigning the column (instead of merging it in)
# keeps this cell re-runnable: a second merge would create frequency_x / frequency_y
# and the sort below would fail with a KeyError.
existing_cards = (
    existing_cards
    .assign(frequency=existing_cards['head'].map(frequency))
    .sort_values(['frequency', 'head'], ascending=True)
)

# Rewrite the deck so that headers with the fewest cards come first.
remake_card_document(filename='certcards2.txt', existing_cards=existing_cards)

MS Data Analyst                    2
OAuth2.0                           3
numpy                              3
RAG                                3
Power Platform                     3
Developer Mode                     4
Azure Functions Quickstart         4
Kaggle                             4
OData Requests                     5
Azure Storage                      5
Azure OpenAI                       6
Git                                6
Azure VDI Project                  6
Dataverse                          6
Diffusers Documentation            6
Workera.ai                         7
General                            7
Dataverse Plugins                  7
Conditional Access                 7
OData                              7
Dataverse Queries                  8
pandas                             8
AKS                                8
Diffusers from Hugging Face       10
Jane Eyre                         10
PP365                             10
Azure AI Search                   11
K

In [8]:
# Cards per header, most frequent first (value_counts sorts descending by default).
existing_cards['head'].value_counts()

Python                            34
Azure Kubernetes Learning Path    34
Diffusers Library                 30
Search Engine Optimization        29
sklearn                           21
Azure Functions                   19
MS Identity Platform              13
Power BI                          12
DNS                               12
Kali Linux                        11
Azure AI Search                   11
PP365                             10
Jane Eyre                         10
Diffusers from Hugging Face       10
pandas                             8
Dataverse Queries                  8
AKS                                8
Workera.ai                         7
OData                              7
Dataverse Plugins                  7
Conditional Access                 7
General                            7
Git                                6
Diffusers Documentation            6
Dataverse                          6
Azure VDI Project                  6
Azure OpenAI                       6
O

# Focus on one header

In [9]:
# heads = existing_cards['head'].value_counts()
# heads = heads[heads > 5].index.tolist()
# one_header = random.sample(heads, 1)[0]
# existing_cards['pick_head'] = existing_cards['head'].apply(
#     lambda h: 0 if h == one_header else 1 
# )

# remake_card_document(existing_cards=existing_cards.sort_values('pick_head'))

In [10]:
# NOTE(review): stand-alone scratch arithmetic; the meaning of these numbers is not
# shown anywhere in the visible notebook and nothing here uses the result —
# confirm whether this cell can be removed or needs named constants.
833e3/54.33/100


153.32228971102523