In [33]:
import re
import random
import pandas as pd
import hashlib

In [34]:
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()
    
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)
# Use re.DOTALL to allow . to match newline characters
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)
cards = [(match.group(1), match.group(2)) for cstring in all_cards if (match := re.search(card_pattern, cstring))]

# removing the cards that have no content and trimming
cards = [(subject, stripped_content) for subject, content in cards if len(stripped_content := content.strip()) > 5]

def hash_string_md5(s):
    """
    Hashes a string using MD5 and returns a truncated hash for efficiency.

    Parameters:
    - s (str): The input string to hash.

    Returns:
    - str: The truncated hexadecimal hash string.
    """
    if pd.isnull(s):
        return None  # Handle NaN values gracefully
    return hashlib.md5(s.encode('utf-8')).hexdigest()  # Truncate to first 12 characters


existing_cards = pd.DataFrame(cards, columns=['head', 'body'])


# existing_cards['age'] = [random.random() for _ in existing_cards.index]
existing_cards['hash'] = existing_cards['body'].apply(hash_string_md5)
existing_cards

Unnamed: 0,head,body,hash
0,General,Automatic VM guest patching: This refers to th...,a1fe67b1c263c4bb2a93f691cd50f2a6
1,Diffusers from Hugging Face,"Device Management: In torch, models and data a...",5ca009995498e7262b4ab82055bd9ab6
2,sklearn,"When the cv argument is an integer, cross_val_...",ef8c26a032a9ced1d247ce1296e8fd91
3,MS Identity Platform,Integrated Windows Authentication has been rep...,9922618093b39a61b5ef1af3bf16c6f7
4,RAG,Look up relevant information\nThis is a node i...,c0716255f34803ba3545082187754f2d
...,...,...,...
376,Azure OpenAI,Assistant \nCustom AI that uses Azure OpenAI m...,200c2e63ca07fa3b2a862ce2b0dfd85f
377,MS Identity Platform,URIs for certain apps:\n○ Apps that use embedd...,af75a6029c7b258a77de1b930aed60fc
378,RAG,One prevalent challenge when implementing lang...,40001516b0643b57f49ff5baf453b4b1
379,Azure,I figured out how to export the sign-in logs! ...,74732f5906f5d4d4760b1f970ab6978a


# Completely Random Shuffle

In [35]:
# random.shuffle(cards)

# with open('certcards2.txt', mode='w', encoding='utf8') as f:
#     i = 1
#     for subject, content in cards:
#         f.write('\n'*5)
#         f.write(str(i)+'\n')  
#         f.write(subject+'\n')
#         f.write(content)
#         i+=1

# print(f"{i-1} cards found")

# Card Age Shuffle

In [None]:
card_ages = pd.read_json('card_ages.json')
# found_cards = pd.DataFrame(cards, columns=['head', 'body'])
# found_cards['hash'] = found_cards['body'].apply(hash_string_md5)

cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    left_on='hash', right_on='hash',
    how='left'
)



cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.1
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

cards_to_age = cards_to_age.sort_values('age', ascending=False)
cards_to_age.drop_duplicates(subset=['hash'], keep='first')
cards_to_age.to_json('card_ages.json', indent=2)


with open('certcards2.txt', mode='w', encoding='utf8') as f:
    cards_to_age.reset_index(drop=True)
    i = 1
    for _, row in cards_to_age.iterrows():
        print(i)
        f.write('\n'*5)
        f.write(str(i)+'\n')  
        f.write(row['head']+'\n')
        f.write(row['body'])
        i+=1
        print(F"{row['head']}: {row['age']:.4f}")

# cards_to_age

1
OAuth2.0: 4.9370
2
MS Identity Platform: 4.6359
3
Dataverse Queries: 4.5981
4
Power BI: 4.5392
5
Conditional Access: 4.5386
6
Developer Mode: 4.5310
7
Python: 4.5219
8
Diffusers Library: 4.4560
9
sklearn: 4.4225
10
HuggingFace diffusers: 4.3269
11
HuggingFace diffusers: 4.3266
12
Azure Functions: 4.3050
13
Python: 4.2819
14
numpy: 4.2571
15
sklearn : 4.2201
16
Diffusers Documentation: 4.2154
17
OData: 4.2108
18
OData Requests: 4.2085
19
Diffusers Library: 4.2005
20
Kaggle: 4.1683
21
OData: 4.1624
22
Azure Functions Quickstart: 4.1593
23
OData: 4.1401
24
Python: 4.1117
25
Maths: 4.0989
26
OData: 4.0940
27
MS Data Analyst: 4.0938
28
Python: 4.0429
29
Diffusers Library: 4.0386
30
Azure VDI Project: 3.9971
31
Power BI: 3.9945
32
Conditional Access: 3.9928
33
DNS: 3.9884
34
Azure Storage: 3.9845
35
sklearn: 3.9777
36
Azure OpenAI: 3.9499
37
Power BI: 3.9483
38
sklearn: 3.9365
39
OData Requests: 3.9308
40
Microsoft Fabric Administration: 3.9282
41
Python: 3.9225
42
Power BI: 3.9165
43
MS I

Unnamed: 0,head,body,hash,age
372,OAuth2.0,The spa redirect type is backward-compatible w...,69fbc425d5885cde3d5246de2408254a,4.937028
3,MS Identity Platform,Integrated Windows Authentication has been rep...,9922618093b39a61b5ef1af3bf16c6f7,4.635913
175,Dataverse Queries,Filter Query\nDefinition: A part of the OData ...,e1a4ee17440036cabd5a7c81470d76ce,4.598052
262,Power BI,To define a column in Power BI (DAX) that take...,c50873f880f0ab19026687c2b3dda9c8,4.539151
338,Conditional Access,We need Microsoft Entra P1 or P2 licenses for ...,316b73e1880905271087dbfff8c46f23,4.538630
...,...,...,...,...
222,Kali Linux,Hydra is a popular password-cracking tool in K...,5f02d3fa69b0322224237963ff54e626,0.270973
156,Diffusers from Hugging Face,Tokenizer (CLIPTokenizer): Provided by transfo...,bc4522b67d6baa044af56334d5e45d73,0.184079
36,Diffusers from Hugging Face,Scheduler (UniPCMultistepScheduler): A diffuse...,714e60b34b72275b331900cb0bb06083,0.145733
356,Kali Linux,apt (Advanced Package Tool) is the package man...,42875da42f51438ac261700421f0b85b,0.139570
