In [2]:
import re
import random
import pandas as pd
import hashlib

In [3]:
with open('certcards2.txt', mode='r', encoding='utf8') as f:
    all_cards = f.read()

# Cards are delimited by three newlines followed by a line holding the card number.
card_split_pattern = r'\n\n\n\d+\n'
all_cards = re.split(card_split_pattern, all_cards)

# Group 1 is the (lazily matched) heading before a newline; group 2 is everything
# after it. re.DOTALL lets `.` match newlines as well.
card_pattern = re.compile(r'(.+?)\n([\s\S]+)', re.DOTALL)
cards = []
for cstring in all_cards:
    match = card_pattern.search(cstring)
    if match:
        cards.append((match.group(1), match.group(2)))

# Trim each body and keep only cards whose trimmed body is longer than 5 characters.
trimmed_cards = ((subject, content.strip()) for subject, content in cards)
cards = [(subject, body) for subject, body in trimmed_cards if len(body) > 5]

def hash_string_md5(s):
    """
    Return the MD5 hex digest of a string.

    The full 32-character digest is returned; the value is NOT truncated.

    Parameters:
    - s (str): The input string to hash. Missing values (None/NaN) are accepted.

    Returns:
    - str or None: The 32-character hexadecimal MD5 digest of the UTF-8 encoded
      string, or None when `s` is a missing value.
    """
    if pd.isnull(s):
        return None  # Pass missing values through instead of failing on .encode()
    encoded = s.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()


# One row per parsed card: `head` holds the heading, `body` the trimmed content.
existing_cards = pd.DataFrame(cards, columns=['head', 'body'])

# existing_cards['age'] = [random.random() for _ in existing_cards.index]
# Fingerprint every card by the MD5 of its body.
body_hashes = existing_cards['body'].apply(hash_string_md5)
existing_cards = existing_cards.assign(hash=body_hashes)
existing_cards

Unnamed: 0,head,body,hash
0,HuggingFace diffusers,safetensors is supported by the Hugging Face t...,5be88034bc059e296084cff4e3548785
1,Azure Functions,A trigger is an object that defines a specific...,6b69f6222e4884627bbe26cd3b565b09
2,Python,To make a request to GitHub’s authenticated us...,204d3082b40e52d21a4a9d4f425ec38c
3,numpy,"numpy.where(condition, [x, y, ]/)\n\nFirst of ...",113c2d770f43445509725df0d78b7244
4,sklearn,LeavePGroupsOut: This cross-validation techniq...,770330bed7601963d7b9c3ce686879a5
...,...,...,...
366,Kali Linux,Hydra is a popular password-cracking tool in K...,5f02d3fa69b0322224237963ff54e626
367,Diffusers from Hugging Face,Tokenizer (CLIPTokenizer): Provided by transfo...,bc4522b67d6baa044af56334d5e45d73
368,Diffusers from Hugging Face,Scheduler (UniPCMultistepScheduler): A diffuse...,714e60b34b72275b331900cb0bb06083
369,Kali Linux,apt (Advanced Package Tool) is the package man...,42875da42f51438ac261700421f0b85b


# Completely Random Shuffle

In [4]:
# Disabled cell (every statement is commented out, so running it does nothing).
# If re-enabled it would shuffle `cards` uniformly at random and overwrite
# certcards2.txt, writing five newlines, a 1-based card number, the subject and
# the content for each card, then print how many cards were written.
# random.shuffle(cards)

# with open('certcards2.txt', mode='w', encoding='utf8') as f:
#     i = 1
#     for subject, content in cards:
#         f.write('\n'*5)
#         f.write(str(i)+'\n')  
#         f.write(subject+'\n')
#         f.write(content)
#         i+=1

# print(f"{i-1} cards found")

# Card Age Shuffle

In [5]:
# Ages persisted by the previous run; only the hash -> age mapping is used below.
card_ages = pd.read_json('card_ages.json')

# Left join so that cards with no stored age are kept (their age comes back as NaN).
cards_to_age = pd.merge(
    left=existing_cards,
    right=card_ages[['hash', 'age']],
    on='hash',
    how='left'
)

# Cards without a stored age start at 0; every age is then scaled by 1.1 and
# given a random increment in [0, 1) so the ordering varies between runs.
cards_to_age['age'] = cards_to_age['age'].fillna(0)
cards_to_age['age'] = cards_to_age['age'] * 1.1
cards_to_age['age'] = cards_to_age['age'] + [random.random() for _ in cards_to_age.index]

# sort_values / drop_duplicates / reset_index all return NEW frames, so the
# results must be assigned. Without the de-duplication, repeated hashes are
# saved to card_ages.json and multiply rows in the next run's merge.
# After the descending sort, keep='first' retains the highest age per hash.
cards_to_age = (
    cards_to_age
    .sort_values('age', ascending=False)
    .drop_duplicates(subset=['hash'], keep='first')
    .reset_index(drop=True)
)

# Build the whole deck in memory before touching any file: opening
# certcards2.txt in 'w' mode truncates it, so a failure while formatting a row
# must happen before that point or the deck would be lost.
card_chunks = []
i = 1
for _, row in cards_to_age.iterrows():
    print(i)
    card_chunks.append('\n'*5 + str(i) + '\n' + row['head'] + '\n' + row['body'])
    i += 1
    print(F"{row['head']}: {row['age']:.4f}")

cards_to_age.to_json('card_ages.json', indent=2)

with open('certcards2.txt', mode='w', encoding='utf8') as f:
    f.write(''.join(card_chunks))

# cards_to_age

1
OData: 5.5503
2
Azure Functions: 5.4718
3
OData: 5.4375
4
OData: 5.4249
5
Diffusers Library: 5.4074
6
Python: 5.4057
7
Power BI: 5.3223
8
numpy: 5.3205
9
Python: 5.3144
10
OData Requests: 5.3078
11
Microsoft Fabric Administration: 5.3064
12
Power BI: 5.2903
13
OData Requests: 5.2779
14
Diffusers Library: 5.2678
15
HuggingFace diffusers: 5.2546
16
Diffusers Library: 5.2139
17
sklearn: 5.1922
18
Azure AI Search: 5.1765
19
numpy: 5.0726
20
Conditional Access: 5.0515
21
UHero Requests: 5.0449
22
MS Data Analyst: 5.0205
23
Maths: 5.0148
24
DNS: 4.9854
25
Python: 4.9808
26
Dataverse Queries: 4.9575
27
sklearn : 4.9539
28
Conditional Access: 4.9461
29
Python: 4.9339
30
Kaggle: 4.9238
31
Conditional Access: 4.9104
32
sklearn : 4.9022
33
Azure Functions Quickstart: 4.8979
34
pandas: 4.8883
35
General: 4.8835
36
OData: 4.8702
37
pandas: 4.8453
38
sklearn: 4.8400
39
Azure VDI Project: 4.8187
40
PP365: 4.7928
41
Diffusers Library: 4.7840
42
General: 4.7646
43
DNS: 4.7589
44
Python: 4.7483
45
Azu