# Phase 1


# Dataset Cleaning

In [3]:
import pandas as pd

# Path of the raw password dataset read by this notebook.
DATA_PATH = "/content/dataset.csv"

# Load the CSV into the working frame and print its column dtypes / non-null counts.
df = pd.read_csv(DATA_PATH)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12673 entries, 0 to 12672
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   password        12673 non-null  object 
 1   strength        12673 non-null  float64
 2   length          12673 non-null  int64  
 3   class_strength  12673 non-null  object 
 4   entropy         12673 non-null  float64
 5   crack_time_sec  12673 non-null  float64
 6   crack_time      12672 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 693.2+ KB


In [14]:
# Fix the misspelled class_strength labels in one pass.
LABEL_FIXES = {"Very week": "Very weak", "Week": "Weak"}
df["class_strength"] = df["class_strength"].replace(LABEL_FIXES)

# Keep only the first row seen for each password.
df = df.drop_duplicates(subset="password")

# Drop rows whose crack_time_sec is extreme (>= 1e+20), then rows with a
# missing crack_time label.
below_extreme_cutoff = df["crack_time_sec"] < 1e+20
df = df[below_extreme_cutoff].dropna(subset=["crack_time"])

print("unique class_strength values:", df["class_strength"].unique())
print("dataset shape:", df.shape)

unique class_strength values: ['Very weak' 'Weak' 'Average']
dataset shape: (12649, 7)


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,strength,length,entropy,crack_time_sec
count,12650.0,12650.0,12650.0,12650.0
mean,0.256778,8.267747,25.7699,78238070000.0
std,0.136144,2.541429,11.384591,8795149000000.0
min,0.0,4.0,8.0,1.28e-07
25%,0.154795,6.0,15.509775,2.3328e-05
50%,0.249543,8.0,24.0,0.008388608
75%,0.365153,10.0,33.219281,5.0
max,0.597691,19.0,80.710623,989209800000000.0


In [6]:
# Print dtypes and non-null counts of the working frame to spot remaining nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12650 entries, 0 to 12672
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   password        12650 non-null  object 
 1   strength        12650 non-null  float64
 2   length          12650 non-null  int64  
 3   class_strength  12650 non-null  object 
 4   entropy         12650 non-null  float64
 5   crack_time_sec  12650 non-null  float64
 6   crack_time      12649 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 790.6+ KB


# Validation Schema

In [15]:
!pip install pandera

import pandera as pa
from pandera import Column, DataFrameSchema, Check

# Declarative pandera schema: one Column entry per expected dataset column,
# pairing the expected dtype with a value check where one is defined.
schema = pa.DataFrameSchema({
    "password": Column(str, nullable=False),
    "strength": Column(float, Check.in_range(0.0, 1.0)),
    "length": Column(int, Check.in_range(4, 63)),
    # Allowed labels use the corrected spellings ("weak", not "week").
    "class_strength": Column(str, Check.isin([
        "Very weak", "Weak", "Average", "Strong", "Very strong"
    ])),
    "entropy": Column(float, Check.in_range(8.0, 400.0)),
    "crack_time_sec": Column(float, Check.gt(0)),
    "crack_time": Column(str, nullable=False)
})

# Calling the schema object validates df and returns the validated frame.
# NOTE(review): presumably this raises on the first failing check - confirm against the pandera docs.
validated_df = schema(df)
print("validation done")
print(schema)


validation done
<Schema DataFrameSchema(
    columns={
        'password': <Schema Column(name=password, type=DataType(str))>
        'strength': <Schema Column(name=strength, type=DataType(float64))>
        'length': <Schema Column(name=length, type=DataType(int64))>
        'class_strength': <Schema Column(name=class_strength, type=DataType(str))>
        'entropy': <Schema Column(name=entropy, type=DataType(float64))>
        'crack_time_sec': <Schema Column(name=crack_time_sec, type=DataType(float64))>
        'crack_time': <Schema Column(name=crack_time, type=DataType(str))>
    },
    checks=[],
    parsers=[],
    coerce=False,
    dtype=None,
    index=None,
    strict=False,
    name=None,
    ordered=False,
    unique_column_names=False,
    metadata=None, 
    add_missing_columns=False
)>


# Lazy Validation with Error Display

In [16]:
# Validate with lazy=True and, if pandera raises SchemaErrors, render the
# failure_cases table instead of letting the exception propagate.
# Nothing is shown when validation succeeds.
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)



# Phase 2


GDPR & CCPA

In [19]:
!pip install pycryptodome

import pandas as pd
import hashlib
from Crypto.Cipher import AES
import os
import logging

Collecting pycryptodome
  Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.22.0


In [20]:
#(GDPR/CCPA)
def anonymize_data(data):
    """Return the SHA-256 hex digest of ``data``.

    Args:
        data: Text to hash; it is encoded with ``str.encode()`` (UTF-8).

    Returns:
        The 64-character lowercase hexadecimal digest.

    NOTE(review): the hash is unsalted and unkeyed, so equal inputs always
    give equal digests - confirm this is acceptable as "anonymization".
    """
    hasher = hashlib.sha256()
    hasher.update(data.encode())
    return hasher.hexdigest()

# Hash every password and store the digests in a new column.
anonymized_hashes = df['password'].apply(anonymize_data)
df['anonymized_password'] = anonymized_hashes

# Preview the plaintext next to its digest.
df[['password', 'anonymized_password']].head()


Unnamed: 0,password,anonymized_password
0,bybee,fd3f357c92663c5a5c8a5b02cf6e74c20ccf7be5699560...
1,n3m0,9d6ee841cb232ef73a84db7a1c7f5c1311c2e61396f3e1...
2,2509,fd98aa9d9cc36596d20ab1947351b2f2312de7edbd2d2e...
3,4622,e7110f8dba36e4f13f369bc30feb3ad528a30a04943602...
4,shrk,cdd3ca8db5a12893249c025564e8809a1696d40bef374c...


Pseudonymization of Data


In [21]:
# pseudonymization of the password column
def pseudonymize_data(data, key=None):
    """Derive a stable pseudonym for ``data``.

    With no ``key`` this returns the plain SHA-256 hex digest of ``data``,
    the same value ``anonymize_data`` produces. Because such a digest can be
    recomputed by anyone from a candidate password list, a secret ``key`` may
    be supplied: the pseudonym is then an HMAC-SHA256 of ``data`` under that
    key and can only be re-derived by a holder of the key.

    Args:
        data: Text to pseudonymize; encoded with ``str.encode()`` (UTF-8).
        key: Optional secret as ``bytes`` or ``str``. ``None`` (default)
            keeps the original unkeyed SHA-256 behaviour.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    import hmac  # local import: only the keyed branch needs it

    payload = data.encode()
    if key is None:
        return hashlib.sha256(payload).hexdigest()
    if isinstance(key, str):
        key = key.encode()
    return hmac.new(key, payload, hashlib.sha256).hexdigest()

# Compute a pseudonym for every password and store it in a new column.
pseudonyms = df['password'].apply(pseudonymize_data)
df['pseudonymized_password'] = pseudonyms

# Preview the plaintext next to its pseudonym.
df[['password', 'pseudonymized_password']].head()


Unnamed: 0,password,pseudonymized_password
0,bybee,fd3f357c92663c5a5c8a5b02cf6e74c20ccf7be5699560...
1,n3m0,9d6ee841cb232ef73a84db7a1c7f5c1311c2e61396f3e1...
2,2509,fd98aa9d9cc36596d20ab1947351b2f2312de7edbd2d2e...
3,4622,e7110f8dba36e4f13f369bc30feb3ad528a30a04943602...
4,shrk,cdd3ca8db5a12893249c025564e8809a1696d40bef374c...


 Data Encryption (AES Encryption)

In [22]:
# data encryption
def encrypt_data(data, key=None):
    """Encrypt ``data`` with AES in EAX mode.

    Args:
        data: Text to encrypt; encoded with ``str.encode()`` (UTF-8).
        key: AES key as bytes. When ``None`` (default, the original
            behaviour) a random 16-byte key is generated for this call only
            and then discarded, so the result can never be decrypted. Pass a
            retained key whenever the ciphertext must be recoverable.

    Returns:
        bytes: ``cipher.nonce + tag + ciphertext`` concatenated in that order.
        NOTE(review): nonce and tag are presumably 16 bytes each with
        PyCryptodome's EAX defaults - confirm before slicing on decrypt.
    """
    if key is None:
        key = os.urandom(16)  # throwaway 16-byte key (kept for backward compatibility)
    cipher = AES.new(key, AES.MODE_EAX)
    ciphertext, tag = cipher.encrypt_and_digest(data.encode())
    return cipher.nonce + tag + ciphertext  # combined blob


# One key for the whole column, kept in a variable so the ciphertexts remain
# decryptable. AES.new() draws a fresh nonce per call, so reusing the key
# across rows is fine. Outside a demo, load the key from a secrets store
# rather than generating it inside the notebook.
ENCRYPTION_KEY = os.urandom(16)

# encrypt the password column
df['encrypted_password'] = df['password'].apply(lambda x: encrypt_data(x, ENCRYPTION_KEY))

# display
df[['password', 'encrypted_password']].head()


Unnamed: 0,password,encrypted_password
0,bybee,b'\x0cN\x1chs\xbb\xaf\xbc\xd50\xfd\x95d\x8d\x9...
1,n3m0,b'\x93\xd6p[*\x8e\xb9\x16\xb2\x8f\xc7F6\xf1&/\...
2,2509,b'\x84\x9e`\x03\xdf\xdb\xa3J\xfb \xad\xa2J\xbd...
3,4622,"b""k\xfc/K\xa5\x7f\xf0\xdd\xd6\xb0\xd1\x12=l5\x..."
4,shrk,"b'\xf6\xeb\xdc\x99E-c\xd16,\xeb\xe5\x04\xe1\x0..."


Hashing for Additional Security

In [23]:
# hashing (SHA-256)
def _sha256_hex(value):
    """Return the SHA-256 hex digest of ``value`` encoded with ``str.encode()``."""
    return hashlib.sha256(value.encode()).hexdigest()

df['hashed_password'] = df['password'].apply(_sha256_hex)

# Preview the plaintext next to its digest.
df[['password', 'hashed_password']].head()


Unnamed: 0,password,hashed_password
0,bybee,fd3f357c92663c5a5c8a5b02cf6e74c20ccf7be5699560...
1,n3m0,9d6ee841cb232ef73a84db7a1c7f5c1311c2e61396f3e1...
2,2509,fd98aa9d9cc36596d20ab1947351b2f2312de7edbd2d2e...
3,4622,e7110f8dba36e4f13f369bc30feb3ad528a30a04943602...
4,shrk,cdd3ca8db5a12893249c025564e8809a1696d40bef374c...


 Logging Data Access for HIPAA

In [24]:
# logging data access for HIPAA (keeping an access log for all actions)
#
# A dedicated logger with its own FileHandler is used instead of
# logging.basicConfig(): basicConfig() silently does nothing when the root
# logger already has handlers (common inside notebook kernels), in which case
# access_log.txt would never be written.
access_logger = logging.getLogger("access_log")
access_logger.setLevel(logging.INFO)
access_logger.propagate = False  # keep audit entries in the file only
if not access_logger.handlers:  # avoid stacking duplicate handlers when the cell is re-run
    _access_handler = logging.FileHandler('access_log.txt')
    _access_handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
    access_logger.addHandler(_access_handler)


def log_access(user, action):
    """Append one audit entry to access_log.txt.

    Args:
        user: Identifier of the acting user.
        action: Description of the action being recorded.

    The entry contains the user, the action and the current local timestamp
    taken from ``pd.Timestamp.now()``.
    """
    access_logger.info(f"User: {user}, Action: {action}, Timestamp: {pd.Timestamp.now()}")

# Ex: logging an action
log_access('alice', 'view data')


RBAC (Role-Based Access Control)

In [25]:
# checking access permissions with RBAC
# role -> actions that role may perform
roles = {
    'admin': ['view', 'edit', 'delete'],
    'user': ['view'],
}

# user -> assigned role
users = {
    'alice': 'admin',
    'bob': 'user',
}

# func to check access permissions and log the activity
def check_access(user, action):
    """Return True if ``user`` may perform ``action``; log every decision.

    Args:
        user: User identifier looked up in ``users``.
        action: Action name looked up in the permissions of the user's role.

    Returns:
        True when the user's role lists ``action``; False otherwise,
        including for users that have no entry in ``users``.

    Every outcome is passed to ``log_access``. Denials record the action
    that was attempted, and requests from unknown users are logged too, so
    the audit trail has no silent rejections.
    """
    role = users.get(user)
    if role is None:
        log_access(user, f"access denied (unknown user): {action}")
        return False

    if action in roles.get(role, []):
        log_access(user, action)  # granted: record the action itself
        return True

    log_access(user, f"access denied: {action}")
    return False

# Ex: check the 'delete' permission for one admin and one regular user
for user_id, display_name in (('alice', 'Alice'), ('bob', 'Bob')):
    if check_access(user_id, 'delete'):
        print(f"{display_name} has delete access")
    else:
        print(f"{display_name} does not have delete access")


Alice has delete access
Bob does not have delete access


 Final Output

In [26]:
# Columns produced by the privacy/security steps, shown next to the original password.
SECURED_COLUMNS = [
    'password',
    'anonymized_password',
    'pseudonymized_password',
    'encrypted_password',
    'hashed_password',
]
print("First few rows after applying security and privacy measures:")
print(df[SECURED_COLUMNS].head())


First few rows after applying security and privacy measures:
  password                                anonymized_password  \
0    bybee  fd3f357c92663c5a5c8a5b02cf6e74c20ccf7be5699560...   
1     n3m0  9d6ee841cb232ef73a84db7a1c7f5c1311c2e61396f3e1...   
2     2509  fd98aa9d9cc36596d20ab1947351b2f2312de7edbd2d2e...   
3     4622  e7110f8dba36e4f13f369bc30feb3ad528a30a04943602...   
4     shrk  cdd3ca8db5a12893249c025564e8809a1696d40bef374c...   

                              pseudonymized_password  \
0  fd3f357c92663c5a5c8a5b02cf6e74c20ccf7be5699560...   
1  9d6ee841cb232ef73a84db7a1c7f5c1311c2e61396f3e1...   
2  fd98aa9d9cc36596d20ab1947351b2f2312de7edbd2d2e...   
3  e7110f8dba36e4f13f369bc30feb3ad528a30a04943602...   
4  cdd3ca8db5a12893249c025564e8809a1696d40bef374c...   

                                  encrypted_password  \
0  b'\x0cN\x1chs\xbb\xaf\xbc\xd50\xfd\x95d\x8d\x9...   
1  b'\x93\xd6p[*\x8e\xb9\x16\xb2\x8f\xc7F6\xf1&/\...   
2  b'\x84\x9e`\x03\xdf\xdb\xa3J\xfb \xad\xa

Caesar Cipher Encryption

In [28]:
def caesar_encrypt(text, shift=3):
    """Apply a Caesar shift to the ASCII letters of ``text``.

    Args:
        text: String to encrypt.
        shift: Number of alphabet positions to rotate by (default 3).
            Negative values rotate backwards.

    Returns:
        A string of the same length in which ``a-z`` and ``A-Z`` are rotated
        within their own case and every other character is left unchanged.

    Only ASCII letters are shifted. ``str.isalpha()`` is also true for
    non-ASCII letters such as 'é', and running those through the modulo-26
    arithmetic would turn them into unrelated ASCII letters.
    """
    shifted = []  # collected pieces; joined once instead of repeated string +=
    for char in text:
        if 'a' <= char <= 'z':
            base = ord('a')
        elif 'A' <= char <= 'Z':
            base = ord('A')
        else:
            shifted.append(char)
            continue
        shifted.append(chr((ord(char) - base + shift) % 26 + base))
    return ''.join(shifted)

def _caesar_encrypt_value(value):
    """Coerce ``value`` to str and encrypt it with the default Caesar shift."""
    return caesar_encrypt(str(value))

df['caesar_encrypted_password'] = df['password'].apply(_caesar_encrypt_value)


# Phase 3 & Bonus

collect cipher texts

In [29]:
# Gather every non-null Caesar ciphertext as a string, then join them with
# single spaces into one text block for frequency analysis.
cipher_texts = [str(value) for value in df['caesar_encrypted_password'].dropna()]
all_cipher_text = ' '.join(cipher_texts)


count letter frequency

In [30]:
from collections import Counter

# Tally alphabetic characters case-insensitively; digits, spaces and symbols are skipped.
char_freq = Counter(filter(str.isalpha, all_cipher_text.lower()))

# Letters ordered from most to least frequent in the ciphertext.
sorted_letters = [letter for letter, _ in char_freq.most_common()]

print("most common encrypted letters:", sorted_letters[:10])


most common encrypted letters: ['d', 'h', 'l', 'r', 'q', 'o', 'u', 'v', 'w', 'p']


build freq map

In [31]:
# English letters ordered from most to least common.
common_letters = list("etaoinshrdlcumwfgypbvkjxqz")

# Pair the i-th most frequent ciphertext letter with the i-th most common
# English letter; zip stops at the shorter of the two sequences.
char_map = dict(zip(sorted_letters, common_letters))

print("example mapping:", list(char_map.items())[:5])


example mapping: [('d', 'e'), ('h', 't'), ('l', 'a'), ('r', 'o'), ('q', 'i')]


 freq attack func

In [32]:
# Substitution-based decryption using the guessed letter mapping.
def decrypt_by_freq(text, mapping):
    """Substitute characters of ``text`` according to ``mapping``.

    Args:
        text: Ciphertext string.
        mapping: Dict keyed by lowercase ciphertext character, giving the
            guessed plaintext character.

    Returns:
        A string in which each character whose lowercase form is a key of
        ``mapping`` is replaced by the mapped value (upper-cased when the
        original character was uppercase); all other characters are kept.
    """
    pieces = []
    for symbol in text:
        lookup = symbol.lower()
        if lookup not in mapping:
            pieces.append(symbol)
            continue
        plain = mapping[lookup]
        pieces.append(plain.upper() if symbol.isupper() else plain)
    return ''.join(pieces)


apply decryption column

In [33]:
# Run the frequency-based substitution over every Caesar ciphertext.
def _decrypt_value(value):
    """Coerce ``value`` to str and substitute it using the global char_map."""
    return decrypt_by_freq(str(value), char_map)

df['freq_decrypted_password'] = df['caesar_encrypted_password'].apply(_decrypt_value)


show comparison

In [34]:
# Show original, Caesar-encrypted and frequency-guessed passwords side by side.
comparison_columns = ['password', 'caesar_encrypted_password', 'freq_decrypted_password']
print(df[comparison_columns].head(5))


  password caesar_encrypted_password freq_decrypted_password
0    bybee                     ebehh                   fwftt
1     n3m0                      q3p0                    i3d0
2     2509                      2509                    2509
3     4622                      4622                    4622
4     shrk                      vkun                    husg


match score

In [35]:
# Position-wise similarity between two values.
def calc_match_score(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` agree.

    Both arguments are converted with ``str()``. Characters are compared
    position by position up to the shorter length, and the match count is
    divided by ``len(a)`` (or by 1 when ``a`` is empty), so extra trailing
    characters in ``b`` do not affect the score.
    """
    left, right = str(a), str(b)
    matches = 0
    for x, y in zip(left, right):
        if x == y:
            matches += 1
    return matches / (len(left) or 1)

def _row_similarity(row):
    """Score one row: original password vs. its frequency-attack guess."""
    return calc_match_score(row['password'], row['freq_decrypted_password'])

df['similarity'] = df.apply(_row_similarity, axis=1)

mean_similarity = df['similarity'].mean()
print("avg similarity score:", mean_similarity)


average similarity score: 0.4254664356470823
