In [None]:
# !pip install pyspark
import ast
import warnings
from collections import Counter
from datetime import datetime
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder
    MLXTEND_AVAILABLE = True
except ImportError:
    print("mlxtend not available. Install with: pip install mlxtend")
    MLXTEND_AVAILABLE = False
############################### FIX IMPORT #####################################

In [None]:
n = 10 # Notebook-wide setting; not referenced in the cells visible in this section -- TODO(review): document what it controls

In [None]:
# Load the raw CSV and preview the first rows.
dataFull = pd.read_csv('/file.csv')
print("Raw schema: \n",dataFull.head(5))

# Drop the helper columns when they are present.
# drop() always returns a new frame, so `data` never aliases `dataFull`
# (the old else-branch bound both names to the same object).
unwanted_cols = [col for col in ['file_tree', 'Unnamed: 0'] if col in dataFull.columns]
data = dataFull.drop(columns=unwanted_cols)
if unwanted_cols:
    print("Final schema: \n",data.head())
    print('\n-----------------------------------------------------------------------------\n')

# How many instances have null values, if any.
# The counts used to be computed and thrown away (a bare expression inside an
# `if` is never displayed by the notebook), so print them explicitly.
null_counts = data.isnull().sum()
if null_counts.any():
    print(null_counts)

# unique values in columns
print('\n-----------------------------------------------------------------------------\n')
print(data.nunique())

Raw schema: 
                          id                 date     user       pc  \
0  {L9G8-J9QE34VM-2834VDPB}  01/02/2010 07:23:14  MOH0273  PC-6699   
1  {H0W6-L4FG38XG-9897XTEN}  01/02/2010 07:26:19  MOH0273  PC-6699   
2  {M3Z0-O2KK89OX-5716MBIM}  01/02/2010 08:12:03  HPH0075  PC-2417   
3  {E1I4-S4QS61TG-3652YHKR}  01/02/2010 08:17:00  HPH0075  PC-2417   
4  {D4R7-E7JL45UX-0067XALT}  01/02/2010 08:24:57  HSB0196  PC-8001   

       filename                                            content  
0  EYPC9Y08.doc  D0-CF-11-E0-A1-B1-1A-E1 during difficulty over...  
1  N3LTSU3O.pdf  25-50-44-46-2D carpenters 25 landed strait dis...  
2  D3D3WC9W.doc  D0-CF-11-E0-A1-B1-1A-E1 union 24 declined impo...  
3  QCSW62YS.doc  D0-CF-11-E0-A1-B1-1A-E1 becoming period begin ...  
4  AU75JV6U.jpg                                              FF-D8  

-----------------------------------------------------------------------------

id          445581
date        432924
user           264
pc            

In [None]:

# Report the overall dimensions of the cleaned frame.
print(f"Original data shape: {data.shape}")

# FREQUENT PATTERN MINING
# The ',' format spec inserts a thousands separator into each count.
n_records = len(data)
print(f"Dataset size: {n_records:,} records")

n_users = data['user'].nunique()
n_pcs = data['pc'].nunique()
print(f"Unique items in transactions: Users({n_users:,}), PCs({n_pcs:,})")

Original data shape: (445581, 6)
Dataset size: 445,581 records
Unique items in transactions: Users(264), PCs(956)


In [None]:
# STOP#########################################################

In [None]:
# Per-user PC summary loaded from device_user_to_pcs.csv
# (presumably exported by a separate device-log analysis -- confirm).
user_to_pcs = pd.read_csv('device_user_to_pcs.csv')

# For every user in `data`, the sorted list of distinct PCs they appear on.
pc_summary = (
    data.groupby('user')['pc']
    .agg(lambda pcs: sorted(set(pcs)))
    .reset_index()
    .rename(columns={'pc': 'pc_file'})
)

user_to_pcs = user_to_pcs.merge(
    pc_summary,
    on='user',
    how='left'
)

# The left merge leaves NaN in `pc_file` for users that never appear in
# `data`; normalise those to empty lists so iteration and len() cannot fail.
user_to_pcs['pc_file'] = user_to_pcs['pc_file'].apply(
    lambda pcs: pcs if isinstance(pcs, list) else []
)


def _as_pc_set(raw):
    """Return the PCs held in one `pcs_accessed` cell as a set.

    A list column written to CSV is read back as the text repr of the list,
    so string cells are parsed with ast.literal_eval. Cells that are neither
    a string nor a list-like value (e.g. NaN) yield an empty set.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    return set(raw) if isinstance(raw, (list, tuple, set)) else set()


# Parse into a temporary Series so the stored `pcs_accessed` column keeps its
# original values for any later cell that reads it.
accessed_sets = user_to_pcs['pcs_accessed'].map(_as_pc_set)

# True when at least one PC from `pc_file` is also in `pcs_accessed`.
# This is real element membership; the previous `pc in <string>` check was a
# substring test against the list's text repr.
user_to_pcs['pc_file_in_accessed'] = [
    not accessed.isdisjoint(pcs)
    for accessed, pcs in zip(accessed_sets, user_to_pcs['pc_file'])
]

user_to_pcs['pc_file_count'] = user_to_pcs['pc_file'].apply(len)

In [None]:
# Preview the first 20 rows of the merged per-user summary.
user_to_pcs.iloc[:20]

Unnamed: 0,user,pcs_accessed,total_accesses,unique_pc_count,has_after_hours,OCEAN,OCEAN_level,pc_file,pc_file_in_accessed,pc_file_count
0,AJF0370,"['PC-0004', 'PC-0008', 'PC-0039', 'PC-0044', '...",8502,642,True,132,High,"[PC-0004, PC-0039, PC-0044, PC-0072, PC-0092, ...",True,417
1,HDS0367,"['PC-0004', 'PC-0008', 'PC-0044', 'PC-0072', '...",2722,613,True,157,High,"[PC-0004, PC-0044, PC-0072, PC-0092, PC-0094, ...",True,386
2,BAL0044,"['PC-0039', 'PC-0044', 'PC-0072', 'PC-0094', '...",2258,607,True,155,High,"[PC-0072, PC-0118, PC-0148, PC-0151, PC-0166, ...",True,379
3,EIS0041,"['PC-0004', 'PC-0092', 'PC-0118', 'PC-0120', '...",2644,607,True,180,High,"[PC-0004, PC-0092, PC-0118, PC-0120, PC-0146, ...",True,381
4,IBB0359,"['PC-0004', 'PC-0008', 'PC-0072', 'PC-0094', '...",7852,580,True,183,High,"[PC-0004, PC-0008, PC-0094, PC-0115, PC-0120, ...",True,363
5,BSS0369,"['PC-0092', 'PC-0094', 'PC-0115', 'PC-0118', '...",5472,489,True,161,High,"[PC-0092, PC-0115, PC-0118, PC-0120, PC-0132, ...",True,282
6,MPM0220,"['PC-0004', 'PC-0039', 'PC-0072', 'PC-0094', '...",4978,440,True,139,High,"[PC-0004, PC-0039, PC-0120, PC-0377, PC-0384, ...",True,244
7,CCA0046,"['PC-0004', 'PC-0008', 'PC-0039', 'PC-0094', '...",1511,430,True,147,High,"[PC-0004, PC-0133, PC-0174, PC-0215, PC-0351, ...",True,252
8,MOS0047,"['PC-0004', 'PC-0044', 'PC-0094', 'PC-0133', '...",1774,391,True,169,High,"[PC-0044, PC-0183, PC-0205, PC-0235, PC-0252, ...",True,214
9,GTD0219,"['PC-0004', 'PC-0132', 'PC-0164', 'PC-0215', '...",957,291,True,162,High,"[PC-0164, PC-0215, PC-0216, PC-0290, PC-0320, ...",True,162


In [None]:
print("=== USER-PC ACCESS ANALYSIS ===\n")
df = data.copy()  # work on a copy of the loaded frame

# ---------------------------------------------------------------------------
# Section 1: for each PC, the distinct users seen on it and the row count.
# ---------------------------------------------------------------------------
print("1. ALL USERS WHO HAVE ACCESS TO EACH PC")
print("="*50)

pc_to_users = (
    df.groupby('pc')
    .agg(
        users_with_access=('user', lambda users: sorted(set(users))),  # unique, sorted
        total_accesses=('id', 'count'),  # rows recorded for this PC
    )
    .reset_index()
)
pc_to_users['unique_user_count'] = pc_to_users['users_with_access'].apply(len)

# Keep only PCs seen with more than one user, most-shared first.
is_shared_pc = pc_to_users['unique_user_count'] > 1
pc_to_users = pc_to_users[is_shared_pc].sort_values('unique_user_count', ascending=False)

users_per_pc = pc_to_users['unique_user_count']
print(f"PCs with multiple users: {len(pc_to_users)}")
print(f"Most shared PC has {users_per_pc.max()} different users")
print(f"Least shared PC has {users_per_pc.min()} different users")

# ---------------------------------------------------------------------------
# Section 2: for each user, the distinct PCs they appear on and the row count.
# NOTE(review): this rebinds `user_to_pcs`, a name an earlier cell also assigns
# from device_user_to_pcs.csv -- confirm later cells expect this version.
# ---------------------------------------------------------------------------
print("2. ALL PCs THAT EACH USER TOUCHES")
print("="*50)

user_to_pcs = (
    df.groupby('user')
    .agg(
        pcs_accessed=('pc', lambda pcs: sorted(set(pcs))),  # unique, sorted
        total_accesses=('id', 'count'),  # rows recorded for this user
    )
    .reset_index()
)
user_to_pcs['unique_pc_count'] = user_to_pcs['pcs_accessed'].apply(len)

# Keep only users seen on more than one PC, broadest access first.
is_multi_pc_user = user_to_pcs['unique_pc_count'] > 1
user_to_pcs = user_to_pcs[is_multi_pc_user].sort_values('unique_pc_count', ascending=False)

pcs_per_user = user_to_pcs['unique_pc_count']
print(f"Users with access to multiple PCs: {len(user_to_pcs)}")
print(f"User with broadest access touches {pcs_per_user.max()} different PCs")
print(f"User with narrowest access touches {pcs_per_user.min()} different PCs")


=== USER-PC ACCESS ANALYSIS ===

1. ALL USERS WHO HAVE ACCESS TO EACH PC
PCs with multiple users: 856
Most shared PC has 9 different users
Least shared PC has 2 different users
2. ALL PCs THAT EACH USER TOUCHES
Users with access to multiple PCs: 10
User with broadest access touches 417 different PCs
User with narrowest access touches 162 different PCs
