In [None]:
# !pip install pyspark
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from datetime import datetime
import seaborn as sns
from collections import Counter
from itertools import combinations

try:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder
    MLXTEND_AVAILABLE = True
except ImportError:
    print("mlxtend not available. Install with: pip install mlxtend")
    MLXTEND_AVAILABLE = False
############################### FIX IMPORT #####################################

In [None]:
# Number of rows shown by the head(n) previews in the analysis cells below.
n = 10 #Global variable

In [None]:
# Load the raw device-activity log and preview the first rows.
dataFull = pd.read_csv('device.csv')
print("Raw schema: \n",dataFull.head())

# Drop bookkeeping columns that are not needed for the analysis, if present.
columns_to_drop = [col for col in ['file_tree', 'Unnamed: 0'] if col in dataFull.columns]
if columns_to_drop:
    data = dataFull.drop(columns=columns_to_drop)
    print("Final schema: \n",data.head())
else:
    data = dataFull
print('\n-----------------------------------------------------------------------------\n')

# Report per-column null counts when any value is missing.
# The counts have to be printed explicitly: a bare expression inside an `if`
# block is not the cell's last expression, so the notebook never displays it.
null_counts = data.isnull().sum()
if null_counts.any():
    print(null_counts)
    print('\n-----------------------------------------------------------------------------\n')

# Number of distinct values in each column
print(data.nunique())
print('\n-----------------------------------------------------------------------------\n')

Raw schema: 
                          id                 date     user       pc    activity
0  {J1S3-L9UU75BQ-7790ATPL}  01/02/2010 07:21:06  MOH0273  PC-6699     Connect
1  {N7B5-Y7BB27SI-2946PUJK}  01/02/2010 07:37:41  MOH0273  PC-6699  Disconnect
2  {U1V9-Z7XT67KV-5649MYHI}  01/02/2010 07:59:11  HPH0075  PC-2417     Connect
3  {H0Z7-E6GB57XZ-1603MOXD}  01/02/2010 07:59:49  IIW0249  PC-0843     Connect
4  {L7P2-G4PX02RX-7999GYOY}  01/02/2010 08:04:26  IIW0249  PC-0843  Disconnect

-----------------------------------------------------------------------------

id          405380
date        399631
user           265
pc             971
activity         2
dtype: int64

-----------------------------------------------------------------------------



In [None]:
# Keep DeprecationWarning noise out of the cell outputs
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Working frame restricted to the five columns used by the analysis
columns = ['id', 'date', 'user', 'pc', 'activity']
df = pd.DataFrame(data, columns=columns)

print(f"Original data shape: {df.shape}")

# FREQUENT PATTERN MINING
record_count = len(df)
distinct_users = df['user'].nunique()
distinct_pcs = df['pc'].nunique()
# The ":," format spec inserts a thousands separator
print(f"Dataset size: {record_count:,} records")
print(f"Unique items in transactions: Users({distinct_users:,}), PCs({distinct_pcs:,})")


Original data shape: (405380, 5)
Dataset size: 405,380 records
Unique items in transactions: Users(265), PCs(971)


In [None]:
# === LOGIN/LOGOUT ANALYSIS USING BUILT-IN FUNCTIONS ===
# Connect events per user, most active first (value_counts sorts descending).
login_data = df[df['activity'] == 'Connect']
user_logins = (
    login_data['user']
    .value_counts()
    .rename_axis('user')
    .reset_index(name='login_count')
)

# The heading follows `n`, so it stays truthful if the preview size is changed.
print(f"\nTop {n} users by login count:")
print(user_logins.head(n))

# Connect vs. Disconnect counts per user. reindex guarantees both columns exist
# (filled with 0) even if one activity type never occurs in the data.
activity_counts = (
    pd.crosstab(df['user'], df['activity'], margins=False)
    .reindex(columns=['Connect', 'Disconnect'], fill_value=0)
    .set_axis(['login_count', 'logoff_count'], axis=1)
)
# Positive difference = more connects than disconnects recorded for the user
activity_counts['in_out_difference'] = activity_counts['login_count'] - activity_counts['logoff_count']
activity_counts = activity_counts.sort_values('in_out_difference', ascending=False).reset_index()

print("\nUsers with highest login/logout differences:")
print(activity_counts.head(n))


Top 10 users by login count:
      user  login_count
0  AJF0370         4261
1  IBB0359         3935
2  LBH0942         3903
3  HSB0196         3805
4  DLM0051         3775
5  OBH0499         3706
6  IIW0249         3375
7  KKW0879         3257
8  HPH0075         3180
9  MOH0273         3155

Users with highest login/logout differences:
      user  login_count  logoff_count  in_out_difference
0  JDB0169         2743          2721                 22
1  LBH0942         3903          3882                 21
2  AJF0370         4261          4241                 20
3  DBB0384         2976          2956                 20
4  IBB0359         3935          3917                 18
5  THR0873         2742          2724                 18
6  OBH0499         3706          3689                 17
7  BJM0111         2919          2902                 17
8  DLM0051         3775          3758                 17
9  RNP0211         1073          1056                 17


In [None]:
# === WORK HOURS ANALYSIS ===
# Parse the raw timestamp strings, then derive the hour of day from them
df['datetime'] = pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M:%S')
df['hour'] = df['datetime'].dt.hour

# An event is "outside work hours" when it happens at/after 19:00 or before 07:00
df['outside_work_hours'] = (df['hour'] >= 19) | (df['hour'] < 7)

# Attach the per-user login/logoff counts to every event row
df_extended = df.merge(activity_counts, on='user', how='left')

# Per-user totals.
# NOTE(review): 'total_after_hours_records' is a plain 'count' of the flag
# column, i.e. a row count, so it matches 'total_records' in the recorded
# output. The name is kept unchanged here.
after_hours_summary = (
    df_extended.groupby('user')
    .agg(
        after_hours_count=('outside_work_hours', 'sum'),
        total_after_hours_records=('outside_work_hours', 'count'),
        total_records=('activity', 'count'),
    )
    .reset_index()
)
after_hours_share = after_hours_summary['after_hours_count'] / after_hours_summary['total_records']
after_hours_summary['after_hours_percentage'] = after_hours_share * 100
after_hours_summary = after_hours_summary.sort_values('after_hours_count', ascending=False)

print("\nAfter-hours access summary (top 10 users):")
print(after_hours_summary.head(10))

# Connect events that fall outside work hours
is_connect = df_extended['activity'] == 'Connect'
is_after_hours = df_extended['outside_work_hours'] == True
connect_after_hours = df_extended[is_connect & is_after_hours]
print(f"\nConnect events outside work hours: {len(connect_after_hours)}")
if not connect_after_hours.empty:
    print("Sample after-hours connections:")
    print(connect_after_hours[['user', 'pc', 'date', 'hour']].head(10))


After-hours access summary (top 10 users):
        user  after_hours_count  total_after_hours_records  total_records  \
8    AJF0370               1594                       8502           8502   
106  HDS0367               1530                       2722           2722   
79   EIS0041               1524                       2644           2644   
118  IBB0359               1501                       7852           7852   
20   BAL0044               1486                       2258           2258   
113  HRL0540               1233                       5924           5924   
36   BRM0995               1163                       5932           5932   
186  MPM0220                987                       4978           4978   
38   BSS0369                945                       5472           5472   
48   CCA0046                871                       1511           1511   

     after_hours_percentage  
8                 18.748530  
106               56.208670  
79                

In [None]:
# Event counts for every (user, pc) pair. Both counts come from 'count', so
# they differ only if one of the two columns contains nulls.
pc_access_analysis = (
    df.groupby(['user', 'pc'])
    .agg(access_count=('id', 'count'), activity_count=('activity', 'count'))
    .reset_index()
)

# Distinct users seen on each PC, and distinct PCs touched by each user
pc_user_stats = df.groupby('pc')['user'].nunique().reset_index(name='unique_user_per_pc_count')
user_pc_stats = df.groupby('user')['pc'].nunique().reset_index(name='unique_pc_per_user')

# Threshold filter on PCs per user; "> 0" keeps every user (a "<500" cap was
# noted as an alternative by the original author)
keeps_user = user_pc_stats['unique_pc_per_user'] > 0 #<500
user_pc_stats_filtered = user_pc_stats.loc[keeps_user]

# Combine pair-level counts with the per-user and per-PC statistics
pc_access_final = (
    pc_access_analysis
    .merge(user_pc_stats_filtered, on='user', how='inner')
    .merge(pc_user_stats, on='pc', how='left')
)

# Final column order, sorted by how widely each PC is shared
output_columns = ['pc', 'user', 'access_count', 'unique_user_per_pc_count', 'unique_pc_per_user']
pc_access_final = pc_access_final[output_columns].sort_values(
    ['unique_user_per_pc_count', 'pc', 'user'],
    ascending=[False, True, True],
)

# The preview is re-sorted so that users touching the fewest PCs come first
print("\nPC access analysis (top 20):")
print(pc_access_final.sort_values(by='unique_pc_per_user', ascending=True).head(20))



PC access analysis (top 20):
           pc     user  access_count  unique_user_per_pc_count  \
4384  PC-5775  IUB0565           986                         7   
3164  PC-8314  HCL0651          1320                         8   
1777  PC-0008  BVC0790          1749                         5   
5315  PC-2930  RSM0277           780                         3   
2228  PC-3737  CWR0502           694                         8   
3784  PC-8052  HMM0108           198                         3   
2244  PC-8669  EHD0584            14                         5   
5326  PC-5923  SMY0792           758                         6   
3166  PC-3054  HCS0003          2006                         9   
1275  PC-2661  BIH0745            22                         5   
4374  PC-6382  IBS0836           774                         7   
5313  PC-3735  RRP0568           757                         8   
657   PC-1313  ATE0869           419                         7   
4415  PC-2638  KPC0073            16          

In [None]:
# Targeted lookups: rows of pc_access_final for chosen PCs and chosen users
pc_list = ["PC-0843"]
user_list = ["IIW0249"]

specific_pcs = pc_access_final.loc[pc_access_final['pc'].isin(pc_list)]
specific_users = pc_access_final.loc[pc_access_final['user'].isin(user_list)]

# Print each non-empty selection under its own heading
for heading, selection in (
    ("\nSpecific PCs analysis:", specific_pcs),
    ("\nSpecific users analysis:", specific_users),
):
    if not selection.empty:
        print(heading)
        print(selection)


Specific PCs analysis:
           pc     user  access_count  unique_user_per_pc_count  \
73    PC-0843  AJF0370             4                        10   
713   PC-0843  BAL0044             4                        10   
1329  PC-0843  BSS0369             4                        10   
1825  PC-0843  CCA0046             8                        10   
2299  PC-0843  EIS0041             4                        10   
2895  PC-0843  GTD0219             2                        10   
3242  PC-0843  HDS0367             8                        10   
3854  PC-0843  IBB0359             4                        10   
4376  PC-0843  IIW0249          6735                        10   
4872  PC-0843  MPM0220             2                        10   

      unique_pc_per_user  
73                   642  
713                  607  
1329                 489  
1825                 430  
2299                 607  
2895                 291  
3242                 613  
3854                 580  
4376  

In [None]:
# One row per user: PC breadth, event volume, activity types, after-hours
# behaviour and first/last timestamps.
user_agg_spec = {
    'pc': ['nunique', 'count'],               # distinct PCs, total events
    'activity': lambda x: list(x.unique()),   # activity types seen
    'outside_work_hours': ['sum', 'any'],     # after-hours count and flag
    'datetime': ['min', 'max'],               # first and last event
}
user_summary = df.groupby('user').agg(user_agg_spec).reset_index()

# Replace the two-level column index produced by agg with flat names
user_summary.columns = [
    'user',
    'unique_pc_count', 'total_access_count',
    'activity_types',
    'after_hours_count', 'has_after_hours',
    'first_access', 'last_access',
]

# Sorted list of the distinct PCs each user touched
pc_lists = (
    df.groupby('user')['pc']
    .apply(lambda x: sorted(x.unique()))
    .reset_index(name='pc_list')
)
user_summary = user_summary.merge(pc_lists, on='user')

# Users with the broadest PC footprint first
user_summary = user_summary.sort_values('unique_pc_count', ascending=False)

print("\nComprehensive User Analysis (top 15):")
display_cols = ['user', 'unique_pc_count', 'total_access_count', 'after_hours_count', 'has_after_hours']
print(user_summary[display_cols].head(15))

# Users whose distinct-PC count equals the number of PCs in the dataset
max_pc_count = df['pc'].nunique()
covers_every_pc = user_summary['unique_pc_count'] == max_pc_count
users_with_full_access = user_summary[covers_every_pc]

print("\nDataset Summary:")
print(f"Total unique PCs: {max_pc_count}")
print(f"Users with access to all PCs: {len(users_with_full_access)}")

if not users_with_full_access.empty:
    print("Users with full PC access:")
    print(users_with_full_access[['user', 'unique_pc_count', 'total_access_count']])


Comprehensive User Analysis (top 15):
        user  unique_pc_count  total_access_count  after_hours_count  \
8    AJF0370              642                8502               1594   
106  HDS0367              613                2722               1530   
20   BAL0044              607                2258               1486   
79   EIS0041              607                2644               1524   
118  IBB0359              580                7852               1501   
38   BSS0369              489                5472                945   
186  MPM0220              440                4978                987   
48   CCA0046              430                1511                871   
185  MOS0047              391                1774                781   
97   GTD0219              291                 957                521   
139  JGT0221                2                   4                  0   
143  JLM0364                2                   4                  2   
149  JTM0223             

In [None]:
# Headline figures for the whole dataset
record_total = len(df)
activity_column = df['activity']

print("\n=== FINAL SUMMARY ===")
for label, value in (
    ("Total records", record_total),
    ("Unique users", df['user'].nunique()),
    ("Unique PCs", df['pc'].nunique()),
):
    print(f"{label}: {value:,}")

print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")

for label, value in (
    ("Connect events", (activity_column == 'Connect').sum()),
    ("Disconnect events", (activity_column == 'Disconnect').sum()),
    ("After-hours events", df['outside_work_hours'].sum()),
    ("Users with after-hours access", user_summary['has_after_hours'].sum()),
):
    print(f"{label}: {value:,}")

# Activity distribution: count and share of all records per activity type
activity_dist = activity_column.value_counts()
print("\nActivity Distribution:")
for activity, count in activity_dist.items():
    share = count / record_total * 100
    print(f"  {activity}: {count:,} ({share:.1f}%)")

# Time-based analysis: events per hour of day and the busiest hour
hourly_activity = df.groupby(df['datetime'].dt.hour).size()
peak_hour = hourly_activity.idxmax()
peak_events = hourly_activity[peak_hour]
print(f"\nPeak activity hour: {peak_hour}:00 with {peak_events:,} events")


=== FINAL SUMMARY ===
Total records: 405,380
Unique users: 265
Unique PCs: 971
Date range: 2010-01-02 07:21:06 to 2011-05-16 23:22:34
Connect events: 203,339
Disconnect events: 202,041
After-hours events: 17,959
Users with after-hours access: 114

Activity Distribution:
  Connect: 203,339 (50.2%)
  Disconnect: 202,041 (49.8%)

Peak activity hour: 14:00 with 46,288 events


In [None]:
print("=== USER-PC ACCESS ANALYSIS ===\n")

# ALL USERS WHO HAVE ACCESS TO EACH PC
print("1. ALL USERS WHO HAVE ACCESS TO EACH PC")
print("="*80)

# One row per PC: sorted distinct users plus the number of events on that PC
pc_to_users = (
    df.groupby('pc')
    .agg({
        'user': lambda members: sorted(set(members)),
        'id': 'count',
    })
    .reset_index()
)
pc_to_users.columns = ['pc', 'users_with_access', 'total_accesses']
pc_to_users['unique_user_count'] = pc_to_users['users_with_access'].map(len)

# Most shared PCs first
pc_to_users = pc_to_users.sort_values('unique_user_count', ascending=False)

shared_counts = pc_to_users['unique_user_count']
print(f"Total PCs in system: {len(pc_to_users)}")
print(f"Most shared PC has {shared_counts.max()} different users")
print(f"Least shared PC has {shared_counts.min()} different users")
print("\nTop 10 most shared PCs:")

# Show at most ten users per PC and summarise the remainder
for entry in pc_to_users.head(10).itertuples(index=False):
    shown_users = ', '.join(entry.users_with_access[:10])
    hidden_users = len(entry.users_with_access) - 10
    print(f"\nPC: {entry.pc}")
    print(f"  Users with access ({entry.unique_user_count}): {shown_users}")
    if hidden_users > 0:
        print(f"   ... and {hidden_users} more users")
    print(f" Total accesses: {entry.total_accesses}")

print("\n" + "="*80 + "\n")

=== USER-PC ACCESS ANALYSIS ===

1. ALL USERS WHO HAVE ACCESS TO EACH PC
Total PCs in system: 971
Most shared PC has 12 different users
Least shared PC has 1 different users

Top 10 most shared PCs:

PC: PC-5866
  Users with access (12): AJF0370, BAL0044, BBS0039, CCA0046, CSC0217, GTD0219, HDS0367, IBB0359, JGT0221, JTM0223
   ... and 2 more users
 Total accesses: 30

PC: PC-3476
  Users with access (10): AJF0370, BAL0044, BSS0369, CCA0046, EIS0041, GTD0219, HDS0367, IBB0359, MPM0220, MVB0515
 Total accesses: 761

PC: PC-8734
  Users with access (10): AJF0370, BAL0044, BSS0369, EIS0041, GTD0219, HDS0367, IBB0359, IJM0444, MOS0047, MPM0220
 Total accesses: 765

PC: PC-8245
  Users with access (10): AIB0948, AJF0370, BAL0044, BSS0369, CCA0046, EIS0041, GTD0219, HDS0367, MOS0047, MPM0220
 Total accesses: 5989

PC: PC-7384
  Users with access (10): AJF0370, BAL0044, BSS0369, CCA0046, GTD0219, HBO0413, HDS0367, IBB0359, MOS0047, MPM0220
 Total accesses: 1432

PC: PC-3962
  Users with acces

In [None]:
# ALL PCs THAT EACH USER TOUCHES
print("2. ALL PCs THAT EACH USER TOUCHES")
print("="*80)

# One row per user: sorted distinct PCs plus the number of events by that user
user_to_pcs = (
    df.groupby('user')
    .agg({
        'pc': lambda machines: sorted(set(machines)),
        'id': 'count',
    })
    .reset_index()
)
user_to_pcs.columns = ['user', 'pcs_accessed', 'total_accesses']
user_to_pcs['unique_pc_count'] = user_to_pcs['pcs_accessed'].map(len)

# Users with the broadest access first
user_to_pcs = user_to_pcs.sort_values('unique_pc_count', ascending=False)

breadth = user_to_pcs['unique_pc_count']
print(f"Total users in system: {len(user_to_pcs)}")
print(f"User with broadest access touches {breadth.max()} different PCs")
print(f"User with narrowest access touches {breadth.min()} different PCs")
print("\nTop 10 users with broadest PC access:")

# Show at most ten PCs per user and summarise the remainder
for entry in user_to_pcs.head(10).itertuples(index=False):
    shown_pcs = ', '.join(entry.pcs_accessed[:10])
    hidden_pcs = len(entry.pcs_accessed) - 10
    print(f"\nUser: {entry.user}")
    print(f"PCs accessed ({entry.unique_pc_count}): {shown_pcs}")
    if hidden_pcs > 0:
        print(f"   ... and {hidden_pcs} more PCs")
    print(f"Total accesses: {entry.total_accesses}")

print("\n" + "="*80 + "\n")

2. ALL PCs THAT EACH USER TOUCHES
Total users in system: 265
User with broadest access touches 642 different PCs
User with narrowest access touches 1 different PCs

Top 10 users with broadest PC access:

User: AJF0370
PCs accessed (642): PC-0004, PC-0008, PC-0039, PC-0044, PC-0072, PC-0092, PC-0115, PC-0120, PC-0133, PC-0141
   ... and 632 more PCs
Total accesses: 8502

User: HDS0367
PCs accessed (613): PC-0004, PC-0008, PC-0044, PC-0072, PC-0092, PC-0094, PC-0115, PC-0118, PC-0120, PC-0132
   ... and 603 more PCs
Total accesses: 2722

User: BAL0044
PCs accessed (607): PC-0039, PC-0044, PC-0072, PC-0094, PC-0118, PC-0120, PC-0146, PC-0148, PC-0151, PC-0166
   ... and 597 more PCs
Total accesses: 2258

User: EIS0041
PCs accessed (607): PC-0004, PC-0092, PC-0118, PC-0120, PC-0141, PC-0146, PC-0148, PC-0151, PC-0164, PC-0166
   ... and 597 more PCs
Total accesses: 2644

User: IBB0359
PCs accessed (580): PC-0004, PC-0008, PC-0072, PC-0094, PC-0115, PC-0120, PC-0133, PC-0141, PC-0146, PC-01

In [None]:
# SEARCH FUNCTIONS
def find_users_for_pc(pc_name):
    """Look up a PC in the global ``pc_to_users`` frame and report its users.

    Prints a numbered list of the users recorded for the PC, followed by the
    PC's total access count.

    Args:
        pc_name: PC identifier, compared for equality against the ``pc`` column.

    Returns:
        The ``users_with_access`` value of the first matching row, or ``None``
        (after printing a notice) when no row matches.
    """
    matches = pc_to_users.loc[pc_to_users['pc'] == pc_name]
    if matches.empty:
        print(f"PC '{pc_name}' not found in the system")
        return None

    # Only the first matching row is reported
    users = matches['users_with_access'].iloc[0]
    total = matches['total_accesses'].iloc[0]

    print(f"PC: {pc_name}")
    print(f"Users with access ({len(users)}):")
    for position, user in enumerate(users, start=1):
        print(f"   {position:2d}. {user}")
    print(f"Total accesses: {total}")
    return users

def find_pcs_for_user(username):
    """Look up a user in the global ``user_to_pcs`` frame and report their PCs.

    Prints a numbered list of the PCs recorded for the user, followed by the
    user's total access count.

    Args:
        username: User identifier, compared for equality against the ``user``
            column.

    Returns:
        The ``pcs_accessed`` value of the first matching row, or ``None``
        (after printing a notice) when no row matches.
    """
    matches = user_to_pcs.loc[user_to_pcs['user'] == username]
    if matches.empty:
        print(f"User '{username}' not found in the system")
        return None

    # Only the first matching row is reported
    pcs = matches['pcs_accessed'].iloc[0]
    total = matches['total_accesses'].iloc[0]

    print(f"User: {username}")
    print(f"PCs accessed ({len(pcs)}):")
    for position, pc in enumerate(pcs, start=1):
        print(f"   {position:2d}. {pc}")
    print(f"Total accesses: {total}")
    return pcs

In [None]:
# EXAMPLE SEARCHES
print("3. EXAMPLE SEARCHES")
print("="*50)

# Identifiers to look up.
# NOTE(review): the recorded run reports all four as "not found" - confirm
# whether these are meant to be absent or should be replaced with IDs that
# exist in device.csv.
example_pcs = ["PC-3471", "PC-7117"]
example_users = ["CJM0273", "RAA1455"]

# Each entry: noun for the heading, IDs to search, lookup helper, closing rule
search_plan = (
    ("PCs", example_pcs, find_users_for_pc, "\n" + "-"*80),
    ("users", example_users, find_pcs_for_user, "\n" + "="*80 + "\n"),
)
for noun, targets, lookup, closing_rule in search_plan:
    print(f"Searching for specific {noun}:")
    for target in targets:
        print(f"\n--- Results for {target} ---")
        lookup(target)
    print(closing_rule)

3. EXAMPLE SEARCHES
Searching for specific PCs:

--- Results for PC-3471 ---
PC 'PC-3471' not found in the system

--- Results for PC-7117 ---
PC 'PC-7117' not found in the system

--------------------------------------------------------------------------------
Searching for specific users:

--- Results for CJM0273 ---
User 'CJM0273' not found in the system

--- Results for RAA1455 ---
User 'RAA1455' not found in the system




In [None]:
# SECURITY ANALYSIS
print("4. SECURITY ANALYSIS")
print("="*50)

# PCs used by many different users (threshold: 10 or more)
risky_pcs = pc_to_users.loc[pc_to_users['unique_user_count'] >= 10]
print(f"Potentially risky PCs (≥10 users): {len(risky_pcs)}")
if not risky_pcs.empty:
    print("Top 5 most shared PCs:")
    for entry in risky_pcs.head(5).itertuples(index=False):
        print(f"   {entry.pc}: {entry.unique_user_count} users, {entry.total_accesses} total accesses")

print()

# Users touching many different PCs (threshold: 10 or more)
privileged_users = user_to_pcs.loc[user_to_pcs['unique_pc_count'] >= 10]
print(f"Potentially privileged users (≥10 PCs): {len(privileged_users)}")
if not privileged_users.empty:
    print("Top 5 users with broadest access:")
    for entry in privileged_users.head(5).itertuples(index=False):
        print(f"   {entry.user}: {entry.unique_pc_count} PCs, {entry.total_accesses} total accesses")

print()

# Users seen on exactly one PC (potentially dedicated/restricted users)
restricted_users = user_to_pcs.loc[user_to_pcs['unique_pc_count'] == 1]
print(f"Restricted users (only 1 PC): {len(restricted_users)}")
if not restricted_users.empty:
    for entry in restricted_users.head(5).itertuples(index=False):
        print(f"   {entry.user}: {entry.unique_pc_count} PC, {entry.total_accesses} total accesses")

# PCs used by exactly one user (potentially personal/dedicated PCs)
personal_pcs = pc_to_users.loc[pc_to_users['unique_user_count'] == 1]
print(f"Personal/dedicated PCs (only 1 user): {len(personal_pcs)}")
if not personal_pcs.empty:
    for entry in personal_pcs.head(5).itertuples(index=False):
        print(f"   {entry.pc}: {entry.unique_user_count} user, {entry.total_accesses} total accesses")

print("\n" + "="*80 + "\n")

4. SECURITY ANALYSIS
Potentially risky PCs (≥10 users): 11
Top 5 most shared PCs:
   PC-5866: 12 users, 30 total accesses
   PC-3476: 10 users, 761 total accesses
   PC-8734: 10 users, 765 total accesses
   PC-8245: 10 users, 5989 total accesses
   PC-7384: 10 users, 1432 total accesses

Potentially privileged users (≥10 PCs): 10
Top 5 users with broadest access:
   AJF0370: 642 PCs, 8502 total accesses
   HDS0367: 613 PCs, 2722 total accesses
   BAL0044: 607 PCs, 2258 total accesses
   EIS0041: 607 PCs, 2644 total accesses
   IBB0359: 580 PCs, 7852 total accesses

Restricted users (only 1 PC): 249
   NGF0157: 1 PC, 284 total accesses
   MDH0580: 1 PC, 1257 total accesses
   MCF0600: 1 PC, 72 total accesses
   NKP0236: 1 PC, 246 total accesses
   MCD0125: 1 PC, 132 total accesses
Personal/dedicated PCs (only 1 user): 31
   PC-2344: 1 user, 3716 total accesses
   PC-6377: 1 user, 2 total accesses
   PC-2725: 1 user, 2 total accesses
   PC-2524: 1 user, 2 total accesses
   PC-5820: 1 use

In [None]:
# SUMMARY STATISTICS
print("5. SUMMARY STATISTICS")
print("="*50)

# describe() supplies mean, median ('50%'), min and max in one pass
print("PC Access Distribution:")
pc_stats = pc_to_users['unique_user_count'].describe()
pc_mean, pc_median = pc_stats['mean'], pc_stats['50%']
pc_max, pc_min = pc_stats['max'], pc_stats['min']
print(f"   Mean users per PC: {pc_mean:.1f}")
print(f"   Median users per PC: {pc_median:.1f}")
print(f"   Most shared PC: {pc_max:.0f} users")
print(f"   Least shared PC: {pc_min:.0f} user(s)")

print("\nUser Access Distribution:")
user_stats = user_to_pcs['unique_pc_count'].describe()
user_mean, user_median = user_stats['mean'], user_stats['50%']
user_max, user_min = user_stats['max'], user_stats['min']
print(f"   Mean PCs per user: {user_mean:.1f}")
print(f"   Median PCs per user: {user_median:.1f}")
print(f"   User with broadest access: {user_max:.0f} PCs")
print(f"   User with narrowest access: {user_min:.0f} PC(s)")

5. SUMMARY STATISTICS
PC Access Distribution:
   Mean users per PC: 5.5
   Median users per PC: 6.0
   Most shared PC: 12 users
   Least shared PC: 1 user(s)

User Access Distribution:
   Mean PCs per user: 20.2
   Median PCs per user: 1.0
   User with broadest access: 642 PCs
   User with narrowest access: 1 PC(s)


In [None]:
# NOTE: this cell rebinds the module-level pc_to_users / user_to_pcs frames to
# filtered versions (more than one user / more than one PC). Anything that
# reads those globals afterwards, including find_users_for_pc and
# find_pcs_for_user, sees only the filtered rows.
print("=== USER-PC ACCESS ANALYSIS ===\n")

# === 1. ALL USERS WHO HAVE ACCESS TO EACH PC ===
print("1. ALL USERS WHO HAVE ACCESS TO EACH PC")
print("="*50)

# One row per PC: sorted distinct users plus the number of events on that PC
pc_to_users = (
    df.groupby('pc')
    .agg({
        'user': lambda members: sorted(set(members)),
        'id': 'count',
    })
    .reset_index()
)
pc_to_users.columns = ['pc', 'users_with_access', 'total_accesses']
pc_to_users['unique_user_count'] = pc_to_users['users_with_access'].map(len)

# Keep only PCs shared by more than one user, most shared first
is_shared_pc = pc_to_users['unique_user_count'] > 1
pc_to_users = pc_to_users[is_shared_pc].sort_values('unique_user_count', ascending=False)

shared_counts = pc_to_users['unique_user_count']
print(f"PCs with multiple users: {len(pc_to_users)}")
print(f"Most shared PC has {shared_counts.max()} different users")
print(f"Least shared PC has {shared_counts.min()} different users")

# === 2. ALL PCs THAT EACH USER TOUCHES ===
print("2. ALL PCs THAT EACH USER TOUCHES")
print("="*50)

# One row per user: sorted distinct PCs plus the number of events by that user
user_to_pcs = (
    df.groupby('user')
    .agg({
        'pc': lambda machines: sorted(set(machines)),
        'id': 'count',
    })
    .reset_index()
)
user_to_pcs.columns = ['user', 'pcs_accessed', 'total_accesses']
user_to_pcs['unique_pc_count'] = user_to_pcs['pcs_accessed'].map(len)

# Keep only users who touch more than one PC, broadest access first
is_multi_pc_user = user_to_pcs['unique_pc_count'] > 1
user_to_pcs = user_to_pcs[is_multi_pc_user].sort_values('unique_pc_count', ascending=False)

breadth = user_to_pcs['unique_pc_count']
print(f"Users with access to multiple PCs: {len(user_to_pcs)}")
print(f"User with broadest access touches {breadth.max()} different PCs")
print(f"User with narrowest access touches {breadth.min()} different PCs")


=== USER-PC ACCESS ANALYSIS ===

1. ALL USERS WHO HAVE ACCESS TO EACH PC
PCs with multiple users: 940
Most shared PC has 12 different users
Least shared PC has 2 different users
2. ALL PCs THAT EACH USER TOUCHES
Users with access to multiple PCs: 16
User with broadest access touches 642 different PCs
User with narrowest access touches 2 different PCs


In [None]:
# Users that user_summary flags as having at least one after-hours event
flagged_rows = user_summary['has_after_hours'].eq(True)
users_with_after_hours = user_summary.loc[flagged_rows, 'user'].unique()

user_to_pcs['has_after_hours'] = user_to_pcs['user'].isin(users_with_after_hours)


# add psychometric
# NOTE(review): this path is absolute from the filesystem root, whereas
# device.csv is read relative to the working directory - confirm it is intended.
psychometric_data = pd.read_csv('/psychometric.csv')
trait_columns = ['O', 'C', 'E', 'A', 'N']

# Left join keeps every user; users without a psychometric row get NaN traits
user_to_pcs = user_to_pcs.merge(
    psychometric_data[['user_id'] + trait_columns],
    how='left',
    left_on='user',
    right_on='user_id',
)

# Combined score = O + C + E + A + N (NaN whenever any trait is missing)
ocean_total = user_to_pcs[trait_columns[0]]
for trait in trait_columns[1:]:
    ocean_total = ocean_total + user_to_pcs[trait]
user_to_pcs['OCEAN'] = ocean_total

# Only the combined score is kept; the join key and per-trait columns go
user_to_pcs = user_to_pcs.drop(columns=['user_id'] + trait_columns)

In [None]:
def categorize(value, limit):
    """Bucket a score relative to a threshold.

    Args:
        value: Score to classify. May be missing (``None``/NaN), e.g. for a
            user that had no match in the left merge that supplies the score.
        limit: Threshold the score is compared against.

    Returns:
        ``'Low'`` when ``value < limit``, ``'Medium'`` only when ``value`` is
        exactly equal to ``limit``, ``'High'`` when ``value > limit``, and
        ``'Unknown'`` when ``value`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # score would fall through to the final branch and be labelled 'High'.
    if pd.isna(value):
        return 'Unknown'
    if value < limit:
        return 'Low'
    elif value == limit:
        return 'Medium'
    else:
        return 'High'

# Label each user's combined OCEAN score relative to a threshold of 125.
# Only the combined score is categorised: the individual O/C/E/A/N columns are
# dropped once OCEAN has been computed, so a per-trait variant (threshold 25
# each) would require keeping those columns.
user_to_pcs['OCEAN_level'] = user_to_pcs['OCEAN'].apply(categorize, args=(125,))

In [None]:
# Write the current user_to_pcs frame to CSV, omitting the index column.
user_to_pcs.to_csv("device_user_to_pcs.csv", index=False)

In [None]:
# Write the current pc_to_users frame to CSV, omitting the index column.
pc_to_users.to_csv("device_pc_to_users.csv", index=False)

In [None]:
# Completion marker printed once execution reaches the end of the notebook.
print('DONE!!')

DONE!!


In [None]:
# 16 of the 265 users access more than one PC
# 940 of the 971 PCs are accessed by more than one user
# Some PCs are accessed outside of work hours (before 07:00 or from 19:00 onward)
######################################################################