In [56]:
import json
import re
import numpy as np
import random
import csv
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Layout of `data`, keyed by "<group>_<person>" ids:
#   data[id] = {
#       'attention':    {other_id: weight},
#       'proximity':    {other_id: weight},
#       'conversation': {other_id: weight},
#       'time':     timedelta recorded for the person's group (-1 until filled in),
#       'accuracy': accuracy recorded for the person's group (-1 until filled in),
#   }
data = {}

# group id -> (completion timedelta, accuracy); populated by parse_csv.
time_acc = {}

# Third '-'-separated token of a graph id -> key used inside each `data` entry.
type_map = dict(
    shared='attention',
    proximity='proximity',
    speaking='conversation',
)


def _read_graphs(path):
    """Return the 'graphs' list stored in the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)['graphs']


attention_data = _read_graphs('shared_attention_graphs.json')
proximity_data = _read_graphs('proximity_graphs.json')
conversation_data = _read_graphs('conversation_graphs.json')

def extract_group_number(graph_id):
    """Return the integer following 'group-' in ``graph_id``.

    Ids that contain no 'group-<digits>' fragment yield ``float('inf')``.
    """
    found = re.search(r'group-(\d+)', graph_id)
    if found is None:
        return float('inf')
    return int(found.group(1))

def _new_person_entry():
    """Return an empty record for one person (no edges, group stats unset)."""
    return {
        'attention': {},
        'proximity': {},
        'conversation': {},
        'time': -1,
        'accuracy': -1,
    }


def traverse_data(json):
    """Merge a list of graph blocks into the module-level ``data`` dict.

    Args:
        json: iterable of graph blocks. Each block has an 'id' whose
            '-'-separated tokens carry the group id (token 1) and the graph
            type (token 2, looked up in ``type_map``), and an 'edges' list
            whose items have 'source', 'target', 'directed' and
            'metadata' -> 'weight'. The parameter name shadows the ``json``
            module inside this function only; it is kept for compatibility.

    Side effects:
        Creates ``data`` entries for every edge endpoint, stores the edge
        weight source -> target (and target -> source when the edge is
        undirected), and copies the group's time and accuracy from
        ``time_acc`` onto both endpoints.

    Raises:
        KeyError: if the graph type is not in ``type_map`` or the group id
            of a block with edges is not in ``time_acc``.
    """
    for block in json:
        info = block['id'].split('-')
        gid = info[1]
        gtype = type_map[info[2]]
        for edge in block['edges']:
            sid = f'{gid}_{edge["source"]}'
            tid = f'{gid}_{edge["target"]}'
            weight = edge['metadata']['weight']

            for pid in (sid, tid):
                if pid not in data:
                    data[pid] = _new_person_entry()

            if not edge['directed']:
                data[tid][gtype][sid] = weight
            data[sid][gtype][tid] = weight

            # Both endpoints are in group ``gid`` (their ids are built from
            # it), so both receive the group's stats. Previously only the
            # source did, leaving target-only people stuck at -1.
            for pid in (sid, tid):
                data[pid]['time'] = time_acc[gid][0]
                data[pid]['accuracy'] = time_acc[gid][1]

def string_to_timedelta(time_str):
    """Parse a string of the form '<D> days HH:MM:SS[.fraction]' into a timedelta.

    The fractional-seconds part is optional and, when present, is kept
    (timedelta resolution is microseconds, so finer digits are rounded).

    Args:
        time_str: e.g. '0 days 00:10:52' or '0 days 00:10:52.000000000'.

    Returns:
        datetime.timedelta for the given duration.

    Raises:
        ValueError: if the string does not match the expected layout.
    """
    days, time_part = time_str.split(' days ')
    # Splitting only on ':' keeps 'SS.fraction' together so float() can read
    # it; the earlier version required a fraction and then discarded it.
    hours, minutes, seconds = time_part.split(':')
    return timedelta(
        days=int(days),
        hours=int(hours),
        minutes=int(minutes),
        seconds=float(seconds),
    )

def parse_csv(csv_path):
    """Load per-group completion time and accuracy into ``time_acc``.

    The file is expected to have a header row followed by rows whose first
    three columns are: group id, duration string (see
    ``string_to_timedelta``), accuracy.

    Args:
        csv_path: path of the CSV file to read.

    Side effects:
        Sets ``time_acc[group_id] = (timedelta, accuracy)`` for every row.
    """
    # newline='' is what the csv module asks for so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(csv_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            gid = row[0]
            delta = string_to_timedelta(row[1])
            accuracy = float(row[2])
            time_acc[gid] = (delta, accuracy)

# The CSV must be parsed first: traverse_data reads time_acc for every edge.
parse_csv('completion_time_and_accuracy.csv')
for graphs in (attention_data, proximity_data, conversation_data):
    traverse_data(graphs)

In [57]:
# Spot-check one assembled record (id "10_A", i.e. "<group>_<person>").
print(data["10_A"])

{'attention': {'10_B': 504, '10_D': 196, '10_C': 640}, 'proximity': {'10_B': 328, '10_C': 185, '10_D': 150}, 'conversation': {'10_B': 96.354, '10_C': 96.354, '10_D': 96.354}, 'time': datetime.timedelta(seconds=652), 'accuracy': 50.0}


In [58]:
def give_cluster_env(data):
    """Build one [attention, proximity, conversation] feature row per person.

    Each feature is the mean of the edge weights stored for that person under
    the corresponding key. Every id except "10_B" is emitted in dict order;
    "10_B" is appended last, with its conversation feature replaced by the
    average of the conversation means of "10_A", "10_C" and "10_D".

    NOTE(review): the "10_B" special case is hard-coded; presumably that
    person has no conversation weights of their own -- confirm, and consider
    generalising to any person with an empty 'conversation' dict.

    Args:
        data: mapping id -> record with 'attention', 'proximity' and
            'conversation' dicts of weights. Must contain "10_A".."10_D".

    Returns:
        List of 3-element lists, one per person.
    """
    def mean_weight(person, kind):
        return np.mean(list(data[person][kind].values()))

    rows = [
        [
            mean_weight(person, 'attention'),
            mean_weight(person, 'proximity'),
            mean_weight(person, 'conversation'),
        ]
        for person in list(data.keys())
        if person != "10_B"
    ]

    special_attention = mean_weight("10_B", 'attention')
    special_proximity = mean_weight("10_B", 'proximity')
    borrowed_conversation = (
        mean_weight("10_A", 'conversation')
        + mean_weight("10_C", 'conversation')
        + mean_weight("10_D", 'conversation')
    ) / 3
    rows.append([special_attention, special_proximity, borrowed_conversation])

    return rows

In [59]:
def dis(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``."""
    offset = np.array(x) - np.array(y)
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

In [61]:
# Disabled alternative kept as a bare string literal (never executed): a
# k-means++-style seeding that picks the first center uniformly at random and
# each further center with probability proportional to its squared distance
# from the nearest already-chosen center. The active initialize_centers is
# the plain random-sample version defined right after this string.
'''
def initialize_centers(data, k):
    n_samples, _ = data.shape
    centers = []
    # Step 1: Choose the first center randomly
    centers.append(data[random.randint(0, n_samples - 1)])

    for _ in range(1, k):
        # Step 2: Compute the squared distances from each point to the nearest center
        distances = np.array([
            min(np.sum((x - center) ** 2) for center in centers)
            for x in data
        ])
        # Step 3: Choose the next center with a probability proportional to the squared distances
        probabilities = distances / distances.sum()
        cumulative_probabilities = np.cumsum(probabilities)
        r = random.random()
        for idx, prob in enumerate(cumulative_probabilities):
            if r <= prob:
                centers.append(data[idx])
                break
    return np.array(centers)
'''
def initialize_centers(data, k):
    """Pick ``k`` distinct rows of ``data`` at random as starting centers.

    Args:
        data: numpy array of points (must support ``.tolist()``).
        k: number of centers to draw.

    Returns:
        List of ``k`` points, each a plain Python list.
    """
    candidates = data.tolist()
    return random.sample(candidates, k)



In [63]:
def assign_clusters(data, centers):
    """Label every point with the index of its nearest center.

    Args:
        data: iterable of points.
        centers: sequence of center points.

    Returns:
        List with one index (as returned by ``np.argmin``) per point; ties go
        to the lowest index.
    """
    def nearest(point):
        return np.argmin([dis(point, center) for center in centers])

    return [nearest(point) for point in data]

def update_centers(data, cluster_labels, k):
    """Recompute each of the ``k`` centers from its assigned points.

    Args:
        data: indexable collection of points.
        cluster_labels: cluster index for each point, aligned with ``data``.
        k: number of clusters.

    Returns:
        List of ``k`` centers. A cluster with members gets their mean; an
        empty cluster gets a randomly chosen point from ``data``.
    """
    def center_for(cluster_index):
        members = [
            point
            for position, point in enumerate(data)
            if cluster_labels[position] == cluster_index
        ]
        if not members:
            # Re-seed an empty cluster with an arbitrary data point.
            return random.choice(data)
        return np.mean(members, axis=0)

    return [center_for(cluster_index) for cluster_index in range(k)]

In [64]:
def Kmeans(data, k, iterations):
    """Cluster ``data`` into ``k`` groups by alternating assignment and
    center-update steps.

    Args:
        data: numpy array of points (``initialize_centers`` calls
            ``data.tolist()`` on it).
        k: number of clusters.
        iterations: maximum number of center updates. Values <= 0 return the
            initial centers and their assignment instead of failing.

    Returns:
        (centers, cluster_labels): the final centers and, for each point, the
        index of the nearest of those centers.
    """
    centers = initialize_centers(data, k)
    # Assign before the loop so labels are always defined and always refer to
    # the centers that are returned, not to the ones before the last update.
    cluster_labels = assign_clusters(data, centers)
    for _ in range(iterations):
        new_centers = update_centers(data, cluster_labels, k)
        converged = np.array_equal(np.asarray(new_centers), np.asarray(centers))
        centers = new_centers
        if converged:
            # Unchanged centers mean further iterations would repeat the same
            # assignment and update, so stop early.
            break
        cluster_labels = assign_clusters(data, centers)
    return centers, cluster_labels

def Kmeans_wrapper(data, k, iterations):
    """Validate the inputs, convert ``data`` to a numpy array and run Kmeans.

    Args:
        data: non-empty sequence (list or numpy array) of points.
        k: number of clusters; must be positive and at most ``len(data)``.
        iterations: number of iterations; must be positive.

    Returns:
        (final_centers, labels) as returned by ``Kmeans``.

    Raises:
        ValueError: if ``data`` is empty/None, ``k`` or ``iterations`` is not
            positive, or ``k`` exceeds the number of points.
    """
    # len() rather than truthiness: ``not data`` is ambiguous for numpy arrays.
    if data is None or len(data) == 0 or k <= 0 or iterations <= 0:
        raise ValueError("Invalid input parameters.")
    data_array = np.array(data)
    if k > len(data_array):
        # Initial centers are sampled without replacement from the points.
        raise ValueError("k cannot exceed the number of data points.")
    final_centers, labels = Kmeans(data_array, k, iterations)
    return final_centers, labels

In [65]:
def use_kmeans(data, k=4, iterations=500):
    """Cluster the people in ``data`` and return their cluster labels.

    Features are built with ``give_cluster_env`` and clustered with
    ``Kmeans_wrapper``; the final centers and labels are printed.

    Args:
        data: mapping of person id -> record, as expected by
            ``give_cluster_env``.
        k: number of clusters (default 4, the previously hard-coded value).
        iterations: iteration count passed to ``Kmeans_wrapper``
            (default 500, the previously hard-coded value).

    Returns:
        List of cluster labels, in the row order of ``give_cluster_env``.
    """
    final_centers, labels = Kmeans_wrapper(give_cluster_env(data), k, iterations)
    print("Final cluster centers:", final_centers)
    print("Cluster labels:", labels)
    return labels

In [66]:
def count(data, k=4):
    """Count how many labels in ``data`` equal each of ``0 .. k-1``.

    Args:
        data: iterable of cluster labels.
        k: number of buckets (default 4, the previously hard-coded value).

    Returns:
        List of ``k`` ints; element ``i`` is the number of labels equal to
        ``i``. Labels outside ``0 .. k-1`` are ignored.
    """
    totals = [0] * k
    for label in data:
        # Equality comparison (not indexing) so non-int labels that compare
        # equal to a bucket, e.g. numpy integers, are counted as before.
        for bucket in range(k):
            if label == bucket:
                totals[bucket] += 1
                break
    return totals

In [None]:
def find_std(n):
    """Run the clustering ``n`` times and measure how stable cluster sizes are.

    Each run clusters the module-level ``data`` with ``use_kmeans``, counts
    the members of each cluster and sorts those counts ascending, so runs are
    comparable regardless of how cluster indices were numbered.

    Args:
        n: number of clustering runs.

    Returns:
        Array with the standard deviation, across runs, of each position in
        the sorted cluster-size vector.
    """
    sorted_sizes = [np.sort(count(use_kmeans(data))) for _ in range(n)]
    return np.std(sorted_sizes, axis=0)

In [67]:

# For a quick test, a small list of points can be substituted, e.g.:
# data = [[1, 2, 3, 4], [5, 6, 7, 8], [2, 3, 4, 5], [8, 7, 6, 5], [3, 4, 5, 6]]

# The clustering draws its starting centers (and empty-cluster replacements)
# from the `random` module, so seed it to make this cell reproducible.
SEED = 0
N_RUNS = 30
random.seed(SEED)

# Sorted cluster sizes of each run; sorting makes runs comparable even though
# cluster indices are arbitrary.
outputs = []
for _ in range(N_RUNS):
  outputs.append(np.sort(count(use_kmeans(data))))
np.std(outputs,axis=0)

def visualization(data,):
    """Placeholder: the original definition had no body, which is a SyntaxError.

    TODO: implement. Presumably meant to plot the clustering results for
    ``data`` -- confirm the intended figure before filling this in.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("visualization() has not been implemented yet.")

Final cluster centers: [array([657.07407407, 153.62962963,  70.97244444]), array([1899.33333333,  450.33333333,   79.03725   ]), array([107.57407407, 142.40740741,  54.41238889]), array([436.41025641, 219.07692308,  79.87023077])]
Cluster labels: [2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 3, 3, 3, 3, 0, 3, 0, 0, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 3]
Final cluster centers: [array([1899.33333333,  450.33333333,   79.03725   ]), array([ 79.64102564, 127.66666667,  54.914     ]), array([536.96825397, 195.53968254,  77.24461905]), array([201.94444444, 171.33333333,  53.4115    ])]
Cluster labels: [3, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 3, 1, 1, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, 2, 2]
Final cluster centers: [array([145.78571429, 159.45238095,  64.57314286]), array([41.2       , 91.06666667, 26.0654    ]), array([1899.33333333,  450.33333333,   79.03725   ]), array([536.96825397, 195.53968254,  77.24461905])]
Cluster labe

array([0.54160256, 2.06047459, 1.82452429, 1.39403491])

In [None]:
# Display the per-run sorted cluster sizes collected in the experiment cell.
outputs