# In [None]:
import csv
import os
import sys

import dateutil.parser


# Largest gap, in seconds, allowed between a search event and the test event
# it is matched to (5 minutes).
maximum_look_up_time_window = 5 * 60

#file_path = "LiveLab_top_15_MATHia_7x_classes_anon.csv"

# CSV file of test events (read with csv.reader by process_test_events).
test_file_path = 'test-changed.csv'
# Tab-separated file of search events (read by process_search_actions).
search_file_path = 'Analysis-critical_struggle-x890834_output.txt'  # Change to target file

# A search event is only matched to a test event that happens at or after it.

def process_search_actions(search_file_path):
    """Load search events from a tab-separated file, grouped by student.

    The first line is treated as a header and skipped. For every other line
    the first field is the student id and the second field is an ISO-8601
    timestamp. Blank lines and lines with fewer than two fields are skipped,
    as are lines whose timestamp is empty, 'null' or unparsable.

    Args:
        search_file_path: Path of the tab-separated search-event file.

    Returns:
        A dict mapping student id to a dict that maps each event's timestamp
        (a datetime with tzinfo removed) to the full list of fields of that
        line. A later line with the same student id and timestamp replaces
        the earlier one. A student whose lines all had invalid timestamps
        still gets an (empty) entry.
    """
    search_event_map = {}

    with open(search_file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if line_number == 1:
                # Header line.
                continue

            line = line.strip()
            tokens = line.split('\t')
            if len(tokens) < 2:
                # Blank or truncated line: there is no timestamp field to read.
                if line:
                    print(f"Skipping malformed line {line_number}: {line}")
                continue

            student_id = tokens[0]
            search_event_map.setdefault(student_id, {})

            # Check that the timestamp is present (not 'null' or empty).
            timestamp_str = tokens[1]
            if not timestamp_str or timestamp_str == 'null':
                print(f"Skipping invalid timestamp: {line}")
                continue

            try:
                # Drop any UTC offset so the value is a naive datetime.
                timestamp = dateutil.parser.isoparse(timestamp_str).replace(tzinfo=None)
            except ValueError:
                print(f"Skipping invalid timestamp: {timestamp_str}")
                continue

            search_event_map[student_id][timestamp] = tokens

    return search_event_map

def process_test_events(test_file_path):
    """Load test events from a CSV file, grouped by student.

    The first row is returned as the column titles. In every other row the
    first cell is the student id and the third cell is the timestamp. Blank
    rows and rows with fewer than three cells are skipped.

    Args:
        test_file_path: Path of the CSV file of test events.

    Returns:
        A tuple (test_event_map, titles). test_event_map maps student id to a
        list of dicts with keys 'id' (1-based position of the row among the
        rows after the header), 'time' (the parsed timestamp) and 'row' (the
        list of cells). titles is the header row, or [] for an empty file.

    Raises:
        ValueError: If a timestamp cannot be parsed by dateutil.
    """
    test_event_map = {}
    titles = []

    # newline='' lets the csv module do its own newline handling.
    with open(test_file_path, newline='') as csvfile:
        for index, row in enumerate(csv.reader(csvfile)):
            if index == 0:
                titles = row
                continue

            if len(row) < 3:
                # csv.reader yields [] for a blank line; shorter rows have no
                # timestamp column to read.
                if row:
                    print(f"Skipping malformed row {index}: {row}")
                continue

            student_id = row[0]
            # Convert the timestamp string to a datetime object.
            timestamp = dateutil.parser.parse(row[2])
            test_event_map.setdefault(student_id, []).append(
                {'id': index, 'time': timestamp, 'row': row}
            )

    return test_event_map, titles

def get_time_diff(test_datetime, search_datetime):
    """Return test_datetime - search_datetime in seconds.

    The result is positive when the test event is later than the search
    event. When exactly one of the two datetimes carries a UTC offset, the
    offset is dropped from it and the two are compared as wall-clock values,
    because Python refuses to subtract an aware datetime from a naive one.
    When both are naive or both are aware, ordinary subtraction is used.

    Args:
        test_datetime: datetime of the test event.
        search_datetime: datetime of the search event.

    Returns:
        The difference as a float number of seconds.
    """
    test_is_aware = test_datetime.utcoffset() is not None
    search_is_aware = search_datetime.utcoffset() is not None

    if test_is_aware != search_is_aware:
        test_datetime = test_datetime.replace(tzinfo=None)
        search_datetime = search_datetime.replace(tzinfo=None)

    return (test_datetime - search_datetime).total_seconds()


def search(search_event_map, test_event_map, titles, output_file_path):
    """Match search events to test events and write the combined CSV.

    For each student present in both maps, every search event is matched to
    the closest test event that happens at or after it and no more than
    maximum_look_up_time_window seconds later. Each match produces one output
    row: the test row followed by fields 1-3 of the search record (missing
    fields are written as empty strings). Test events that no search event
    matched are written afterwards with 'False' in the three extra columns.

    The inputs are not modified: rows and titles are copied before the extra
    columns are added, so a test event matched by several search events
    yields several rows that each carry exactly three extra columns.

    Args:
        search_event_map: dict of student id -> {datetime: list of fields}.
        test_event_map: dict of student id -> list of dicts with keys
            'id', 'time' and 'row'.
        titles: Header cells of the test file.
        output_file_path: Path of the CSV file to write.
    """
    extra_titles = ['Time', 'Detector_Name', 'Value']
    combined_results = {}
    visited_test_data = {}

    for student_id, test_events in test_event_map.items():
        if student_id not in search_event_map:
            continue

        search_events = search_event_map[student_id]
        matches = combined_results[student_id] = []

        # Sort once per student (latest first) instead of once per search event.
        latest_first = sorted(
            test_events, key=lambda event: event['time'], reverse=True
        )

        for search_datetime, search_record in search_events.items():
            smallest_gap = maximum_look_up_time_window
            matched_event = None

            for test_event in latest_first:
                gap = get_time_diff(test_event['time'], search_datetime)

                if gap < 0:
                    # This and all remaining test events precede the search event.
                    break

                if gap > smallest_gap:
                    continue

                smallest_gap = gap
                matched_event = test_event

            if matched_event is None:
                continue

            visited_test_data.setdefault(student_id, set()).add(matched_event['id'])

            # Always exactly three detector fields, padded if the line was short.
            detector_fields = (list(search_record[1:4]) + [''] * 3)[:3]
            # Copy the row: the same test event can match several search events.
            matches.append(list(matched_event['row']) + detector_fields)

        print('find ' + str(len(matches)) + ' records from student_id ' + student_id)

    # newline='' stops the csv module's line endings being translated again.
    with open(output_file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list(titles) + extra_titles)

        for records in combined_results.values():
            writer.writerows(records)

        # Write the test events that were never matched.
        for student_id, test_events in test_event_map.items():
            matched_ids = visited_test_data.get(student_id, ())
            for test_event in test_events:
                event_id = test_event['id']
                if event_id in matched_ids:
                    print('find matched records from student_id and id', student_id, event_id)
                    continue
                writer.writerow(list(test_event['row']) + ['False'] * 3)


# Process events from the test file and load the events from the search file.
search_event_map = process_search_actions(search_file_path)
test_event_map, titles = process_test_events(test_file_path)

# The output is written next to the search file as "combined-<name>.csv".
# splitext replaces whatever extension the search file has (not only
# three-letter ones), and the prefix is applied to the file name only so a
# search path that includes a directory still yields a usable path.
search_dir, search_name = os.path.split(search_file_path)
output_file_path = os.path.join(
    search_dir, "combined-" + os.path.splitext(search_name)[0] + '.csv'
)

# Combine search events into the test events, and output with test events.
search(search_event_map, test_event_map, titles, output_file_path)