In [2]:
from pathlib import Path

def read_and_sort_concerts(file_path):
    """Read per-genre concert counts and return them sorted by count, descending.

    The file is expected to hold one ``genre: count`` entry per line. Blank
    lines and lines whose first non-blank character is ``#`` are ignored.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the text file.

    Returns:
        A list of ``(genre, count)`` tuples ordered from most to fewest
        concerts. Genres with equal counts keep their order of first
        appearance; if a genre is listed more than once, the last count wins.

    Raises:
        ValueError: If a data line has no ``:`` separator or its count is not
            an integer.
    """
    concerts = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            # Strip first so indented comments and whitespace-only lines are skipped too
            if not line or line.startswith('#'):
                continue
            # Split on the LAST colon so that a genre name may itself contain ':'
            genre, separator, count = line.rpartition(':')
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'genre: count', got {line!r}"
                )
            concerts[genre.strip()] = int(count.strip())

    # sorted() is stable, also with reverse=True, so ties keep file order
    return sorted(concerts.items(), key=lambda item: item[1], reverse=True)

# The data folder sits next to the notebook's working directory.
file_path = Path.cwd().parent.joinpath('Data', 'grupee_data', 'n_concerts.txt')
sorted_concerts = read_and_sort_concerts(file_path)

# One "genre: count" line per genre, busiest genre first.
for genre, count in sorted_concerts:
    print(f"{genre}: {count}")

Spirituality & Religion: 60
International Pop: 40
Indie Rock/Rock pop: 38
Dancefloor: 36
Singer & Songwriter: 30
Indie Pop: 30
Kids & Family: 30
Pop: 29
Techno/House: 29
Rap/Hip Hop: 28
Contemporary R&B: 27
Indie Rock: 25
Classical: 24
Electro Pop/Electro Rock: 24
R&B: 24
Folk: 23
Vocal jazz: 23
Romantic: 22
Dubstep: 22
Dub: 22
Electro: 20
Indie Pop/Folk: 18
Alternative: 18
Hard Rock: 18
Metal: 15
Rock: 14
Blues: 14
Electro Hip Hop: 13
Country Blues: 13
East Coast: 13
Comedy: 12
West Coast: 12
Jazz: 12
Trance: 11
Musicals: 10
Dancehall/Ragga: 10
Latin Music: 10
Old school soul: 9
Contemporary Soul: 9
Classic Blues: 9
Jazz Hip Hop: 8
Instrumental jazz: 8
Dance: 8
Soul & Funk: 8
Alternative Country: 6
Rock & Roll/Rockabilly: 6
Reggae: 5
Asian Music: 4
African Music: 4
Opera: 4
Chill Out/Trip-Hop/Lounge: 3
Kids: 3
Bluegrass: 2
Acoustic Blues: 2
Traditional Country: 2
Country: 2
Baroque: 1
Bolero: 1
Indian Music: 1
Electric Blues: 1
Soundtracks: 0
Disco: 0
Old School: 0
Urban Cowboy: 0
Tro

In [19]:
from utility import graph
import json
import csv
import numpy as np
from alive_progress import alive_bar
from pathlib import Path

CONCERT_POP_FEATURE = False
SAVE_TOP_INDICES = False  # set to True to write top_indices to b_team_8.txt
WEEKS_PER_YEAR = 52.1429

current_path = Path.cwd().parent
concerts = graph.read_concerts(current_path /'Data'/'grupee_data'/'n_concerts.txt')
# Convert each genre's concert count into a number of concerts per two weeks
# (one float per genre, in the order returned by graph.read_concerts).
# NOTE(review): dividing by the weeks in a year suggests the counts are yearly totals - confirm.
concert_per_two_weeks_scaler = [((vc[1] / WEEKS_PER_YEAR) * 2.0) for vc in concerts]

# Extract the preferences of the grupees
# NOTE(review): the code below relies on preferences mapping an id string to a
# per-genre sequence whose entries equal '1' for liked genres - confirm against the data.
preferences_path = current_path /'Data'/'grupee_data'/'preferences.json'
with open(preferences_path, 'r') as file:
    preferences = json.load(file)

# Read the connections between grupees
friends_path = current_path /'Data'/'grupee_data'/'friends.csv'
with open(friends_path, 'r') as file:
    reader = csv.reader(file)
    next(reader, None)  # first line contains an unnecessary comment
    friend_pairs = list(reader)

# probability to infect a friend depending on preferences
both_like = 393/1000
one_like = 18/1000
neither_like = 2/1000
genre_count = len(preferences[friend_pairs[0][0]])
num_grupees = len(preferences)

# risk_per_id[id][genre]: summed infection probability over all friendships of `id`.
# Ids are used as list indices, so they must be integers in range(num_grupees).
risk_per_id = [[0] * genre_count for _ in range(num_grupees)]
num_friends_per_id = [0] * num_grupees
with alive_bar(len(friend_pairs), title='Processing Friend IDs') as bar:
    for f_id in friend_pairs: # f_id is a list of two friend ids
        pref_1 = preferences[f_id[0]]
        pref_2 = preferences[f_id[1]]
        id_1, id_2 = int(f_id[0]), int(f_id[1])
        num_friends_per_id[id_1] += 1
        num_friends_per_id[id_2] += 1
        for i, (p1, p2) in enumerate(zip(pref_1, pref_2)): # iterate over the preferred genres of the two friends
            if p1 == '1' and p2 == '1':
                pair_risk = both_like
            elif p1 == '1' or p2 == '1':
                pair_risk = one_like
            else:
                pair_risk = neither_like
            # The friendship contributes the same risk to both of its ends.
            # Accumulating in place avoids rebuilding two lists per friendship.
            risk_per_id[id_1][i] += pair_risk
            risk_per_id[id_2][i] += pair_risk
        bar()

if CONCERT_POP_FEATURE:
    # TAKE CONCERT POPULATION AS FEATURE
    risk_per_id = np.array(risk_per_id)
    # Sum over all grupees to get one total risk per genre
    sum_concert_risk_per_id = np.sum(risk_per_id, axis=0)
    # Min-max normalise the per-genre totals to [0, 1]
    min_val = np.min(sum_concert_risk_per_id)
    max_val = np.max(sum_concert_risk_per_id)
    normalized_sum_concert_risk_per_id = (sum_concert_risk_per_id - min_val) / (max_val - min_val)
    risk_per_id = [[x * y for x, y in zip(row, normalized_sum_concert_risk_per_id)] for row in risk_per_id]

# SCALE RISK PER ID BY CONCERTS PER TWO WEEKS
# The per-genre concert frequency is min-max normalised to [0, 1] and used as a weight.
concert_per_two_weeks_scaler = np.array(concert_per_two_weeks_scaler)
min_val = np.min(concert_per_two_weeks_scaler)
max_val = np.max(concert_per_two_weeks_scaler)
normalized_concert_per_two_weeks_scaler = (concert_per_two_weeks_scaler - min_val) / (max_val - min_val)
scaled_risk_per_id = [[x * y for x, y in zip(row, normalized_concert_per_two_weeks_scaler)] for row in risk_per_id] # theoretically could scale by concerts instead

# Sum the elements of each row in scaled_risk_per_id: one total risk per grupee
sum_scaled_risk_per_id = [sum(row) for row in scaled_risk_per_id]
test = np.array(sum_scaled_risk_per_id)  # variable name kept in case other cells read it

# Get the maximum value
max_value = np.max(test)

# Get the index of the maximum value
max_index = np.argmax(test)

print(f"Maximum value: {max_value}")
print(f"Index of maximum value: {max_index}")
# Calculate the number of top elements to select (12% of the total length)
top_percentage = 0.12
num_top_elements = int(len(sum_scaled_risk_per_id) * top_percentage)

# Get the indices of the top elements, riskiest first.
# Reversing before slicing yields an empty selection when num_top_elements is 0;
# a `[-0:]` slice would select every element instead.
top_indices = np.argsort(sum_scaled_risk_per_id)[::-1][:num_top_elements]
# Get the values of the top elements
top_values = [sum_scaled_risk_per_id[i] for i in top_indices]

print(f"Top {top_percentage:.0%} indices:", top_indices)
print(f"Top {top_percentage:.0%} values:", top_values)
if len(top_indices) > 0:
    print("number 1:", top_indices[0])
    print("value number 1:", sum_scaled_risk_per_id[top_indices[0]])
    print("value number 1:", top_values[0])

if SAVE_TOP_INDICES:
    # Save the top_indices to a text file
    output_file_path = Path.cwd() / 'b_team_8.txt'
    with output_file_path.open('w') as file:
        for index in top_indices:
            file.write(f"{index}\n")

# top 12% of the riskiest people to be affected by the virus (due to visitation of concerts with friends)
# FROM HERE ON:
# top_indices: array of ids of the top 12% of the riskiest people to be affected by the virus (sorted by risk)
# top_values: list of the corresponding risks of the top 12% of the riskiest people to be affected by the virus
# risk_per_id: list of lists of risks per genre for each grupee
# scaled_risk_per_id: list of lists of scaled risks per genre for each grupee (scaled by the normalised number of concerts per two weeks)
# num_friends_per_id: list of the number of friends for each grupee
# concerts: list of tuples of genres and the number of concerts for each genre
# concert_per_two_weeks_scaler: array of the number of concerts per two weeks for each genre (might be useful for simulation)


Processing Friend IDs |████████████████████████████████████████| 55483/55483 [100%] in 2.5s (22554.28/s) 
Maximum value: 45.059816666666684
Index of maximum value: 1612
Top 12% indices: [1612  419 1210  312  506  294 1197  108 3764 1315  715 2435 1687  587
  160 1974  327 4011 2595  694 3664 2654 7233  855 4207 1805  446 2242
 1923 7296 6678  492 1947  725  600 1658  260 4024 1837  269 6005  366
 3578  904 5569 4352 2237 3322  532 8201  416  554 4726  588 2020   48
 4017  615 3660  552 2894  444  168 2408  590  300  279  177 1305 4022
 4013 7273   17  942 7060    2 1886  114 1806 2117 4935  480  384  349
 1059 5139  430 3498 1484  768  342  328  391 1526 1709  544 2788  468
 1504 2004 3417  985 4006  102 5486 5797 7392 6568 7359 3738 3495 8095
  306  443 2003  249  101  243 5254  320  555 5560  182  388 4977  284
 4702 2010 4060 5775 2244 4530 4113  621  811  118  104 1877 2898 7871
  365  909 2604  252 1304 7278 1408 4786 1316 4761 3081 5443  520  149
 7261 1232 7258 1530  363  598 45

'\n# Save the top_indices to a text file\noutput_file_path = Path.cwd() / \'b_team_8.txt\'\nwith output_file_path.open(\'w\') as file:\n    for index in top_indices:\n        file.write(f"{index}\n")\n'

In [None]:
from utility import graph
import json
import csv
import numpy as np
from alive_progress import alive_bar
from pathlib import Path

CONCERT_POP_FEATURE = False
WEEKS_PER_YEAR = 52.1429

current_path = Path.cwd().parent
concerts = graph.read_concerts(current_path /'Data'/'grupee_data'/'n_concerts.txt')
# Convert each genre's concert count into a number of concerts per two weeks
# (one float per genre, in the order returned by graph.read_concerts).
concert_per_two_weeks_scaler = np.array([((vc[1] / WEEKS_PER_YEAR) * 2.0) for vc in concerts])
# The min-max normalised concert frequency is the per-genre weight. It does not
# depend on which grupees were removed, so it is computed once, outside the loop.
min_val = np.min(concert_per_two_weeks_scaler)
max_val = np.max(concert_per_two_weeks_scaler)
normalized_concert_per_two_weeks_scaler = (concert_per_two_weeks_scaler - min_val) / (max_val - min_val)

# Extract the preferences of the grupees
# NOTE(review): the code below relies on preferences mapping an id string to a
# per-genre sequence whose entries equal '1' for liked genres - confirm against the data.
preferences_path = current_path /'Data'/'grupee_data'/'preferences.json'
with open(preferences_path, 'r') as file:
    preferences = json.load(file)

# Read the connections between grupees
friends_path = current_path /'Data'/'grupee_data'/'friends.csv'
with open(friends_path, 'r') as file:
    reader = csv.reader(file)
    next(reader, None)  # first line contains an unnecessary comment
    friend_pairs = list(reader)

# probability to infect a friend depending on preferences
both_like = 393/1000
one_like = 18/1000
neither_like = 2/1000
genre_count = len(preferences[friend_pairs[0][0]])
num_grupees = len(preferences)
print("Number of grupees:", num_grupees)
top_percentage = 0.12
num_to_vaccinate = int(num_grupees * top_percentage)
num_top_elements = num_to_vaccinate

# The per-genre risk of a friendship only depends on the two preference lists,
# so it is computed once per friendship instead of once per greedy iteration.
# friendships_per_id[id] holds (friend id, per-genre risk) in file order; keeping
# file order makes every later sum add its terms in the same order as a full
# pass over friend_pairs would.
friendships_per_id = [[] for _ in range(num_grupees)]
for f_id in friend_pairs: # f_id is a list of two friend ids
    pref_1 = preferences[f_id[0]]
    pref_2 = preferences[f_id[1]]
    id_1, id_2 = int(f_id[0]), int(f_id[1])
    pair_risk = []
    for p1, p2 in zip(pref_1, pref_2): # iterate over the preferred genres of the two friends
        if p1 == '1' and p2 == '1':
            pair_risk.append(both_like)
        elif p1 == '1' or p2 == '1':
            pair_risk.append(one_like)
        else:
            pair_risk.append(neither_like)
    friendships_per_id[id_1].append((id_2, pair_risk))
    friendships_per_id[id_2].append((id_1, pair_risk))

removed_nodes = []       # selected ids in selection order (written to the output file)
removed_lookup = set()   # the same ids, for constant-time membership tests


def _live_risk_row(node):
    """Return (per-genre risk, friend count) of `node` over its remaining friendships.

    A friendship is ignored as soon as either of its two ends has been removed,
    so a removed node gets an all-zero row and a friend count of 0.
    """
    row = [0] * genre_count
    friend_total = 0
    if node in removed_lookup:
        return row, friend_total
    for friend, pair_risk in friendships_per_id[node]:
        if friend in removed_lookup:
            continue
        friend_total += 1
        for genre_index, risk in enumerate(pair_risk):
            row[genre_index] += risk
    return row, friend_total


unscaled_risk_per_id = [[0] * genre_count for _ in range(num_grupees)]
num_friends_per_id = [0] * num_grupees
risk_per_id = unscaled_risk_per_id
scaled_risk_per_id = [None] * num_grupees
sum_scaled_risk_per_id = [0.0] * num_grupees
stale_nodes = range(num_grupees)  # nodes whose rows must be recomputed before the next selection

with alive_bar(num_to_vaccinate, title='Finding highest risks') as bar:
    for _ in range(num_to_vaccinate):
        # Removing a node only changes its own row and the rows of its friends,
        # so only those rows are rebuilt (from scratch, hence without rounding drift).
        for node in stale_nodes:
            unscaled_risk_per_id[node], num_friends_per_id[node] = _live_risk_row(node)

        if CONCERT_POP_FEATURE:
            # TAKE CONCERT POPULATION AS FEATURE
            risk_matrix = np.array(unscaled_risk_per_id)
            # Sum over all grupees to get one total risk per genre
            sum_concert_risk_per_id = np.sum(risk_matrix, axis=0)
            # Min-max normalise the per-genre totals to [0, 1]
            pop_min = np.min(sum_concert_risk_per_id)
            pop_max = np.max(sum_concert_risk_per_id)
            normalized_sum_concert_risk_per_id = (sum_concert_risk_per_id - pop_min) / (pop_max - pop_min)
            risk_per_id = [[x * y for x, y in zip(row, normalized_sum_concert_risk_per_id)] for row in risk_matrix]
            # The genre totals changed, so every scaled row has to be refreshed.
            stale_nodes = range(num_grupees)
        else:
            risk_per_id = unscaled_risk_per_id

        # SCALE RISK PER ID BY CONCERTS PER TWO WEEKS and sum each row to one total per grupee
        for node in stale_nodes:
            scaled_row = [x * y for x, y in zip(risk_per_id[node], normalized_concert_per_two_weeks_scaler)]
            scaled_risk_per_id[node] = scaled_row
            sum_scaled_risk_per_id[node] = sum(scaled_row)

        # Get the indices of the top elements, riskiest first
        top_indices = np.argsort(sum_scaled_risk_per_id)[::-1][:num_top_elements]
        # Get the values of the top elements
        top_values = [sum_scaled_risk_per_id[index] for index in top_indices]
        top_indice = int(top_indices[0])
        if top_indice in removed_lookup:
            # A removed node has total risk 0, so it can only come out on top when
            # no remaining grupee has a positive risk; selecting it again would
            # merely duplicate an id in the output.
            print("No remaining grupee with positive risk; stopping after", len(removed_nodes), "selections")
            break
        removed_nodes.append(top_indice)
        removed_lookup.add(top_indice)
        stale_nodes = {top_indice}
        stale_nodes.update(friend for friend, _ in friendships_per_id[top_indice] if friend not in removed_lookup)
        bar()

print("removed nodes:", removed_nodes)

# Save the removed nodes to a text file, one id per line, in selection order
output_file_path = Path.cwd() / 'new_a_team_8.txt'
with output_file_path.open('w') as file:
    for index in removed_nodes:
        file.write(f"{index}\n")

# FROM HERE ON:
# removed_nodes: list of the greedily selected ids; each pick is the grupee with the highest summed scaled risk once all earlier picks are removed
# The following reflect the state at the moment of the LAST selection (i.e. before that last node was removed):
# top_indices: array of ids of the top 12% of the riskiest people (sorted by risk)
# top_values: list of the corresponding risks
# risk_per_id: list of lists of risks per genre for each grupee
# scaled_risk_per_id: list of lists of scaled risks per genre for each grupee (scaled by the normalised number of concerts per two weeks)
# num_friends_per_id: list of the number of remaining friends for each grupee
# concerts: list of tuples of genres and the number of concerts for each genre
# concert_per_two_weeks_scaler: array of the number of concerts per two weeks for each genre (might be useful for simulation)


Number of grupees: 8311
Finding highest risks |▏⚠︎                                      | (!) 2/997 [0%] in 7.4s (0.27/s) 


KeyboardInterrupt: 

In [None]:
import networkx as nx
# get covered vertices of friend pairs depending on the top_indices
def load_friends():
    """Load the friendship list from ``friends.csv`` as an integer array.

    The file is read from ``current_path / "Data" / "grupee_data"``, where
    ``current_path`` is a notebook-level variable that must already exist.

    Returns:
        A 2-D integer ``numpy`` array with one row per data line and one
        column per comma-separated value.
    """
    # ndmin=2 keeps the result two-dimensional even when the file holds a single
    # row; without it np.loadtxt returns a 1-D array and iterating over "rows"
    # would yield single numbers instead of id pairs.
    data = np.loadtxt(current_path/"Data"/"grupee_data"/"friends.csv", delimiter=',', dtype=int, ndmin=2)

    return data
def count_covered_nodes(G, selected_nodes):
    """Count the selected nodes together with all of their direct neighbours.

    Args:
        G: Graph object supporting ``G.neighbors(node)`` and ``node in G``
            (for example a ``networkx.Graph``).
        selected_nodes: Iterable of node ids; duplicates are counted once and
            single-pass iterables (generators, iterators) are supported.

    Returns:
        The number of distinct nodes that are selected or adjacent to a
        selected node. A selected node that is not part of ``G`` still counts
        as covering itself.
    """
    # Materialise once: a generator would otherwise be exhausted by set() and
    # the loop below would silently see no nodes at all.
    selected = set(selected_nodes)
    covered_nodes = set(selected)

    for node in selected:
        # Nodes without any edge may be missing from the graph; they have no
        # neighbours to add, and asking the graph for them would raise.
        if node in G:
            covered_nodes.update(G.neighbors(node))

    return len(covered_nodes)
# Friendship rows as returned by load_friends(); used further down to add the graph's edges.
friends = load_friends()
def load_vaccinated(file):
    """Read a list of integer ids from a text file.

    Args:
        file: Path of a text file whose ids are separated by any whitespace
            (spaces, tabs or line breaks).

    Returns:
        The ids as a list of ``int``, in file order.
    """
    with open(file) as handle:
        tokens = handle.read().split()
    return list(map(int, tokens))
    
# Undirected friendship graph: the first two columns of every row in `friends` form one edge.
g = nx.Graph()
g.add_edges_from((edge[0], edge[1]) for edge in friends)
# Last expression of the cell, so the notebook displays the coverage count.
count_covered_nodes(g, top_indices)

6899