In [1]:
import pickle
import os
from collections import defaultdict, Counter
import networkx as nx
import pandas as pd
from tqdm.notebook import tqdm
import statsmodels.api as sm
import sys

tqdm.pandas()

In [2]:
def process_total_visits(G, keep = 100, category = 'Fitness and Recreational Sports Centers'):
    """Rank the sub-categories that share a co-visit edge with ``category``.

    Parameters
    ----------
    G : graph-like
        Object whose ``edges(data=True)`` yields ``(u, v, attrs)`` triples in
        which ``attrs["weight"]`` is the co-visit count of the pair.
    keep : int, default 100
        Number of top-ranked rows to retain.
    category : str, default 'Fitness and Recreational Sports Centers'
        Node whose neighbours are ranked.

    Returns
    -------
    pandas.DataFrame
        Columns ``sub_category``, ``co_visits_with_fitness`` and ``percentage``,
        ordered by descending co-visits.  ``percentage`` is each row's share of
        the co-visits of the *kept* rows only.  The frame is empty (but has all
        three columns) when ``category`` has no edges.
    """
    co_visits = Counter()
    for u, v, attrs in G.edges(data=True):
        # Credit the weight to whichever endpoint is *not* the category.
        if u == category:
            co_visits[v] += attrs["weight"]
        elif v == category:
            co_visits[u] += attrs["weight"]

    # Passing ``columns=`` keeps the frame well-formed when nothing matched;
    # assigning two column names to a zero-column frame would raise.
    df = pd.DataFrame(
        co_visits.most_common(),
        columns=["sub_category", "co_visits_with_fitness"],
    )

    # Explicit copy so the new column is written to an independent frame
    # rather than to a slice of ``df``.
    df = df.iloc[:keep].copy()
    df["percentage"] = df["co_visits_with_fitness"] / df["co_visits_with_fitness"].sum()
    return df

In [3]:
def compare_two_dfs(df_1, df_2):
    """Print the sub-categories whose share differs most between two frames.

    ``df_1`` (suffixed ``_weekday``) and ``df_2`` (suffixed ``_weekend``) are
    inner-joined on ``sub_category``; ``diff`` is the weekday percentage minus
    the weekend percentage.  The ten smallest and the ten largest ``diff`` rows
    are printed, each under its own heading.  Nothing is returned.
    """
    merged = df_1.merge(
        df_2,
        how="inner",
        on="sub_category",
        suffixes=("_weekday", "_weekend"),
    )
    merged["diff"] = merged["percentage_weekday"] - merged["percentage_weekend"]

    shown_columns = ["sub_category", "percentage_weekday", "percentage_weekend", "diff"]
    # (heading, sort ascending?) - most negative diffs lean towards weekends.
    reports = (
        ("Exercisers prefer to visit the below on weekends", True),
        ("Exercisers prefer to visit the below on weekdays", False),
    )
    for heading, ascending in reports:
        print(heading)
        print(merged.sort_values(by="diff", ascending=ascending)[shown_columns][:10])

In [4]:
def insert_conf_intervals(df, method='binom_test'):
    """Add a ``"95% CI"`` column holding a proportion confidence interval per row.

    For every row the value in the *second* column (position 1) is taken as a
    proportion and multiplied by ``"total co-visits of subcategory"`` to obtain
    the success count handed to ``conf_interval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"total co-visits of subcategory"``; modified in place.
    method : str, default 'binom_test'
        Forwarded to ``conf_interval``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.

    Notes
    -----
    Uses ``DataFrame.progress_apply``, so ``tqdm.pandas()`` must have been
    called beforehand.
    """
    def _row_interval(row):
        population = row["total co-visits of subcategory"]
        # ``.iloc`` makes the positional lookup explicit; plain ``row[1]`` on a
        # string-labelled Series is deprecated positional fallback in pandas.
        return conf_interval(row.iloc[1] * population, population, method = method)

    df["95% CI"] = df.progress_apply(_row_interval, axis=1)

    return df

In [5]:
def conf_interval(total_true, total_population, a = 0.05, method = 'normal'):
    """Return the confidence interval of a binomial proportion.

    Thin wrapper around ``statsmodels.stats.proportion_confint``:
    ``total_true`` successes out of ``total_population`` trials, significance
    level ``a`` and interval ``method``.  The statsmodels result is returned
    unchanged.
    """
    return sm.stats.proportion_confint(
        total_true,
        total_population,
        alpha=a,
        method=method,
    )

In [6]:
def sub_sample_given_min(df, n):
    """Drop, in place, every row with fewer than ``n`` total co-occurrences.

    Rows whose ``"total co-visits of subcategory"`` is below ``n`` are removed
    from ``df`` itself and the index is renumbered from 0.  Returns ``None``.
    """
    below_minimum = df["total co-visits of subcategory"] < n
    df.drop(index=df.index[below_minimum], inplace=True)
    df.reset_index(drop=True, inplace=True)

In [7]:
def create_total_dictionary(year, month, days, base_dir = "/home/george/code/Fitness/metadata/covisits"):
    """Sum the per-day co-visit dictionaries of the given days.

    Parameters
    ----------
    year, month : str
        Components of the file name ``{year}_{month}_{day}.pkl``.
    days : iterable of str
        Day components; one pickle file is read per entry.
    base_dir : str, optional
        Directory holding the pickle files.  Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    dict
        Key -> summed value over all requested days (empty when ``days`` is
        empty).  Each pickle is assumed to hold a mapping of key -> number.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only read files you trust.
    """
    total = {}
    for day in days:
        path = os.path.join(base_dir, f"{year}_{month}_{day}.pkl")
        # Context manager so the handle is closed even if unpickling fails.
        with open(path, "rb") as handle:
            covisits_dict = pickle.load(handle)
        # Accumulate in place instead of rebuilding the whole dict per day.
        for key, count in covisits_dict.items():
            total[key] = total.get(key, 0) + count
    return total

In [8]:
def create_total_graph(total):
    """Build an undirected weighted graph from a ``"a+b" -> weight`` mapping.

    Every key is split on ``"+"``; the first two pieces become the edge's
    endpoints and the mapped value becomes its ``weight``.
    """
    split_items = ((key.split("+"), weight) for key, weight in total.items())
    weighted_edges = [(parts[0], parts[1], weight) for parts, weight in split_items]

    graph = nx.Graph()
    graph.add_weighted_edges_from(weighted_edges)
    return graph

In [9]:
def create_importance_graph(G, total):
    """Turn a weighted co-occurrence graph into a directed "importance" graph.

    For every node of ``G`` each incident edge ``(node, neighbour)`` becomes a
    directed edge ``node -> neighbour`` whose weight is that edge's share of
    the node's summed incident weight.

    Parameters
    ----------
    G : networkx.Graph
        Co-occurrence graph with a ``weight`` attribute on every edge.
    total : object
        Unused; kept so existing calls keep working.

    Returns
    -------
    networkx.DiGraph
        Nodes whose incident weights sum to zero contribute no edges, because
        their shares are undefined.
    """
    directed = nx.DiGraph()
    for node in G.nodes(): # for each node in the initial co-occurrence graph
        incident = list(G.edges(node, data=True))
        # Separate name so the ``total`` parameter is not silently overwritten.
        node_total = sum(attrs['weight'] for _, _, attrs in incident)
        if not node_total:
            # Avoid dividing by zero for nodes without any co-visit weight.
            continue
        directed.add_weighted_edges_from(
            [(src, dst, attrs['weight'] / node_total) for src, dst, attrs in incident]
        )
    return directed

In [10]:
def create_importance_df(G, directed):
    """Tabulate how important Fitness is to every sub-category linked to it.

    Edges coming *into* 'Fitness and Recreational Sports Centers' in
    ``directed`` express how important Fitness is to the edge's source
    location.

    Parameters
    ----------
    G : networkx.Graph
        Undirected co-occurrence graph; supplies each sub-category's weighted
        degree.
    directed : networkx.DiGraph
        Importance graph whose edge ``weight`` is the importance value.

    Returns
    -------
    pandas.DataFrame
        Columns ``subcategory``, ``importance`` (descending),
        ``total co-visits of subcategory`` and the ``95% CI`` column added by
        ``insert_conf_intervals`` with ``method="normal"``.  Filtering by a
        minimum number of co-visits is left to the caller.
    """
    fitness = 'Fitness and Recreational Sports Centers'
    ranked = sorted(
        ((source, attrs['weight']) for source, _, attrs in directed.in_edges(fitness, data=True)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    importance = pd.DataFrame(ranked, columns=["subcategory", "importance"])

    # Look categories up by column label; positional ``row[0]`` on a
    # string-labelled Series is deprecated in pandas.
    weighted_degree = G.degree(weight='weight')
    total_co_occurences = [weighted_degree[name] for name in importance["subcategory"]]
    importance.insert(2, "total co-visits of subcategory", total_co_occurences)

    return insert_conf_intervals(importance, method = "normal")

In [11]:
def co_occurences_of_n_most_common(G, n):
    """Sum the edge weights touching the ``n`` heaviest nodes of ``G``.

    A node's weight is the sum of the ``weight`` of its incident edges.  The
    ``n`` heaviest nodes are printed (with their weights) and then the weights
    of the edges reported by ``G.edges(<those nodes>)`` are added up, skipping
    any edge whose far endpoint has already been seen as a source.  Per the
    original author's note this is meant for undirected graphs, so that an
    edge shared by two selected nodes is not counted twice.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph with a ``weight`` attribute on every edge.
    n : int
        Number of top nodes to consider.

    Returns
    -------
    Sum of the selected edge weights (``0`` when no edge qualifies).
    """
    node_weights = Counter()
    for node in G.nodes():
        # Key by ``node`` itself: the inner-loop variable is undefined (or
        # stale from the previous node) when ``node`` has no edges.
        node_weights[node] = sum(
            attrs['weight'] for _, _, attrs in G.edges(node, data=True)
        )
    top_nodes = node_weights.most_common()[:n]
    print(top_nodes)
    n_locations = [loc for loc, _ in top_nodes]

    visited = set()
    total = 0
    for i, j, w in G.edges(n_locations, data=True):
        visited.add(i)
        if j not in visited: # so that we do not recount the edges that were already counted
            total += w['weight']
    return total

In [12]:
def find_rank_of_location(node_weights, location):
    """Return the rank of ``location`` and the co-visits to that location.

    ``node_weights`` must provide ``most_common()`` (e.g. a ``Counter``).  The
    result is ``(rank, co_visits)`` with ``rank`` starting at 1 for the most
    common entry, or ``None`` when ``location`` is not present.
    """
    ranking = node_weights.most_common()
    for rank, entry in enumerate(ranking, start=1):
        if location == entry[0]:
            return (rank, entry[1])
    return None

In [13]:
def get_co_visits_of_subcategory(G, subcategory, weekday_boolean):
    """Print the per-day average of ``subcategory``'s summed edge weights.

    The ``weight`` of every edge reported by ``G.edges(subcategory)`` is added
    up and divided by 5 when ``weekday_boolean`` is truthy, otherwise by 2.
    The average is printed; nothing is returned.
    """
    days_in_period = 5 if weekday_boolean else 2
    summed_weight = sum(
        attrs['weight'] for _, _, attrs in G.edges(subcategory, data=True)
    )
    print(summed_weight / days_in_period)

In [14]:
# Period under analysis; both values are components of the pickle file names.
year, month = "2019", "dec"

# Days 02-06 feed the weekday totals, days 07-08 the weekend totals.
total_weekdays, total_weekends = (
    create_total_dictionary(year, month, days)
    for days in (["02", "03", "04", "05", "06"], ["07", "08"])
)

# One undirected co-visit graph per period.
G_weekdays, G_weekends = (
    create_total_graph(totals) for totals in (total_weekdays, total_weekends)
)

In [15]:
# Total co-visit weight of every node in the weekday graph.
node_weights = Counter()
for node in G_weekdays.nodes():
    # Key by ``node``: the inner edge-loop variable is undefined (or stale from
    # the previous node) whenever ``node`` has no edges.
    node_weights[node] = sum(
        attrs['weight'] for _, _, attrs in G_weekdays.edges(node, data=True)
    )

In [18]:
# Top co-visited sub-categories of Fitness, per period.
df_weekdays, df_weekends = (
    process_total_visits(graph) for graph in (G_weekdays, G_weekends)
)

# Directed importance graphs derived from the co-visit graphs.
directed_weekdays, directed_weekends = (
    create_importance_graph(graph, totals)
    for graph, totals in ((G_weekdays, total_weekdays), (G_weekends, total_weekends))
)

# Importance tables (with confidence intervals); weekdays first, then weekends.
importance_weekdays, importance_weekends = (
    create_importance_df(graph, directed)
    for graph, directed in ((G_weekdays, directed_weekdays), (G_weekends, directed_weekends))
)

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/284 [00:00<?, ?it/s]

In [19]:
# Drop, in place, the sub-categories with too few total co-visits.
# The thresholds (50000 for weekdays, 10000 for weekends) are hand-picked;
# NOTE(review): presumably the weekend one is lower because that period covers
# fewer days - confirm.
sub_sample_given_min(importance_weekdays, 50000)
sub_sample_given_min(importance_weekends, 10000)

In [20]:
# Show the first 30 rows of each importance table under its own heading.
print(
    "Importance given on weekdays from other subcategories:",
    importance_weekdays.iloc[:30],
    sep="\n",
)
print(
    "Importance given on weekends from other subcategories:",
    importance_weekends.iloc[:30],
    sep="\n",
)

Importance given on weekdays from other subcategories:
                                          subcategory  importance  \
0                       Exam Preparation and Tutoring    0.068315   
1                Snack and Nonalcoholic Beverage Bars    0.062043   
2                                   Investment Advice    0.060982   
3                            Full-Service Restaurants    0.060948   
4              Musical Instrument and Supplies Stores    0.060657   
5       All Other Amusement and Recreation Industries    0.059775   
6         Nature Parks and Other Similar Institutions    0.059346   
7                      Golf Courses and Country Clubs    0.059219   
8                              Libraries and Archives    0.057791   
9                       Photography Studios, Portrait    0.057611   
10                                          Breweries    0.057571   
11                         Commercial Screen Printing    0.056866   
12                        Pet and Pet Supplies S