In [107]:
!pip install pyvis python-louvain



In [108]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import numpy as np
import torch
from community import community_louvain
from networkx.algorithms.community import girvan_newman
from itertools import islice
import random
import seaborn as sns
import re
from collections import Counter
import ast



# data_path="tripAdvisor/task2_graph.gml"
# parquet_path= "tripAdvisor/task2_df.parquet"

G = nx.read_gml('/kaggle/input/sna-project-files/task2_graph.gml')
df = pd.read_parquet('/kaggle/input/sna-project-files/task2_df.parquet')
print(f"Loaded graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"Loaded DataFrame: {len(df)} rows")

print(G.number_of_nodes())

Loaded graph: 4846 nodes, 11238 edges
Loaded DataFrame: 5000 rows
4846


In [109]:
# df = pd.read_parquet('./task2_df.parquet')
print(df.columns)

Index(['restaurant_link', 'restaurant_name', 'original_location', 'country',
       'region', 'province', 'city', 'address', 'latitude', 'longitude',
       'claimed', 'awards', 'popularity_detailed', 'popularity_generic',
       'top_tags', 'price_level', 'price_range', 'meals', 'cuisines',
       'special_diets', 'features', 'vegetarian_friendly', 'vegan_options',
       'gluten_free', 'original_open_hours', 'open_days_per_week',
       'open_hours_per_week', 'working_shifts_per_week', 'avg_rating',
       'total_reviews_count', 'default_language',
       'reviews_count_in_default_language', 'excellent', 'very_good',
       'average', 'poor', 'terrible', 'food', 'service', 'value', 'atmosphere',
       'keywords'],
      dtype='object')


In [110]:
print(df['popularity_generic'].isna().sum())
print(df['popularity_generic'])
# print(df['total_reviews_count'])

0
91460                      #1 of 3 places to eat in Suzette
701674         #277 of 458 places to eat in Reggio Calabria
389450            #6650 of 10645 places to eat in Barcelona
566304                                                 None
346212    #11 of 23 places to eat in Santa Lucia de Tira...
                                ...                        
604106                   #5 of 6 places to eat in Glynneath
489262                    #17 of 120 places to eat in Neath
772878                  #69 of 148 places to eat in Albenga
40406                 #6671 of 18480 places to eat in Paris
719098                 #3307 of 12913 places to eat in Rome
Name: popularity_generic, Length: 5000, dtype: object


In [111]:
# Step 1: Load required files
centrality_df = pd.read_csv('./task4_centrality.csv')
df = pd.read_parquet('./task2_df.parquet')
print(f"Loaded centrality data: {len(centrality_df)} rows")
print(f"Loaded restaurant data: {len(df)} rows")
print("Columns in restaurant data:", df.columns.tolist())

FileNotFoundError: [Errno 2] No such file or directory: './task4_centrality.csv'

In [None]:
# Step 2: Clean restaurant_name for consistent merging
def clean_name(name):
    if pd.isna(name):
        return name
    # Strip spaces, lowercase, remove special characters, normalize spaces
    name = name.strip().lower()
    name = re.sub(r'[^a-zA-Z0-9\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

centrality_df['restaurant_name'] = centrality_df['restaurant_name'].apply(clean_name)
df['restaurant_name'] = df['restaurant_name'].apply(clean_name)

# Verify common restaurant_names
common_names = len(set(centrality_df['restaurant_name']) & set(df['restaurant_name']))
print(f"Common restaurant_names: {common_names}")
print("Sample centrality restaurant_names:", centrality_df['restaurant_name'].head().tolist())
print("Sample restaurant dataset names:", df['restaurant_name'].head().tolist())

# Debug unmatched names
unmatched = set(centrality_df['restaurant_name']) - set(df['restaurant_name'])
if unmatched:
    print("Sample unmatched centrality names:", list(unmatched)[:5])

# Step 3: Parse popularity_generic into numeric popularity_score (1/rank)
def parse_popularity_generic(text):
    if pd.isna(text) or text is None or text.lower() == 'none':
        return np.nan
    # Extract ranking number (e.g., "#1" -> 1)
    match = re.search(r'#(\d+)', str(text))
    if match:
        rank = int(match.group(1))
        return 1.0 / rank if rank > 0 else np.nan  # Inverse rank for higher popularity = higher score
    return np.nan

df['popularity_score'] = df['popularity_generic'].apply(parse_popularity_generic)
print("\nSample popularity_generic and parsed popularity_score:")
print(df[['popularity_generic', 'popularity_score']].head(10))
print("popularity_score NaN count:", df['popularity_score'].isna().sum())

# Step 4: Identify available metrics
available_metrics = []
for col in ['popularity_score', 'total_reviews_count', 'avg_rating']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        if df[col].isna().sum() < len(df):
            available_metrics.append(col)
    else:
        print(f"Warning: Column {col} not found in task2_df.parquet")
if not available_metrics:
    raise ValueError("No valid metrics (popularity_score, total_reviews_count, avg_rating) found")

print(f"Available metrics: {available_metrics}")

# Step 5: Merge centrality scores with metrics
analysis_df = centrality_df[['restaurant_name', 'degree_centrality', 'closeness_centrality', 
                            'betweenness_centrality', 'eigenvector_centrality']].merge(
    df[['restaurant_name'] + available_metrics],
    on='restaurant_name',
    how='left'
)
print(f"Merged data: {len(analysis_df)} rows")

# Step 6: Handle missing or invalid data
numeric_columns = ['degree_centrality', 'closeness_centrality', 'betweenness_centrality', 
                   'eigenvector_centrality'] + available_metrics
for col in numeric_columns:
    analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

# Check NaN values
print("\nNaN counts in numeric columns:")
print(analysis_df[numeric_columns].isna().sum())

# Impute NaN for metrics
for col in available_metrics:
    if col in ['total_reviews_count']:
        analysis_df[col] = analysis_df[col].fillna(0)
    else:  # popularity_score, avg_rating
        analysis_df[col] = analysis_df[col].fillna(analysis_df[col].mean())

# Drop rows where all centrality scores are NaN
analysis_df = analysis_df.dropna(subset=['degree_centrality', 'eigenvector_centrality'], how='all')
print(f"Data after cleaning: {len(analysis_df)} rows")

# Step 7: Compute Pearson correlations
correlation_matrix = analysis_df[numeric_columns].corr(method='pearson')
print("\nPearson Correlation Matrix:")
print(correlation_matrix)

# Extract relevant correlations
correlations = correlation_matrix.loc[
    ['degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'eigenvector_centrality'],
    available_metrics
]
print("\nCentrality vs. Metrics Correlations:")
print(correlations)

# Step 8: Interpret results
print("\nInterpretations and Implications:")
print("Note: popularity_generic was parsed into popularity_score (1/rank, higher score = higher popularity).")
for centrality in ['degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'eigenvector_centrality']:
    print(f"\n{centrality.replace('_', ' ').title()}:")
    for metric in available_metrics:
        corr = correlations.loc[centrality, metric]
        strength = "strong" if abs(corr) > 0.5 else "moderate" if abs(corr) > 0.3 else "weak"
        direction = "positive" if corr > 0 else "negative"
        print(f"- Correlation with {metric}: {corr:.4f} ({strength}, {direction})")
        print(f"  Implication: {centrality.replace('_', ' ').title()} {'is' if abs(corr) > 0.3 else 'is not'} strongly linked to {metric}.")

# Step 9: Visualize relationships (Scatter Plots)
n_metrics = len(available_metrics)
plt.figure(figsize=(15, 4 * n_metrics))
plot_idx = 1
for centrality in ['degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'eigenvector_centrality']:
    for metric in available_metrics:
        plt.subplot(4, n_metrics, plot_idx)
        sns.scatterplot(data=analysis_df, x=centrality, y=metric, alpha=0.5)
        plt.title(f"{centrality.replace('_', ' ').title()} vs {metric.replace('_', ' ').title()}")
        plt.xlabel(centrality.replace('_', ' ').title())
        plt.ylabel(metric.replace('_', ' ').title())
        plot_idx += 1
plt.tight_layout()
plt.savefig('./task5_scatter_plots.png')
print("Scatter plots saved to /task5_scatter_plots.png")

# Step 10: Visualize correlation matrix (Heatmap)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Matrix: Centrality vs Metrics')
plt.savefig('./task5_heatmap.png')
print("Correlation heatmap saved to /task5_heatmap.png")

# Step 11: Save results
correlations.to_csv('./task5_correlations.csv')
analysis_df.to_csv('./task5_analysis.csv', index=False)
print("Saved correlations to /task5_correlations.csv and analysis to /task5_analysis.csv")

### 6. Task
## Community Detection (Unweighted)

- Apply community detection (e.g., Louvain or Girvan-Newman) on the unweighted network.
- Visualize and label resulting communities, analyze their shared features.


In [112]:

G_weighted = nx.read_gml('/kaggle/input/sna-project-files/task2_graph.gml')
df = pd.read_parquet('/kaggle/input/sna-project-files/task2_df.parquet')
print(f"Loaded weighted graph: {G_weighted.number_of_nodes()} nodes, {G_weighted.number_of_edges()} edges")
print(f"Loaded DataFrame: {len(df)} rows")


Loaded weighted graph: 4846 nodes, 11238 edges
Loaded DataFrame: 5000 rows


### Community Louvain Common Function for Task 6 and 7

In [113]:
def communityLouvainFuc(G, weight=False):
    
    # communityw = "community_weighted" if weight==True else "community_unweighted"
    
    if weight==False:
        partition = community_louvain.best_partition(G, random_state=42)
    else:
        partition = community_louvain.best_partition(G, weight='weight', random_state=42)
        
    print(f"Detected {len(set(partition.values()))} communities")
    
    # Create DataFrame with community assignments
    community_df = pd.DataFrame({
        'restaurant_name': list(partition.keys()),
        "community_weighted" if weight==True else "community_unweighted" : list(partition.values())
    })
    
    # Merge with original DataFrame for attributes
    community_df = community_df.merge(
        df[['restaurant_name', 'city', 'cuisines', 'special_diets', 'features', 'latitude', 'longitude']],
        on='restaurant_name',
        how='left'
    )

    return community_df




In [114]:
# def safe_literal_eval(val):
#     if isinstance(val, str):
#         try:
#             return ast.literal_eval(val)
#         except (ValueError, SyntaxError):
#             return []  
#     return val

# def parse_list_column(col):
#     return col.apply(safe_literal_eval)

# df['cuisines'] = parse_list_column(df['cuisines'])
# df['special_diets'] = parse_list_column(df['special_diets'])
# df['features'] = parse_list_column(df['features'])

# print(df['cuisines'])


# Step 1: Convert to unweighted graph
G = G_weighted.copy()
for u, v in G.edges():
    G[u][v].pop('weight', None)
print(f"Unweighted graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")


print(f"Connected components: {len(list(nx.connected_components(G)))}")
isolated = sum(1 for node in G.nodes() if G.degree(node) == 0)
print(f"Isolated nodes: {isolated}")


# Step 2: Apply Louvain community detection on weighted graph

community_unweighted= communityLouvainFuc(G)



Unweighted graph: 4846 nodes, 11238 edges
Connected components: 2961
Isolated nodes: 2417
Detected 2972 communities


In [115]:
# Step 3: Analyze communities
community_unweighted_sizes = community_unweighted['community_unweighted'].value_counts()
print("\nTop 5 Communities by Size:")
print(community_unweighted_sizes.head())

# Analyze shared features for top 3 communities
cuisines=[]
cities=[]
special_diets=[]
features=[]


# Safely parse comma-separated cuisines into a list
def extract_cuisine_items(column):
    all_cuisines = []
    for item in column:
        if isinstance(item, str):
            if item.strip() != '[]':  # exclude empty lists
                all_cuisines.extend([c.strip() for c in item.split(',')])
        elif isinstance(item, list):  # already parsed
            all_cuisines.extend(item)
    return all_cuisines

# Analysis block

top_communities = community_unweighted_sizes.head(10).index
print("\nShared Features of Top 3 Communities:")

for comm in top_communities:
    comm_df = community_unweighted[community_unweighted['community_unweighted'] == comm]
    print(f"\nCommunity {comm} ({len(comm_df)} restaurants):")

    cities += extract_cuisine_items(comm_df['city'])
    print(f"\nTop 5 cities: {Counter(cities).most_common(5)}")
    
    cuisines += extract_cuisine_items(comm_df['cuisines'])
    
    print(f"\nTop 5 cuisines: {Counter(cuisines).most_common(5)}")
    
    special_diets += extract_cuisine_items(comm_df['special_diets'])
    print(f"\nTop 5 special_diets: {Counter(special_diets).most_common(5)}")

    features += extract_cuisine_items(comm_df['features'])
    print(f"\nTop 5 features: {Counter(features).most_common(5)}")


Top 5 Communities by Size:
community_unweighted
26     156
201    102
2       98
44      97
217     60
Name: count, dtype: int64

Shared Features of Top 3 Communities:

Community 26 (156 restaurants):

Top 5 cities: [('Unknown', 113), ('Helsinki', 4), ('Dresden', 4), ('Dinslaken', 3), ('Weil am Rhein', 2)]

Top 5 cuisines: [('European', 25), ('Fast food', 23), ('British', 23), ('Mediterranean', 18), ('Italian', 16)]

Top 5 special_diets: [('Vegetarian Friendly', 49), ('Vegan Options', 31), ('Gluten Free Options', 23), ('Halal', 5)]

Top 5 features: [('Seating', 34), ('Wheelchair Accessible', 24), ('Takeout', 23), ('Reservations', 20), ('Table Service', 17)]

Community 201 (102 restaurants):

Top 5 cities: [('Unknown', 140), ('Frankfurt', 13), ('Dusseldorf', 9), ('Hannover', 5), ('Sheffield', 5)]

Top 5 cuisines: [('European', 39), ('Fast food', 37), ('British', 34), ('Cafe', 29), ('Mediterranean', 23)]

Top 5 special_diets: [('Vegetarian Friendly', 80), ('Vegan Options', 48), ('Gluten

In [116]:

G_top = G.subgraph(top_communities)
print(f"\nTop communities subgraph: {G_top.number_of_nodes()} nodes, {G_top.number_of_edges()} edges")

net = Network(notebook=True, height="600px", width="100%", directed=False, cdn_resources='in_line')
net.from_nx(G_top)

net = Network(notebook=True, height="600px", width="100%", directed=False, cdn_resources='in_line')

# Add nodes with colors and titles
colors = ['#FF9999', '#66CC99', '#99CCFF', '#FFCC99', '#CC99FF']
for node in G_top.nodes():
    comm = partition[node]
    city = community_unweighted[community_unweighted['restaurant_name'] == node]['city'].iloc[0]
    net.add_node(node, label=node, title=f"{node}\nCommunity: {comm}\nCity: {city}", color=colors[comm % len(colors)])

# Add edges with color
for source, target in G_top.edges():
    net.add_edge(source, target, color='#888888')

net.show_buttons(filter_=['physics'])
net.show("/kaggle/working/task6_community_graph.html")
print("Community visualization saved to /kaggle/working/task6_community_graph.html")



Top communities subgraph: 0 nodes, 0 edges
/kaggle/working/task6_community_graph.html
Community visualization saved to /kaggle/working/task6_community_graph.html


In [136]:

from IPython.display import IFrame
def showMyHtmlGraph(path):
    display(IFrame(path, width="100%", height="600px"))



In [137]:
# # Step 6: Save results

# display html graph
showMyHtmlGraph('task6_community_graph.html')

community_unweighted.to_csv('/kaggle/working/task6_communities.csv', index=False)
nx.write_gml(G, '/kaggle/working/task6_unweighted_graph.gml')
print("Saved community assignments to /kaggle/working/task6_communities.csv and unweighted graph to /kaggle/working/task6_unweighted_graph.gml")

Saved community assignments to /kaggle/working/task6_communities.csv and unweighted graph to /kaggle/working/task6_unweighted_graph.gml


### 7. Task
## Community Detection (Weighted)

- Run the same algorithm on the weighted network.
- Compare community structure between the two versions.

In [119]:
G_weighted = nx.read_gml('/kaggle/input/sna-project-files/task2_graph.gml')
df = pd.read_parquet('/kaggle/input/sna-project-files/task2_df.parquet')
print(f"Loaded weighted graph: {G_weighted.number_of_nodes()} nodes, {G_weighted.number_of_edges()} edges")
print(f"Loaded DataFrame: {len(df)} rows")


Loaded weighted graph: 4846 nodes, 11238 edges
Loaded DataFrame: 5000 rows


In [120]:

#  Apply Louvain community detection on weighted graph loaded from task 2

community_df_weighted= communityLouvainFuc(G, weight=True)


Detected 2972 communities


In [121]:
community_sizes_weighted = community_df_weighted['community_weighted'].value_counts()
print("\nTop 5 Weighted Communities by Size:")
print(community_sizes_weighted.head())


# Analyze shared features for top 3 communities
cuisines=[]
cities=[]
special_diets=[]
features=[]


top_communities = community_sizes_weighted.head(10).index
print("\nShared Features of Top 3 Communities:")

for comm in top_communities:
    comm_df = community_df_weighted[community_df_weighted['community_weighted'] == comm]
    print(f"\nCommunity {comm} ({len(comm_df)} restaurants):")

    cities += extract_cuisine_items(comm_df['city'])
    print(f"\nTop 5 cities: {Counter(cities).most_common(5)}")
    
    cuisines += extract_cuisine_items(comm_df['cuisines'])
    
    print(f"\nTop 5 cuisines: {Counter(cuisines).most_common(5)}")
    
    special_diets += extract_cuisine_items(comm_df['special_diets'])
    print(f"\nTop 5 special_diets: {Counter(special_diets).most_common(5)}")

    features += extract_cuisine_items(comm_df['features'])
    print(f"\nTop 5 features: {Counter(features).most_common(5)}")




Top 5 Weighted Communities by Size:
community_weighted
26     156
201    102
2       98
44      97
217     60
Name: count, dtype: int64

Shared Features of Top 3 Communities:

Community 26 (156 restaurants):

Top 5 cities: [('Unknown', 113), ('Helsinki', 4), ('Dresden', 4), ('Dinslaken', 3), ('Weil am Rhein', 2)]

Top 5 cuisines: [('European', 25), ('Fast food', 23), ('British', 23), ('Mediterranean', 18), ('Italian', 16)]

Top 5 special_diets: [('Vegetarian Friendly', 49), ('Vegan Options', 31), ('Gluten Free Options', 23), ('Halal', 5)]

Top 5 features: [('Seating', 34), ('Wheelchair Accessible', 24), ('Takeout', 23), ('Reservations', 20), ('Table Service', 17)]

Community 201 (102 restaurants):

Top 5 cities: [('Unknown', 140), ('Frankfurt', 13), ('Dusseldorf', 9), ('Hannover', 5), ('Sheffield', 5)]

Top 5 cuisines: [('European', 39), ('Fast food', 37), ('British', 34), ('Cafe', 29), ('Mediterranean', 23)]

Top 5 special_diets: [('Vegetarian Friendly', 80), ('Vegan Options', 48), (

### Comparision between weighted and unweighted graphs

In [122]:
from sklearn.metrics import adjusted_rand_score

# Step 3: Compare unweighted (Task 6) and weighted (Task 7) communities
community_df_unweighted = community_unweighted[['restaurant_name', 'community_unweighted']]
community_sizes_unweighted = community_df_unweighted['community_unweighted'].value_counts()
print("\nComparison of Community Structures:")
print(f"Unweighted (Task 6): {len(set(community_df_unweighted['community_unweighted']))} communities")
print(f"Weighted (Task 7): {len(set(community_df_weighted['community_weighted']))} communities")

print("\nTop 5 Unweighted Community Sizes:")
print(community_sizes_unweighted.head())
print("\nTop 5 Weighted Community Sizes:")
print(community_sizes_weighted.head())

comparison_df = community_unweighted.merge(
    community_df_weighted[['restaurant_name', 'community_weighted']],
    on='restaurant_name',
    how='inner'
)
ari = adjusted_rand_score(community_unweighted['community_unweighted'], community_df_weighted['community_weighted'])
print(f"\nAdjusted Rand Index (ARI): {ari:.4f}")
print("(ARI close to 1: similar partitions; close to 0: dissimilar)")


from collections import Counter

comparison_cities = []
comparison_cuisines = []

def extract_cuisine_items(column):
    all_cuisines = []
    for item in column:
        if isinstance(item, str) and item.strip() != '[]':
            all_cuisines.extend([c.strip() for c in item.split(',')])
    return all_cuisines


top_communities_unweighted = community_sizes_unweighted.head(10).index
top_communities_weighted = community_sizes_weighted.head(10).index


# Helper function to compute Jaccard similarity for sets (e.g., cuisines)
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0

# Compare shared features of top 3 communities
print("\n=== Shared Features Comparison (Top 3 Communities) ===")
for comm_type, comm_df, top_comms, prefix in [
    ('Unweighted', community_unweighted, top_communities_unweighted, 'community_unweighted'),
    ('Weighted', community_df_weighted, top_communities_weighted, 'community_weighted')
]:
    print(f"\n--- {comm_type} Top 5 Communities ---")
    for comm in top_comms:
        comm_data = comm_df[comm_df[prefix] == comm]
        print(f"\nCommunity {comm} ({len(comm_data)} restaurants):")
        cities = Counter(comm_data['city'].dropna())
        print(f"  Top Cities: {cities.most_common(10)}")
        cuisines = extract_cuisine_items(comm_data['cuisines'])
        cuisine_counts = Counter(cuisines)
        print(f"  Top Cuisines: {cuisine_counts.most_common(10)}")
        # print(f"  Top Cuisines: {cuisines.most_common(5)}")
        
        # special_diets = Counter([sd for sublist in comm_data['special_diets'].dropna() for sd in sublist])
        special_diets = extract_cuisine_items(comm_data['special_diets'])
        special_diets_counts = Counter(special_diets)
        print(f"  Top Special Diets: {special_diets_counts.most_common(10)}")
        # features = Counter([f for sublist in comm_data['features'].dropna() for f in sublist])
        features = extract_cuisine_items(comm_data['features'])
        features_counts = Counter(features)
        print(f"  Top Features: {features_counts.most_common(10)}")







Comparison of Community Structures:
Unweighted (Task 6): 2972 communities
Weighted (Task 7): 2972 communities

Top 5 Unweighted Community Sizes:
community_unweighted
26     156
201    102
2       98
44      97
217     60
Name: count, dtype: int64

Top 5 Weighted Community Sizes:
community_weighted
26     156
201    102
2       98
44      97
217     60
Name: count, dtype: int64

Adjusted Rand Index (ARI): 1.0000
(ARI close to 1: similar partitions; close to 0: dissimilar)

=== Shared Features Comparison (Top 3 Communities) ===

--- Unweighted Top 5 Communities ---

Community 26 (156 restaurants):
  Top Cities: [('Unknown', 113), ('Helsinki', 4), ('Dresden', 4), ('Dinslaken', 3), ('Weil am Rhein', 2), ('Llanelli', 2), ('Backnang', 2), ('Wuppertal', 2), ('Delmenhorst', 2), ('Nuremberg', 2)]
  Top Cuisines: [('European', 25), ('Fast food', 23), ('British', 23), ('Mediterranean', 18), ('Italian', 16), ('Cafe', 13), ('Pub', 10), ('American', 9), ('Asian', 9), ('Middle Eastern', 8)]
  Top Sp

In [123]:
def extract_cuisine_set(series):
    cuisines = set()
    for item in series.dropna():
        if isinstance(item, str):
            cuisines.update([c.strip() for c in item.split(',')])
        elif isinstance(item, list):
            cuisines.update(item)
    return cuisines


# Compare cuisine overlap between top communities
print("\n=== Cuisine Overlap (Jaccard Similarity) ===\n\n")

for i, (comm_unw, comm_w) in enumerate(zip(top_communities[:1], top_communities_weighted[:1])):
    unw_data = community_unweighted[community_unweighted['community_unweighted'] == comm_unw]
    w_data = community_df_weighted[community_df_weighted['community_weighted'] == comm_w]
    # print("Unweighted Data Columns:", unw_data.columns.tolist())
    # print("Weighted Data Columns:", w_data.columns.tolist())
    # print("Unweighted Data Columns: Cuisines -> ", w_data['cuisines'].head())
    # print("Weighted Data Columns: Cuisines -> ", w_data['cuisines'].head())

    # unw_cuisines = set(c for sublist in unw_data['cuisines'].dropna())
    # w_cuisines = set(c for sublist in w_data['cuisines'].dropna() for c in sublist)
    unw_cuisines = extract_cuisine_set(unw_data['cuisines'])
    w_cuisines = extract_cuisine_set(w_data['cuisines'])
    jaccard = jaccard_similarity(unw_cuisines, w_cuisines)
    print(f"Top Community {i+1} (Unweighted {comm_unw} vs. Weighted {comm_w}): Jaccard Similarity = {jaccard:.4f}")





=== Cuisine Overlap (Jaccard Similarity) ===


Top Community 1 (Unweighted 26 vs. Weighted 26): Jaccard Similarity = 1.0000


In [124]:


# Add nodes with colors and titles
colors = ['#FF9999', '#66CC99', '#99CCFF', '#FFCC99', '#CC99FF']
for node in G_top.nodes():
    comm = partition[node]
    city = community_df_weighted[community_df_weighted['restaurant_name'] == node]['city'].iloc[0]
    net.add_node(node, label=node, title=f"{node}\nCommunity: {comm}\nCity: {city}", color=colors[comm % len(colors)])

# Add edges with color
for source, target in G_top.edges():
    net.add_edge(source, target, color='#888888')



net.show_buttons(filter_=['physics'])
net.show("/kaggle/working/task7_community_weighted_graph.html")
print("Community visualization saved to /kaggle/working/task7_community_weighted_graph.html")

# net.show("/kaggle/working/task6_community_graph.html")  # This creates the file
# display(IFrame("task7_community_graph.html", width="100%", height="600px"))
showMyHtmlGraph('task7_community_weighted_graph.html')



/kaggle/working/task7_community_weighted_graph.html
Community visualization saved to /kaggle/working/task7_community_weighted_graph.html


In [125]:
# Step 5: Save results
community_df_weighted.to_csv('/kaggle/working/task7_communities.csv', index=False)
nx.write_gml(G, '/kaggle/working/task7_weighted_graph.gml')
print("Saved community assignments to /kaggle/working/task7_communities.csv and unweighted graph to /kaggle/working/task7_weighted_graph.gml")

Saved community assignments to /kaggle/working/task7_communities.csv and unweighted graph to /kaggle/working/task7_weighted_graph.gml


### 8. Role of Dietary Preferences

- Filter nodes with vegetarian_friendly, vegan_options, or gluten_free tags.
- Analyze their position in the network (e.g., density, centrality, community inclusion).

In [126]:

# Load Task 2 weighted graph, DataFrame, and Task 7 communities
G = nx.read_gml('/kaggle/input/sna-project-files/task2_graph.gml')
df = pd.read_parquet('/kaggle/input/sna-project-files/task2_df.parquet')
community_df_weighted = pd.read_csv('/kaggle/working/task7_communities.csv')
print(f"Loaded weighted graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"Loaded DataFrame: {len(df)} rows")
print(f"Loaded Task 7 communities: {len(community_df_weighted)} restaurants")


Loaded weighted graph: 4846 nodes, 11238 edges
Loaded DataFrame: 5000 rows
Loaded Task 7 communities: 5000 restaurants




### Step 1: Filter nodes with vegetarian_friendly, vegan_options, or gluten_free tags
### Parse special_diets column (stringified lists)

In [127]:

print(df.columns.tolist())

 # Define dietary tags

dietary_tags = ['vegetarian_friendly', 'vegan_options', 'gluten_free']
dietary_nodes = []

# Loop through each dietary tag column
for tag in dietary_tags:
    if tag in df.columns:
        matches = df[df[tag] == 'Y']['restaurant_name'].tolist()
        dietary_nodes.extend(matches)

# Remove duplicates
dietary_nodes = list(set(dietary_nodes))
print(f"\n\n====== \n\n Found  {len(dietary_nodes)} restaurants with dietary tags\n\n======")


['restaurant_link', 'restaurant_name', 'original_location', 'country', 'region', 'province', 'city', 'address', 'latitude', 'longitude', 'claimed', 'awards', 'popularity_detailed', 'popularity_generic', 'top_tags', 'price_level', 'price_range', 'meals', 'cuisines', 'special_diets', 'features', 'vegetarian_friendly', 'vegan_options', 'gluten_free', 'original_open_hours', 'open_days_per_week', 'open_hours_per_week', 'working_shifts_per_week', 'avg_rating', 'total_reviews_count', 'default_language', 'reviews_count_in_default_language', 'excellent', 'very_good', 'average', 'poor', 'terrible', 'food', 'service', 'value', 'atmosphere', 'keywords']



 Found  1585 restaurants with dietary tags



In [128]:

# Step 2: Analyze network position
# Create subgraph of dietary nodes

G_dietary = G.subgraph(dietary_nodes)
print(f"Dietary subgraph: {G_dietary.number_of_nodes()} nodes, {G_dietary.number_of_edges()} edges")

# Density of dietary subgraph
density = nx.density(G_dietary)

full_graph_density = nx.density(G)
print(f"\nDensity Analysis:")
print(f"Dietary subgraph density: {density:.6f}")
print(f"Full graph density: {full_graph_density:.6f}")
print("(Higher density means more connections among dietary restaurants)")

# Centrality in the full graph
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G, k=min(1000, G.number_of_nodes()), weight='weight')

# Compute centrality for dietary nodes
dietary_centrality = pd.DataFrame({
    'restaurant_name': dietary_nodes,
    'degree_centrality': [degree_centrality[node] for node in dietary_nodes],
    'betweenness_centrality': [betweenness_centrality[node] for node in dietary_nodes]
})


# Average centrality for dietary vs. all nodes
avg_degree_dietary = dietary_centrality['degree_centrality'].mean()
avg_degree_all = sum(degree_centrality.values()) / len(degree_centrality)
avg_betweenness_dietary = dietary_centrality['betweenness_centrality'].mean()
avg_betweenness_all = sum(betweenness_centrality.values()) / len(betweenness_centrality)

print(f"\nCentrality Analysis:")
print(f"Average degree centrality (dietary): {avg_degree_dietary:.6f}")
print(f"Average degree centrality (all): {avg_degree_all:.6f}")
print(f"Average betweenness centrality (dietary): {avg_betweenness_dietary:.6f}")
print(f"Average betweenness centrality (all): {avg_betweenness_all:.6f}")
print("(Higher centrality means dietary restaurants are more connected or influential)")


Dietary subgraph: 1585 nodes, 1251 edges

Density Analysis:
Dietary subgraph density: 0.000997
Full graph density: 0.000957
(Higher density means more connections among dietary restaurants)

Centrality Analysis:
Average degree centrality (dietary): 0.000972
Average degree centrality (all): 0.000957
Average betweenness centrality (dietary): 0.000013
Average betweenness centrality (all): 0.000011
(Higher centrality means dietary restaurants are more connected or influential)


In [129]:

# Top 5 dietary nodes by centrality
print(f"\nTop 5 Dietary Restaurants by Degree Centrality:")
print(dietary_centrality.sort_values('degree_centrality', ascending=False)[['restaurant_name', 'degree_centrality']].head())
print(f"\nTop 5 Dietary Restaurants by Betweenness Centrality:")
print(dietary_centrality.sort_values('betweenness_centrality', ascending=False)[['restaurant_name', 'betweenness_centrality']].head())

# Community inclusion (using Task 7 weighted communities)
dietary_communities = community_df_weighted[community_df_weighted['restaurant_name'].isin(dietary_nodes)]
community_distribution = dietary_communities['community_weighted'].value_counts()
print(f"\nCommunity Inclusion Analysis:")
print(f"Dietary restaurants in Task 7 communities (top 5):")
print(community_distribution.head())
print(f"Total communities with dietary restaurants: {len(community_distribution)}")

# Analyze dietary tags per community


# from collections import Counter

# Define dietary tags
# dietary_tags = ['vegetarian_friendly', 'vegan_options', 'gluten_free']

print("\n=== Dietary Tags in Top 10 Communities ===")
top_communities = community_df_weighted['community_weighted'].value_counts().head(10).index

for comm in top_communities:
    # Filter the community
    comm_df = community_df_weighted[community_df_weighted['community_weighted'] == comm]
    
    # Find all restaurant names in this community
    comm_nodes = comm_df['restaurant_name'].tolist()
    
    # Initialize tag counter
    tag_counter = Counter()
    
    # For each tag, check which restaurants in this community have 'Y'
    for tag in dietary_tags:
        # Get restaurant names where the tag is 'Y'
        y_nodes = df[df[tag] == 'Y']['restaurant_name']
        
        # Count how many of those nodes are in the current community
        count = sum(name in comm_nodes for name in y_nodes)
        tag_counter[tag] = count

    print(f"\nCommunity {comm} ({len(comm_df)} total restaurants):")
    for tag, count in tag_counter.items():
        print(f"  {tag}: {count} restaurants with '{tag}' = Y")




Top 5 Dietary Restaurants by Degree Centrality:
         restaurant_name  degree_centrality
241        Pret a Manger           0.016718
1428        Café Etienne           0.015067
494        Le Gribouille           0.014861
1295            Angelina           0.014654
757   Restaurant Sannine           0.014448

Top 5 Dietary Restaurants by Betweenness Centrality:
           restaurant_name  betweenness_centrality
241          Pret a Manger                0.003575
1570  Bread Street Kitchen                0.003275
503         Domino's Pizza                0.002543
1524          Ugarit Sants                0.001853
1445         Mudec Bistrot                0.001353

Community Inclusion Analysis:
Dietary restaurants in Task 7 communities (top 5):
community_weighted
26     51
201    31
2      27
44     25
217    23
Name: count, dtype: int64
Total communities with dietary restaurants: 1133

=== Dietary Tags in Top 10 Communities ===

Community 26 (156 total restaurants):
  vegetarian_frien

In [132]:
# Step 3: Visualize dietary nodes
def visualize_dietary_nodes(graph, nodes, df, community_df, output_file):
    """Visualize a sample of dietary nodes with PyVis."""
   
    G_sample = graph.subgraph(nodes)
    print(f"Visualization subgraph: {G_sample.number_of_nodes()} nodes, {G_sample.number_of_edges()} edges")
    
    net = Network(notebook=True, height="600px", width="100%", directed=False, cdn_resources='in_line')
    
    colors = {
        'vegetarian_friendly': '#66CC99',
        'vegan_options': '#FF9999',
        'gluten_free': '#99CCFF',
        'multiple': '#CC99FF',
        'unknown': '#888888'
    }
    
    # Set node properties
    for node in G_sample.nodes():
        node_row = df[df['restaurant_name'] == node]
        if not node_row.empty:
            node_tags = [tag for tag in node_row['special_diets'].iloc[0] if tag in colors]
            color = colors['multiple'] if len(node_tags) > 1 else colors.get(node_tags[0] if node_tags else 'unknown')
        else:
            node_tags = ['unknown']
            color = colors['unknown']
        
        comm_row = community_df[community_df['restaurant_name'] == node]
        comm = comm_row['community_weighted'].iloc[0] if not comm_row.empty else 'unknown'
        
        net.add_node(
            node,
            label=node,
            color=color,
            title=f"{node}\nTags: {node_tags}\nCommunity: {comm}"
        )
    
    # Set edge properties
    for edge in G_sample.edges(data=True):
        net.add_edge(
            edge[0],
            edge[1],
            color='#888888',
            value=edge[2].get('weight', 1),
            title=f"Weight: {edge[2].get('weight', 1):.2f}"
        )
    
    net.show_buttons(filter_=['physics'])
    net.show(output_file)
    print(f"Visualization saved to {output_file}")

visualize_dietary_nodes(G, dietary_nodes, df, community_df_weighted, "/kaggle/working/task8_dietary_graph.html")



Visualization subgraph: 1585 nodes, 1251 edges
/kaggle/working/task8_dietary_graph.html
Visualization saved to /kaggle/working/task8_dietary_graph.html


In [134]:
# Step 4: Save results
dietary_centrality['community_weighted'] = dietary_centrality['restaurant_name'].map(
    community_df_weighted.set_index('restaurant_name')['community_weighted'].get
)
# Check for missing community assignments
missing_communities = dietary_centrality['community_weighted'].isna().sum()
print(f"Missing community assignments: {missing_communities}")
dietary_centrality.to_csv('/kaggle/working/task8_dietary_analysis.csv', index=False)
print("Saved analysis to /kaggle/working/task8_dietary_analysis.csv")

Missing community assignments: 0
Saved analysis to /kaggle/working/task8_dietary_analysis.csv


In [139]:
from IPython.display import IFrame
showMyHtmlGraph('task8_dietary_graph.html')