<a href="https://colab.research.google.com/github/LNshuti/eactrade/blob/main/ec_trade_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install networkx --quiet
!pip install python-louvain --quiet
!pip install scipy --quiet

In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from community import community_louvain
import seaborn as sns
import scipy

In [8]:
# Load country labels. 
country_ids = pd.read_csv("data/countries_codes_and_coordinates.csv")
country_ids.columns

Index(['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code',
       'Latitude (average)', 'Longitude (average)'],
      dtype='object')

In [7]:
# Load country labels. Load the data with these data types:
# ['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code', 'Latitude (average)', 'Longitude (average)']


country_ids = pd.read_csv("data/countries_codes_and_coordinates.csv")

# Clean column names (make them lowercase and replace spaces with underscores)
country_ids.columns = country_ids.columns.str.lower().str.replace('[ (-]', '_', regex=True).str.replace(')', '')
# Convert numeric_code to character
country_ids['numeric_code'] = country_ids['numeric_code'].astype(str)
country_ids.head()

Unnamed: 0,country,alpha_2_code,alpha_3_code,numeric_code,latitude__average,longitude__average
0,Afghanistan,"""AF""","""AFG""","""4""","""33""","""65"""
1,Albania,"""AL""","""ALB""","""8""","""41""","""20"""
2,Algeria,"""DZ""","""DZA""","""12""","""28""","""3"""
3,American Samoa,"""AS""","""ASM""","""16""","""-14.3333""","""-170"""
4,Andorra,"""AD""","""AND""","""20""","""42.5""","""1.6"""


In [None]:
# Load your data. Load the parquet file
df = pd.read_parquet('processed\country_partner_sitcproduct4digit_year_2021.parquet')

# Filter df to only include location_code for East African countries
east_africa = ['BDI', 'KEN', 'RWA', 'TZA', 'UGA', 'ZMB', 'ZWE']
df = df[df['location_code'].isin(east_africa)]
df.head()

In [None]:
# First inner join on location_id and numeric_code
merged_df = pd.merge(df, country_ids, left_on='location_id', right_on='numeric_code')

# Rename alpha_3_code to 'to'
merged_df.rename(columns={'alpha_3_code': 'to'}, inplace=True)

# Second inner join on partner_id and numeric_code
merged_df = pd.merge(merged_df, country_ids, left_on='partner_id', right_on='numeric_code', suffixes=('', '_partner'))

# Rename alpha_3_code_partner to 'from'
merged_df.rename(columns={'alpha_3_code_partner': 'from'}, inplace=True)

# Drop unnecessary columns
merged_df.drop(columns=['location_id', 'partner_id', 'sitc_eci', 'sitc_coi', 'numeric_code', 'numeric_code_partner'], inplace=True)

# Reorder columns to have 'from' and 'to' at the beginning
columns = ['from', 'to'] + [col for col in merged_df.columns if col not in ['from', 'to']]
eaccountries_trade_df = merged_df[columns]

In [None]:
# Calculate total imports for each location
node_sizes = df.groupby(['product_id', 'location_code', 'partner_code'])['import_value'].sum().to_dict()
# Filter nodes to only include those with > 0
node_sizes = {k: v for k, v in node_sizes.items() if v > 200009683}
node_sizes

In [None]:
# Calculate total imports for each name and partner, and keep top 10
top_10_imports = df.groupby(['product_id', 'partner_code'])['import_value'].sum().nlargest(10).to_dict()
top_10_imports

In [None]:
# Create a directed graph, filtering for top 10
G = nx.from_pandas_edgelist(df[df.apply(lambda row: (row['product_id'], row['partner_code']) in top_10_imports, axis=1)],
                           'product_id', 'partner_code', edge_attr='import_value', create_using=nx.DiGraph())
G.nodes()

In [None]:
G.edges()

In [None]:
def load_data(file_path):
    """
    Load the dataset from a parquet file.

    Args:
    file_path (str): The path to the parquet file.

    Returns:
    pd.DataFrame: The loaded dataframe.
    """
    return pd.read_parquet(file_path)

def format_currency(x, pos):
    """
    Format the value as currency with commas and a dollar sign.

    Args:
    x (float): The value to format.
    pos (int): Position index (required for the formatter but not used here).

    Returns:
    str: The formatted string.
    """
    return f"${x:,.0f}"

def filter_data(df):
    """
    Filter the necessary columns from the dataframe.

    Args:
    df (pd.DataFrame): The input dataframe.

    Returns:
    pd.DataFrame: The filtered dataframe.
    """
    return df[['product_id', 'export_value', 'import_value', 'year',
               'sitc_eci', 'sitc_coi', 'location_code', 'partner_code',
               'sitc_product_code']]

def create_network_graph(df):
    """
    Create a directed network graph from the dataframe.

    Args:
    df (pd.DataFrame): The filtered dataframe.

    Returns:
    nx.DiGraph: The created network graph.
    """
    G = nx.DiGraph()

    for index, row in df.iterrows():
        product = row['product_id']
        partner_id = row['partner_code']
        import_value = row['import_value']

        # Add nodes and weighted edges
        G.add_node(product)
        G.add_node(partner_id)
        G.add_edge(product, partner_id, weight=import_value, label='imports')

    return G


def plot_network_graph(G, partition=None):
    """
    Plot the network graph.

    Args:
    G (nx.DiGraph): The network graph to be plotted.
    partition (dict, optional): The partition of the graph for community detection.
    """
    pos = nx.spring_layout(G)
    plt.figure(figsize=(12, 12))

    if partition:
        colors = [partition[node] for node in G.nodes()]
        nx.draw_networkx_nodes(G, pos, node_size=7000, node_color=colors, cmap=plt.cm.rainbow)
    else:
        nx.draw_networkx_nodes(G, pos, node_size=7000)

    nx.draw_networkx_labels(G, pos, font_size=12, font_weight='bold')

    edge_labels = {(u, v): f"{d['label']} ({d['weight']})" for u, v, d in G.edges(data=True)}
    nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=20)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10)

    plt.title("Comprehensive Network Analysis")
    plt.show()

def calculate_network_statistics(G):
    """
    Calculate and return network statistics.

    Args:
    G (nx.DiGraph): The network graph.

    Returns:
    dict: A dictionary containing network statistics.
    """
    stats = {
        'number_of_nodes': G.number_of_nodes(),
        'number_of_edges': G.number_of_edges(),
        'average_clustering': nx.average_clustering(G),
        'average_shortest_path_length': nx.average_shortest_path_length(G) if nx.is_connected(G.to_undirected()) else 'Graph is not connected',
        'density': nx.density(G)
    }
    return stats

def detect_communities(G):
    """
    Detect communities in the network graph using the Louvain method.

    Args:
    G (nx.DiGraph): The network graph.

    Returns:
    dict: A dictionary with nodes as keys and communities as values.
    """
    partition = community_louvain.best_partition(G.to_undirected())
    return partition

def plot_import_heatmap(df):
    """
    Plots a heatmap of import_value.

    Args:
        df (pd.DataFrame): The filtered dataframe.
    """
    import_matrix = df.pivot(index="product_id", columns="partner_code", values="import_value") # Updated pivot method syntax
    plt.figure(figsize=(14, 10))
    sns.heatmap(import_matrix, annot=True, fmt=".0f", cmap="YlGnBu")
    plt.title("Rwanda's Largest Imports by Partners in 2021")
    plt.xlabel("Partner Code")
    plt.ylabel("Product ID")

    plt.show()

In [None]:
df_filtered = filter_data(df)
df_filtered

In [None]:
# Filter out rows where both imports and exports are equal to zero
df_filtered = df_filtered[(df_filtered['import_value'] != 0) | (df_filtered['export_value'] != 0)]
df_filtered

In [None]:
# Filter out rows where (if imports are equal to zero and exports less than 30000
# Or exports equal to zero and imorts less than 30000)
df_filtered = df_filtered[~(((df_filtered['import_value'] == 0) & (df_filtered['export_value'] < 180000000)) |
                           ((df_filtered['export_value'] == 0) & (df_filtered['import_value'] < 180000000)))]

df_filtered

In [None]:
# Only keep Rwanda's data
df_filtered = df_filtered[df_filtered['location_code'] == 'RWA']
df_filtered

In [None]:
G = create_network_graph(df_filtered)
G.nodes()

In [None]:
# Display the 10 most prominent edges
G.edges(data=True)

In [None]:
#plot_network_graph(G)

In [None]:
partition = detect_communities(G)
#partition

In [None]:
#plot_network_graph(G, partition)

In [None]:
# Check for duplicates in the 'name' column
print(df_filtered['product_id'].duplicated().sum())

# 1. Drop duplicates:
df_filtered = df_filtered.drop_duplicates(subset=['product_id'])
df_filtered

In [None]:
# 2. Aggregate duplicates (e.g., sum imports for the same name):
df_filtered = df_filtered.groupby('product_id').agg({'partner_code': 'first', 'import_value': 'sum'}).reset_index()

# Only keep imports greater than 2000000
df_filtered = df_filtered[df_filtered['import_value'] > 2000000]
plot_import_heatmap(df_filtered)

In [None]:
network_stats = calculate_network_statistics(G)
network_stats

### Community Detection using the Louvain Algorithm 

#### Introduction
The Louvain algorithm is a method for detecting communities in large networks. It is a hierarchical algorithm that optimizes the modularity of the partitions. We apply the Louvain algorithm using the `python-louvain` package on the dataset explored previously focusing on Rwanda's trade relationships.

#### Dataset
The dataset includes the following features:
- `product_id`: Product Id
- `import_value`: Import values
- `export_value`: Export values
- `partner_code`: Identifier for trading partners
- `year`: Year of the data

In [None]:
G_undirected = G.to_undirected()
# Apply the Louvain algorithm
partition = community_louvain.best_partition(G_undirected)

In [None]:
# Set seed for reproducibility 
scipy.random.seed(12345876)
# Function to create a graph for a specific year
def create_year_graph(year):
    G_year = nx.Graph()
    year_data = df[df['year'] == year]
    for _, row in year_data.iterrows():
        G_year.add_node(row['product_id'])
        G_year.add_edge(row['product_id'], row['partner_code'], weight=row['import_value'] + row['export_value'])
    return G_year
# Visualize the network dynamics over the years
years = sorted(df['year'].unique())
for year in years:
    G_year = create_year_graph(year)
    partition = community_louvain.best_partition(G_year)
    pos = nx.spring_layout(G_year)
    plt.figure(figsize=(10, 12))  # Adjust the size as needed

    cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
    nx.draw_networkx_nodes(G_year, pos, partition.keys(), node_size=40, cmap=cmap, node_color=list(partition.values()))
    nx.draw_networkx_edges(G_year, pos, alpha=0.5)
    plt.title(f'Year: {year}')
    plt.show()