In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [4]:
df_features = pd.read_csv('data/features/features_interpolated.csv', encoding='latin-1', engine='python')
df_features.drop(columns=['Unnamed: 0'], inplace=True)
immigration = pd.read_csv('data/labels/OECD_acquisition_data_interpolated.csv', encoding='latin-1', engine='python')
immigration.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
print(df_features.head())
print(immigration.head())

  Country  Year  Carbon Emissions  Education Expenditure  \
0     AUS  2000          339446.6               4.887310   
1     AUS  2001          345645.0               4.889552   
2     AUS  2002          353371.3               4.891794   
3     AUS  2003          352581.1               4.894036   
4     AUS  2004          365808.0               4.896278   

   Foreign Direct Investment (FDI) Inflows           GDP  Health Expenditure  \
0                             1.489298e+10  4.158513e+11            7.599617   
1                             1.071713e+10  3.793582e+11            7.682723   
2                             1.465632e+10  3.955808e+11            7.878076   
3                             8.985246e+09  4.674980e+11            7.882926   
4                             4.290767e+10  6.143264e+11            8.090034   

   Inflation Rate  Internet Penetration  Life Expectancy  \
0        4.457435             46.756116        79.234146   
1        4.407135             52.68926

For each year, create a graph based on the similarities of the countries

In [6]:
# get all features names except country and year
features = df_features.columns[2:]
years = immigration['Year'].unique()
countries = immigration['CO2'].unique()

print(features)
print(years)
print(countries)

Index(['Carbon Emissions', 'Education Expenditure',
       'Foreign Direct Investment (FDI) Inflows', 'GDP', 'Health Expenditure',
       'Inflation Rate', 'Internet Penetration', 'Life Expectancy',
       'Renewable Energy Production', 'Unemployment Rate'],
      dtype='object')
[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020]
['GBR' 'BEL' 'GRC' 'CHE' 'SVN' 'ITA' 'MEX' 'DEU' 'CHL' 'USA' 'FRA' 'POL'
 'LUX' 'HUN' 'NOR' 'FIN' 'IRL' 'SWE' 'ESP' 'DNK' 'CAN' 'ISL' 'AUT' 'AUS'
 'NLD' 'LVA' 'NZL']


In [7]:
# create a threshold for each feature similarity score to be considered
threshold_ce = 0.5
threshold_ee = 0.5
threshold_fdi = 0.5
threshold_gpd = 0.5
threshold_he = 0.5
threshold_ir = 0.5
threshold_ip = 0.5
threshold_le = 0.5
threshold_rep = 0.5
threshold_ur = 0.5

# Set the minimum number of threshold crossings required
min_threshold_crossings = 5

similarity_thresholds = [threshold_ce, threshold_ee, threshold_fdi, threshold_gpd, threshold_he,
                         threshold_ir, threshold_ip, threshold_le, threshold_rep, threshold_ur]

In [8]:
def compute_similarities(df,k=2):
    # similarity_ce = cosine_similarity(df['Carbon Emissions'].values.reshape(-1, 1), df['Carbon Emissions'].values.reshape(-1, 1))
    # similarity_ee = cosine_similarity(df['Education Expenditure'].values.reshape(-1, 1), df['Education Expenditure'].values.reshape(-1, 1))
    # similarity_fdi = cosine_similarity(df['Foreign Direct Investment (FDI) Inflows'].values.reshape(-1, 1), df['Foreign Direct Investment (FDI) Inflows'].values.reshape(-1, 1))
    # similarity_gpd = cosine_similarity(df['GDP'].values.reshape(-1, 1), df['GDP'].values.reshape(-1, 1))
    # similarity_he = cosine_similarity(df['Health Expenditure'].values.reshape(-1, 1), df['Health Expenditure'].values.reshape(-1, 1))
    # similarity_ir = cosine_similarity(df['Inflation Rate'].values.reshape(-1, 1), df['Inflation Rate'].values.reshape(-1, 1))
    # similarity_ip = cosine_similarity(df['Internet Penetration'].values.reshape(-1, 1), df['Internet Penetration'].values.reshape(-1, 1))
    # similarity_le = cosine_similarity(df['Life Expectancy'].values.reshape(-1, 1), df['Life Expectancy'].values.reshape(-1, 1))
    # similarity_rep = cosine_similarity(df['Renewable Energy Production'].values.reshape(-1, 1), df['Renewable Energy Production'].values.reshape(-1, 1))
    # similarity_ur = cosine_similarity(df['Unemployment Rate'].values.reshape(-1, 1), df['Unemployment Rate'].values.reshape(-1, 1))
    
    # similarity_ce = cdist(df['Carbon Emissions'].values.reshape(-1, 1), df['Carbon Emissions'].values.reshape(-1, 1), metric='canberra')
    # similarity_ee = cdist(df['Education Expenditure'].values.reshape(-1, 1), df['Education Expenditure'].values.reshape(-1, 1), metric='canberra')
    # similarity_fdi = cdist(df['Foreign Direct Investment (FDI) Inflows'].values.reshape(-1, 1), df['Foreign Direct Investment (FDI) Inflows'].values.reshape(-1, 1), metric='canberra')
    # similarity_gpd = cdist(df['GDP'].values.reshape(-1, 1), df['GDP'].values.reshape(-1, 1), metric='canberra')
    # similarity_he = cdist(df['Health Expenditure'].values.reshape(-1, 1), df['Health Expenditure'].values.reshape(-1, 1), metric='canberra')
    # similarity_ir = cdist(df['Inflation Rate'].values.reshape(-1, 1), df['Inflation Rate'].values.reshape(-1, 1), metric='canberra')
    # similarity_ip = cdist(df['Internet Penetration'].values.reshape(-1, 1), df['Internet Penetration'].values.reshape(-1, 1), metric='canberra')
    # similarity_le = cdist(df['Life Expectancy'].values.reshape(-1, 1), df['Life Expectancy'].values.reshape(-1, 1), metric='canberra')
    # similarity_rep = cdist(df['Renewable Energy Production'].values.reshape(-1, 1), df['Renewable Energy Production'].values.reshape(-1, 1), metric='canberra')
    # similarity_ur = cdist(df['Unemployment Rate'].values.reshape(-1, 1), df['Unemployment Rate'].values.reshape(-1, 1), metric='canberra')
    
    # return similarity_ce, similarity_ee, similarity_fdi, similarity_gpd, similarity_he, \
    #         similarity_ir, similarity_ip, similarity_le, similarity_rep, similarity_ur
    
    # Create a NearestNeighbors object
    neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')

    # Fit the data to the NearestNeighbors model
    neighbors.fit(df.values)

    # Compute the distances and indices of the k-nearest neighbors
    distances, indices = neighbors.kneighbors(df.values)

    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(df), len(df)))

    # Fill the similarity matrix with k-NN similarities
    for i in range(len(df)):
        for j in indices[i]:
            if i != j:
                similarity_matrix[i, j] = 1.0

    return similarity_matrix


In [19]:
graphs=[]
k = 15

# for each year create the graph
for year in years:
    print(year)
    # empty graph
    graph = nx.Graph()
    
    df = df_features[df_features['Year'] == year]
    
    # Create a MinMaxScaler object
    scaler = MinMaxScaler()

    # Normalize the features before computing similarity
    normalized_features = scaler.fit_transform(df[['Carbon Emissions', 'Education Expenditure', 'Foreign Direct Investment (FDI) Inflows',
                                                'GDP', 'Health Expenditure', 'Inflation Rate', 'Internet Penetration',
                                                'Life Expectancy', 'Renewable Energy Production', 'Unemployment Rate']])

    # Create a new DataFrame with the normalized features
    df_normalized = pd.DataFrame(normalized_features, columns=['Carbon Emissions', 'Education Expenditure', 'Foreign Direct Investment (FDI) Inflows',
                                                            'GDP', 'Health Expenditure', 'Inflation Rate', 'Internet Penetration',
                                                            'Life Expectancy', 'Renewable Energy Production', 'Unemployment Rate'])

    # Compute similarity using the normalized features
    # similarity_ce, similarity_ee, similarity_fdi, similarity_gpd, similarity_he, \
    #         similarity_ir, similarity_ip, similarity_le, similarity_rep, similarity_ur = compute_similarities(df_normalized)
    similarity_matrix = compute_similarities(df_normalized,k)
            
    # Add countries as nodes to the graph and add all their features as node attributes
    for i, row in df.iterrows():
        graph.add_node(row['Country'], x=[row['Carbon Emissions'],
                                          row['Education Expenditure'],
                                          row['Foreign Direct Investment (FDI) Inflows'],
                                          row['GDP'],
                                          row['Health Expenditure'],
                                          row['Inflation Rate'],
                                          row['Internet Penetration'],
                                          row['Life Expectancy'],
                                          row['Renewable Energy Production'],
                                          row['Unemployment Rate']])

        
    # Add edges based on similarity scores and threshold crossings
    num_countries = len(countries)
    for i in range(num_countries):
        for j in range(i + 1, num_countries):
            country1 = countries[i]
            country2 = countries[j]
            # Check if the nodes already exist in the graph
            if country1 in graph.nodes() and country2 in graph.nodes():
                # similarity_score = [similarity_ce[i][j], similarity_ee[i][j], similarity_fdi[i][j], similarity_gpd[i][j], similarity_he[i][j],
                #                      similarity_ir[i][j], similarity_ip[i][j], similarity_le[i][j], similarity_rep[i][j], similarity_ur[i][j]]

                # Count the number of threshold crossings
                # threshold_crossings = sum(score > threshold for score, threshold in zip(similarity_scores, similarity_thresholds))

                # if threshold_crossings >= min_threshold_crossings:
                
                similarity_score = similarity_matrix[i, j]
            
                if similarity_score > 0:
                    edge_value = immigration[(immigration['CO2'] == country1) & (immigration['COU'] == country2) & (immigration['Year'] == year)]['Value'].values[0]
            
                    # graph.add_edge(country1, country2, weight=sum(similarity_score), edge_value=edge_value)  # Add edge with maximum similarity score as weight and edge value
                    graph.add_edge(country1, country2, weight=similarity_score, edge_value=edge_value)  # Add edge with maximum similarity score as weight and edge value

    
    # Calculate the number of edges and nodes in the graph
    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    # Calculate the maximum number of edges possible in an undirected graph
    max_edges = (num_nodes * (num_nodes - 1)) / 2

    # Calculate the sparsity of the graph
    sparsity = num_edges / max_edges

    # Print the sparsity
    print("Graph Sparsity:", sparsity)
    
    graphs.append(graph)

2000
Graph Sparsity: 0.43304843304843305
2001
Graph Sparsity: 0.5071225071225072
2002
Graph Sparsity: 0.5356125356125356
2003
Graph Sparsity: 0.43304843304843305
2004
Graph Sparsity: 0.4444444444444444
2005
Graph Sparsity: 0.4415954415954416
2006
Graph Sparsity: 0.45014245014245013
2007
Graph Sparsity: 0.43874643874643876
2008
Graph Sparsity: 0.48717948717948717
2009
Graph Sparsity: 0.452991452991453
2010
Graph Sparsity: 0.4586894586894587
2011
Graph Sparsity: 0.4700854700854701
2012
Graph Sparsity: 0.43304843304843305
2013
Graph Sparsity: 0.43874643874643876
2014
Graph Sparsity: 0.4586894586894587
2015
Graph Sparsity: 0.4415954415954416
2016
Graph Sparsity: 0.4472934472934473
2017
Graph Sparsity: 0.4358974358974359
2018
Graph Sparsity: 0.43304843304843305
2019
Graph Sparsity: 0.43874643874643876
2020
Graph Sparsity: 0.452991452991453


In [67]:
graphs[0]
df_normalized

Unnamed: 0,Carbon Emissions,Education Expenditure,Foreign Direct Investment (FDI) Inflows,GDP,Health Expenditure,Inflation Rate,Internet Penetration,Life Expectancy,Renewable Energy Production,Unemployment Rate
0,0.079918,0.649351,0.282224,0.065569,0.393326,0.446822,0.861146,0.959583,0.021922,0.154116
1,0.013108,0.426407,0.261103,0.019527,0.506579,0.427782,0.711663,0.808465,0.347856,0.246935
2,0.018972,0.781385,0.280949,0.02441,0.424194,0.363871,0.722171,0.856215,0.023025,0.25394
3,0.120134,0.448052,0.349908,0.084276,0.469827,0.550844,0.723944,0.908008,0.187095,0.359019
4,0.007421,0.458874,0.255995,0.033278,0.478558,0.0,0.834156,1.0,0.223632,0.152364
5,0.018632,0.545048,0.26606,0.012514,0.277982,0.77197,0.617572,0.640186,0.230525,0.524518
6,0.136161,0.337662,0.365037,0.181813,0.553515,0.486521,0.667411,0.783678,0.118985,0.018389
7,0.005826,0.709957,0.265139,0.016003,0.407985,0.2489,0.966921,0.820626,0.399559,0.148862
8,0.049489,0.322511,0.314628,0.060189,0.398136,0.491703,0.766885,0.950641,0.120364,1.0
9,0.008113,0.601732,0.280665,0.011666,0.316404,0.31577,0.722899,0.859434,0.513167,0.372154


In [21]:
import torch
from torch_geometric.utils.convert import from_networkx


In [34]:
pyg_graph = from_networkx(graphs[0])
pyg_graph

tensor([[ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,
          2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
          5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
          6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,
          8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
         13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
         14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
         16, 16, 17, 17, 17,

In [55]:
migration_graphs = []
for year in immigration["Year"].unique():
    migration_g = nx.from_pandas_edgelist(df=immigration[immigration["Year"] == year], source="CO2", target="COU", edge_attr="Value")
    migration_graphs.append(migration_g)

In [66]:
migration_graphs[0].nodes(data=True)

NodeDataView({'GBR': {}, 'BEL': {}, 'GRC': {}, 'CHE': {}, 'SVN': {}, 'ITA': {}, 'MEX': {}, 'DEU': {}, 'CHL': {}, 'USA': {}, 'FRA': {}, 'POL': {}, 'LUX': {}, 'HUN': {}, 'NOR': {}, 'FIN': {}, 'IRL': {}, 'SWE': {}, 'ESP': {}, 'DNK': {}, 'CAN': {}, 'ISL': {}, 'AUT': {}, 'AUS': {}, 'NLD': {}, 'LVA': {}, 'NZL': {}})

In [57]:
pyg_graphs = [from_networkx(g) for g in migration_graphs]

In [65]:
pyg_graphs[0]

Data(edge_index=[2, 702], Value=[702], num_nodes=27)

In [64]:
train_data = pyg_graphs[:15]
test_data = pyg_graphs[15:]
len(test_data)

6

In [51]:
from torch_geometric.nn import SAGEConv

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model(hidden_channels=32).to(device)

print(model)

Model(
  (encoder): GNNEncoder(
    (conv1): SAGEConv((-1, -1), 32, aggr=mean)
    (conv2): SAGEConv((-1, -1), 32, aggr=mean)
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(in_features=64, out_features=32, bias=True)
    (lin2): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [52]:
import torch.nn.functional as F

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_label_index)
    target = train_data['user', 'movie'].edge_label
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data):
    data = data.to(device)
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [None]:
for epoch in range(1, 301):
    train_data = train_data.to(device)
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}')