In [47]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from torch_geometric.nn import SAGEConv, global_mean_pool
from utils import filling, kmeans_clustering, plot_clusters, get_feature_propagation

In [58]:
# Column layout of the raw node matrix stored in the file, as used by the slicing below.
LAT_COL, LON_COL = 0, 1
FEATURE_COLS = slice(2, 8)   # the six feature columns kept as data.x
COMUNA_COL = 8
TARGET_COL = -2              # second-to-last column, used as the regression target

# NOTE: torch.load unpickles the file, so only load files from a trusted source.
data = torch.load('Data/santiago_zero_ismt.pt')

# Pull the per-node metadata and the target out of the raw matrix *before* data.x is
# narrowed to the feature columns; afterwards those columns are no longer reachable.
data.comuna = data.x[:, COMUNA_COL]
data.y = data.x[:, TARGET_COL]
data.lat = data.x[:, LAT_COL]
data.lon = data.x[:, LON_COL]
data.x = data.x[:, FEATURE_COLS]

# Quick sanity dump of the graph object and its main tensors, one object per line.
print(data, data.x, data.lat, data.comuna, data.y,
      data.edge_index, data.edge_attributes, sep='\n')

# Make sure the target is float32 for the regression code further down.
data.y = data.y.float()

Data(x=[355936, 6], edge_index=[2, 673565], edge_attributes=[673565, 2], comuna=[355936], y=[355936], lat=[355936], lon=[355936])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.5074, -1.1100, -1.6402,  1.2143,  1.6439,  1.6780],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])
tensor([-33.4416, -33.4420, -33.4429,  ..., -33.4390, -33.4389, -33.4386])
tensor([39., 39., 39.,  ..., 42., 42., 42.])
tensor([0.0889, 0.0889, 0.1629,  ..., 0.1620, 0.1620, 0.1620])
tensor([[     0,      1,      2,  ..., 355934, 355935, 355935],
        [  4463,   4467,    488,  ..., 191461, 251187,   7949]])
tensor([[80.8500,  5.8000],
        [18.8300,  1.3000],
        [16.0600,  1.1000],
        ...,
        [33.0400,  4.1000],
        [ 2.5300,  0.2000],
       

In [59]:
# Replace the node features with the output of the project helper `get_feature_propagation`.
# NOTE(review): judging by the captured output (rows that were all zeros come back
# non-zero), the helper presumably fills missing feature rows from graph neighbours —
# confirm against utils.get_feature_propagation.
# This overwrites data.x, so re-running the cell feeds the already-filled features back
# into the helper; reload `data` first if a clean re-run is needed.
data.x = get_feature_propagation(data)

Starting feature filling
tensor([[-1.8762,  0.0880,  1.8812, -0.7552, -2.0797, -1.2788],
        [ 0.2155, -0.1407, -0.2000,  0.7840,  0.9977,  0.7484],
        [ 0.0507, -1.1760, -0.3934,  1.3322,  0.4679,  0.7087],
        ...,
        [ 1.0445, -1.0450, -1.3092,  1.1516,  1.1633,  1.0873],
        [ 1.5074, -1.1100, -1.6402,  1.2143,  1.6439,  1.6780],
        [ 1.2992, -0.1245, -1.1497,  0.2336,  1.0391,  0.9675]])
Feature filling completed. It took: 2.02s


## Train and Test split

In [50]:
# Nodes eligible for the split: every node id that occurs as an endpoint in edge_index.
index_list = torch.unique(data.edge_index.flatten()).tolist()

# Hold out 20% of those nodes for testing; the fixed seed keeps the split reproducible.
train_index, test_index = train_test_split(index_list, test_size=0.2, random_state=42)

print("Training set length:", len(train_index))
print("Test set length:", len(test_index))


n_nodes, n_features = data.x.shape

# Boolean node masks: True at the positions of the nodes assigned to each subset.
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
train_mask[train_index] = True

test_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask[test_index] = True

# Attach the masks to the graph object. Note that the "Train, Val, Test split" cell
# writes the same two keys, so whichever split cell runs last wins.
data['train_mask'] = train_mask
data['test_mask'] = test_mask

Training set length: 284748
Test set length: 71188


## Train, Val, Test split

In [61]:
# Nodes eligible for the split: every node id that occurs as an endpoint in edge_index.
index_list = torch.unique(data.edge_index.flatten()).tolist()

# Share of the nodes assigned to each subset; the test set receives whatever is left.
train_percentage = 0.8
val_percentage = 0.1

# First cut: training nodes versus the held-out remainder.
train_index, remaining_index = train_test_split(
    index_list,
    train_size=train_percentage,
    random_state=42,
)

# Second cut: divide the remainder into validation and test. The validation share is
# re-expressed as a fraction of the remainder rather than of the full node set.
val_index, test_index = train_test_split(
    remaining_index,
    train_size=val_percentage / (1 - train_percentage),
    random_state=42,
)

print("Training set length:", len(train_index))
print("Validation set length:", len(val_index))
print("Test set length:", len(test_index))

n_nodes, n_features = data.x.shape

# Boolean node masks: True at the positions of the nodes assigned to each subset.
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
train_mask[train_index] = True

val_mask = torch.zeros(n_nodes, dtype=torch.bool)
val_mask[val_index] = True

test_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask[test_index] = True

# Attach the masks to the graph object so later cells can read data.train_mask etc.
data['train_mask'] = train_mask
data['val_mask'] = val_mask
data['test_mask'] = test_mask

Training set length: 284748
Validation set length: 35594
Test set length: 35594


In [62]:
# Assemble one table per node: the six feature columns, the target, and the train/test
# masks as trailing columns. The masks end up as numeric columns, which is what the
# `== 1` filters in the next cell rely on. The validation mask is not included here.
data_tensor = torch.cat(
    (
        data.x,
        data.y[:, None],
        data.train_mask[:, None],
        data.test_mask[:, None],
    ),
    dim=1,
)
df_ismt = pd.DataFrame(
    data_tensor.numpy(),
    columns=[
        'beautiful','boring','depressing','lively','safe','wealthy',
        'pct_hog40p', 'train', 'test',
    ],
)

In [63]:
# Columns used as model inputs; 'pct_hog40p' is the regression target.
feature_cols = ['beautiful','boring','depressing','lively','safe','wealthy']

# Select the rows flagged by the train / test mask columns.
train_split = df_ismt.loc[df_ismt['train'] == 1]
test_split = df_ismt.loc[df_ismt['test'] == 1]

# Feature matrices and target vectors for each subset.
X_train = train_split[feature_cols]
y_train = train_split['pct_hog40p']

X_test = test_split[feature_cols]
y_test = test_split['pct_hog40p']

## Regression Models

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [24]:
# Standardize the features. The scaler's statistics are estimated on the training rows
# only and then applied unchanged to the test rows, so no test information enters the fit.
# X_train / X_test are overwritten with the scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
# Baseline 1: ordinary least-squares regression fitted on the training split.
reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split and score the predictions.
y_pred = reg.predict(X_test)

mse_error = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae_error = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2_score_error = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse_error:.4f}")
print(f"Mean Absolute Error: {mae_error:.4f}")
print(f"R2 Score: {r2_score_error:.4f}")

Mean Squared Error: 0.0564
Mean Absolute Error: 0.2046
R2 Score: 0.1067


In [67]:
%%time
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

mse_error = mean_squared_error(y_test, y_pred)
mae_error = mean_absolute_error(y_test, y_pred)
r2_score_error = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse_error:.4f}")
print(f"Mean Absolute Error: {mae_error:.4f}")
print(f"R2 Score: {r2_score_error:.4f}")

Mean Squared Error: 0.0694
Mean Absolute Error: 0.1779
R2 Score: -0.0985
CPU times: total: 3.38 s
Wall time: 8.75 s
