### Imports

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np 
from scipy.stats import norm
from sklearn.manifold import MDS
import plotly.express as px
import torch 
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from torch.nn import init
from torch.optim.lr_scheduler import StepLR

### Code

In [None]:
# Load the adjacency matrix. The first CSV column has no header, so pandas
# names it 'Unnamed: 0'; the remaining columns are treated as accounts and
# each row as one user (see accountNum / userNum further down).
df = pd.read_csv('adj-matrix-US.csv', lineterminator='\n', low_memory=False).iloc[0:]

# Row-oriented view: first cell of every row -> the rest of that row.
# If two rows share the same first cell, the later row wins.
adj = {row.iloc[0]: row.iloc[1:].tolist() for _, row in df.iterrows()}

# Column-oriented view: column name -> that column's values for every row,
# skipping the leading label column.
matrix = {
    name: df[name].tolist()
    for name in df.columns
    if name != 'Unnamed: 0'
}

In [None]:
# accountNames = df.columns.str.strip('\r')[1:]
# encoder = OneHotEncoder()
# accountNames = np.array(accountNames).reshape(-1,1)
# encodedNames = encoder.fit_transform(accountNames)



In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(one_hot_account, followers)`` pairs.

    Args:
        data: mapping whose keys identify accounts and whose values are
            equal-length numeric sequences (one entry per user, judging by
            how the notebook builds ``matrix`` -- confirm against the caller).
    """

    def __init__(self, data):
        self.data = data
        # Freeze the key order once so an integer index always refers to the
        # same account.
        self.accountList = list(data.keys())

    def __len__(self):
        """Return the number of accounts."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            A tuple ``(input_account, followers)`` where ``input_account`` is
            a float32 one-hot vector of length ``len(self)`` with a 1 at
            ``idx``, and ``followers`` is the tensor built from the values
            stored for that account.

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        account = self.accountList[idx]
        followers = np.asarray(self.data[account])

        # One-hot encoding. The keys are unique, so the position of `account`
        # in accountList is `idx` itself -- no need for a linear .index() scan.
        input_account = torch.zeros(len(self.accountList))
        input_account[idx] = 1

        # input_account is already a tensor; wrapping it in torch.tensor()
        # again would only copy it and emit a UserWarning.
        return input_account, torch.tensor(followers)

In [None]:
class secondOption(nn.Module):
    """Embedding model: account -> 3-d embedding -> hidden layer -> sigmoid scores.

    Args:
        input_dimension: number of accounts. Sizes the embedding table (one
            row per account) and the width of the hidden layer.
        emb_dimension: only used to scale the uniform initialisation range
            (``1 / emb_dimension``); the embedding width itself is fixed at 3.
        hidden_dimension: stored on the module but not used to size a layer.
        output_dimension: number of output units.
    """

    def __init__(self, input_dimension, emb_dimension, hidden_dimension, output_dimension):
        super().__init__()

        self.input_dimension = input_dimension
        self.emb_dimension = emb_dimension
        self.hidden_dimension = hidden_dimension

        # Layer shapes are kept exactly as before so that previously saved
        # state dicts still load.
        self.u_embeddings = nn.Embedding(input_dimension, 3)
        self.hidden_layer = nn.Linear(3, input_dimension)
        self.output_layer = nn.Linear(input_dimension, output_dimension)

        initrange = 1.0 / self.emb_dimension

        init.uniform_(self.u_embeddings.weight, -initrange, initrange)
        nn.init.xavier_uniform_(self.hidden_layer.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        """Score every output unit for the given account(s).

        Args:
            x: either one-hot rows of shape ``(batch, input_dimension)`` (any
                numeric dtype) or a tensor of account indices of shape
                ``(batch,)``.

        Returns:
            Tensor of shape ``(batch, output_dimension)`` with values in
            ``[0, 1]``.
        """
        # nn.Embedding expects *indices*. Feeding the one-hot vector itself
        # looks up only rows 0 and 1, and averaging those lookups gives the
        # same vector for every account, so the output would not depend on
        # the input. Recover the account index from the one-hot row instead.
        if x.dim() > 1:
            indices = x.argmax(dim=-1)
        else:
            indices = x.long()

        emb_center = self.u_embeddings(indices)
        hidden = torch.relu(self.hidden_layer(emb_center))
        output = self.output_layer(hidden)

        return torch.sigmoid(output)


In [None]:
class OpinionMeasurementModel(nn.Module):
    """Three stacked linear layers ending in a sigmoid.

    Pipeline: ``input_dimension -> input_dimension`` ("embedding" layer),
    then ``-> hidden_dimension`` with ReLU, then ``-> output_dimension`` with
    a sigmoid. ``emb_dimension`` only sets the uniform initialisation range
    of the first layer (``1 / emb_dimension``).
    """

    def __init__(self, input_dimension, emb_dimension, hidden_dimension, output_dimension):
        super().__init__()

        self.input_dimension = input_dimension
        self.emb_dimension = emb_dimension
        self.hidden_dimension = hidden_dimension

        # Build all layers first, then initialise them, in this exact order,
        # so the random number stream is consumed the same way every time.
        self.u_embeddings = nn.Linear(input_dimension, input_dimension)
        self.hidden_layer = nn.Linear(input_dimension, hidden_dimension)
        self.output_layer = nn.Linear(hidden_dimension, output_dimension)

        bound = 1.0 / self.emb_dimension
        nn.init.uniform_(self.u_embeddings.weight, -bound, bound)
        for layer in (self.hidden_layer, self.output_layer):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return sigmoid scores of shape ``(..., output_dimension)`` for ``x``."""
        projected = self.u_embeddings(x)
        activated = F.relu(self.hidden_layer(projected))
        return torch.sigmoid(self.output_layer(activated))

# NOTE: `device` is defined here but nothing in this cell moves the model or
# the batches onto it.
device = torch.device('cpu')

# One sample per account column of the adjacency matrix.
dataset = Dataset(matrix)
batch_size = 1000
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Every column except the leading label column is an account; every row is a user.
accountNum = len(df.columns) - 1
userNum = len(df['Unnamed: 0'])

# Alternative architecture, kept for reference:
# model = OpinionMeasurementModel(accountNum, 128, 256, userNum)
model = secondOption(
    input_dimension=accountNum,
    emb_dimension=128,
    hidden_dimension=256,
    output_dimension=userNum,
)

# The model ends in a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)

# Multiply the learning rate by `scheduler_gamma` every `scheduler_step_size`
# calls to scheduler.step().
scheduler_step_size = 10
scheduler_gamma = 0.1
scheduler = StepLR(optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)


In [None]:
# Train for a fixed number of epochs, stepping the LR scheduler once per epoch
# and reporting the mean per-batch loss.
num_epochs = 50
for epoch in range(num_epochs):
    batch_losses = []

    for account, followers in data_loader:
        optimizer.zero_grad()

        # The embedding layer needs integer inputs; the targets must be float
        # for BCELoss.
        outputs = model(account.int())
        loss = criterion(outputs, followers.float())

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    scheduler.step()

    total_loss = sum(batch_losses)
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}')

In [1]:
# Persist only the learned parameters (the state dict), not the module object.
model_path = 'wang_model_model.pth'
torch.save(obj=model.state_dict(), f=model_path)



NameError: name 'torch' is not defined

In [None]:
# Restore the parameters written to `model_path`; the model must have been
# built with the same layer shapes as when the checkpoint was saved.
saved_state = torch.load(model_path)
model.load_state_dict(saved_state)

In [None]:
from sklearn.preprocessing import StandardScaler

def extract_embeddings(model):
    """Return the hidden layer's weight matrix as a NumPy array.

    Args:
        model: module exposing a ``hidden_layer`` with a ``weight`` parameter.

    Returns:
        ``numpy.ndarray`` holding ``model.hidden_layer.weight`` (one row per
        hidden unit).
    """
    # detach() is the supported way to drop autograd tracking (unlike .data),
    # and cpu() makes the conversion work even if the model lives on a GPU.
    embeddings = model.hidden_layer.weight.detach().cpu().numpy()
    # embeddings = model.u_embeddings.weight.detach().cpu().numpy()

    return embeddings

def pca(embeddings):
    """Scatter-plot ``embeddings`` projected onto their first two principal components.

    Args:
        embeddings: 2-D array-like, one row per point.
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(embeddings)

    first_axis = projected[:, 0]
    second_axis = projected[:, 1]
    plt.scatter(first_axis, second_axis)

    plt.title('Embeddings Visualization using PCA')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.show()

# With the extract_embeddings defined in this cell, this plots the rows of the
# hidden layer's weight matrix.
embeddings = extract_embeddings(model)
pca(embeddings)
    

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

def extract_embeddings(model):
    """Return the embedding table's weight matrix as a NumPy array.

    Args:
        model: module exposing ``u_embeddings`` with a ``weight`` parameter.

    Returns:
        ``numpy.ndarray`` holding ``model.u_embeddings.weight``.
    """
    # detach() is the supported way to drop autograd tracking (unlike .data),
    # and cpu() makes the conversion work even if the model lives on a GPU.
    embeddings = model.u_embeddings.weight.detach().cpu().numpy()
    return embeddings

def pca(embeddings):
    """Scatter-plot a 2-component PCA of ``embeddings``, min-max scaled per axis.

    Args:
        embeddings: 2-D array-like, one row per point.
    """
    projected = PCA(n_components=2).fit_transform(embeddings)

    # Rescale each principal component independently to the [0, 1] range.
    rescaled = MinMaxScaler().fit_transform(projected)
    first_axis = rescaled[:, 0]
    second_axis = rescaled[:, 1]

    plt.scatter(first_axis, second_axis)
    plt.title('Embeddings Visualization using PCA (Scaled)')
    plt.xlabel('PC1 (Scaled)')
    plt.ylabel('PC2 (Scaled)')
    plt.show()

# With the extract_embeddings defined in this cell, this plots the rows of the
# u_embeddings weight matrix.
embeddings = extract_embeddings(model)
pca(embeddings)


In [None]:
from sklearn.manifold import MDS

def mds(embeddings):
    """Scatter-plot a 2-D multidimensional-scaling layout of ``embeddings``.

    Args:
        embeddings: 2-D array-like, one row per point; Euclidean distances
            between rows are used as dissimilarities.
    """
    scaler = MDS(n_components=2, dissimilarity='euclidean')
    layout = scaler.fit_transform(embeddings)

    first_axis = layout[:, 0]
    second_axis = layout[:, 1]
    plt.scatter(first_axis, second_axis)

    plt.title('Embeddings Visualization using MDS')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()

# Uses whichever extract_embeddings definition was executed most recently in
# the notebook session.
embeddings = extract_embeddings(model)
mds(embeddings)


In [None]:
# Number of rows in the most recently extracted embedding matrix.
print(len(embeddings))

In [None]:

def tsne(embeddings):
    """Scatter-plot a 2-D t-SNE layout of the standardised ``embeddings``.

    Args:
        embeddings: 2-D array-like, one row per point. Each feature is
            standardised with ``StandardScaler`` before t-SNE is fitted.
    """
    standardised = StandardScaler().fit_transform(embeddings)
    layout = TSNE(n_components=2).fit_transform(standardised)

    first_axis = layout[:, 0]
    second_axis = layout[:, 1]
    plt.scatter(first_axis, second_axis)

    plt.title('Embeddings Visualization using t-SNE')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()

# Uses whichever extract_embeddings definition was executed most recently in
# the notebook session.
embeddings = extract_embeddings(model)
tsne(embeddings)
