In [None]:
import torch
import pandas as pd
import numpy as np
import sys

sys.path.insert(0, "../pipeline/")
import preprocessing
import utils
import matplotlib.pyplot as plt
import json

from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from gcn import GCNModel
from torch_geometric.data import Data
from matplotlib import rcParams

In [None]:
# Seed both RNGs used by this notebook so repeated runs start from the same state.
torch.manual_seed(0)
np.random.seed(0)

profiles = pd.read_csv("../data/new_profiles_200t.csv")
comments = pd.read_csv("../data/new_comments_200t.csv")

comments = comments.drop_duplicates()
profiles = preprocessing.categorical_to_numerical(profiles, col="category_1")
all_users = set(profiles.profile_username.values)

# Feature matrix: every profile column except the label and the username.
data = preprocessing.scale(profiles.drop(columns=["category_1", "profile_username"]).values)
# Pair each feature row with the username taken from the SAME dataframe row.
# Zipping against `all_users` (a set) would be wrong: set iteration order is
# arbitrary and duplicates are collapsed, so rows would be attached to the
# wrong users.
# NOTE(review): assumes preprocessing.scale keeps the input row order — confirm.
name_to_record = dict(zip(profiles.profile_username.values, data))

input_dim, output_dim = data.shape[1], len(profiles.category_1.unique()) + 1
user_to_label = {user: category for user, category in profiles[["profile_username", "category_1"]].values}

In [None]:
# Usernames in dataframe row order; passed to every utils helper below.
authors = profiles.profile_username.values

username_to_index = utils.get_users_indices(authors)

# Keep only comments whose media author AND commenter are both known profiles.
both_known = comments.media_author.isin(authors) & comments.commenter.isin(authors)
interactions = utils.get_interactions(comments[both_known], username_to_index)

x = utils.get_x(authors, name_to_record, input_dim=input_dim)
y = utils.get_y(user_to_label, authors)

edge_index = utils.get_edge_index(interactions)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = Data(x=x, y=y, edge_index=edge_index).to(device)

# NOTE(review): only `data` is moved to `device` here; whether GCNModel places
# its own parameters on the same device is not visible from this notebook.
gcn = GCNModel(x.shape[1], 64, 5, lr=0.005, n_hidden_layers=2)
history = gcn.fit(data, epochs=100)

# Forward pass with apply_activation=False; the result is what the plots use.
embeddings = gcn.forward(data.x, data.edge_index, apply_activation=False)

In [None]:
# Index labels of the profiles in each labelled category (1-4).
# NOTE(review): these labels are later used to index NumPy arrays by position,
# which presumes `profiles` carries a default 0..n-1 index — confirm.
category1, category2, category3, category4 = (
    profiles[profiles.category_1 == label].index.values for label in (1, 2, 3, 4)
)

group_indices = [category1, category2, category3, category4]

In [None]:
def reduce_data(data, n_dim=2):
    """Project ``data`` onto ``n_dim`` principal components.

    Args:
        data: Array-like accepted by ``PCA.fit_transform``.
        n_dim: Number of components to keep (default 2).

    Returns:
        The transformed array returned by ``PCA.fit_transform``.
    """
    # random_state is pinned so repeated calls use the same PCA seed.
    projector = PCA(n_components=n_dim, random_state=0)
    return projector.fit_transform(data)


def plot_2d(data, group_indices, legends):
    """Draw one 2-D scatter series per group and show the figure.

    Args:
        data: Array indexable as ``data[group]`` whose first column is used
            as x and the remaining column(s) as y.
        group_indices: Iterable of index arrays, one per group.
        legends: Legend labels, in the same order as ``group_indices``.
    """
    for members in group_indices:
        points = data[members]
        plt.scatter(points[:, :1], points[:, 1:])
    plt.legend(legends)
    plt.show()
    

def plot_3d(data, group_indices, legends):
    """Draw one 3-D scatter series per group and show the figure.

    Args:
        data: Array indexable as ``data[group]``; column 0 is x, column 1 is
            y and the remaining column(s) are z.
        group_indices: Iterable of index arrays, one per group.
        legends: Legend labels, in the same order as ``group_indices``.
    """
    ax = plt.figure().add_subplot(111, projection='3d')

    for members in group_indices:
        points = data[members]
        ax.scatter(points[:, :1], points[:, 1:2], points[:, 2:])
    plt.legend(legends)
    plt.show()

In [None]:
# 2-D PCA view of the GCN embeddings, one scatter series per category.
# The graph was moved to `device` before training, so the embeddings may live
# on the GPU; Tensor.numpy() only works on CPU tensors, hence the explicit
# .cpu() (a no-op when the tensor is already on the CPU).
data = reduce_data(embeddings.detach().cpu().numpy(), n_dim=2)
plot_2d(data, group_indices, ["Politics", "Sport", "Music", "Show"])

In [None]:
# 3-D PCA view of the GCN embeddings, one scatter series per category.
# The graph was moved to `device` before training, so the embeddings may live
# on the GPU; Tensor.numpy() only works on CPU tensors, hence the explicit
# .cpu() (a no-op when the tensor is already on the CPU).
data = reduce_data(embeddings.detach().cpu().numpy(), n_dim=3)
plot_3d(data, group_indices, ["Politics", "Sport", "Music", "Show"])

In [None]:
# Baseline: 2-D PCA of the scaled profile features themselves (no GCN),
# plotted with the same category grouping.
feature_matrix = profiles.drop(columns=["category_1", "profile_username"]).values
data = reduce_data(preprocessing.scale(feature_matrix), n_dim=2)
plot_2d(data, group_indices, ["Politics", "Sport", "Music", "Show"])

In [None]:
# Baseline: 3-D PCA of the scaled profile features themselves (no GCN),
# plotted with the same category grouping.
feature_matrix = profiles.drop(columns=["category_1", "profile_username"]).values
data = reduce_data(preprocessing.scale(feature_matrix), n_dim=3)
plot_3d(data, group_indices, ["Politics", "Sport", "Music", "Show"])

In [None]:
rcParams["figure.figsize"] = (12, 6)
# Display name -> numeric code stored in profiles.category_1.
category_to_numerical = {"Unknown Users": 0, "Politics": 1, "Sport": 2, "Music": 3, "Show": 4}

df_values = profiles.drop(["profile_username"], axis=1)
is_unknown = df_values.category_1 == category_to_numerical["Unknown Users"]
df_category = df_values[is_unknown]
# NOTE(review): scaling is applied to this subset only; if preprocessing.scale
# standardises columns to zero mean, every bar below will be ~0 — confirm.
scaled_values = preprocessing.scale(df_category.values)
df_category = pd.DataFrame(scaled_values, columns=df_category.columns)
df_category = df_category.drop(["category_1", "is_tracked"], axis=1)

# One bar per remaining feature column: the mean of its scaled values.
x = df_category.columns
y = [np.mean(df_category[col].values) for col in x]

plt.bar(x, y)
plt.xticks(rotation=90, fontsize=16)
plt.yticks(fontsize=16)
plt.title("Unknown Users", fontsize=16)
plt.show()

In [None]:
# 2x2 grid of bar charts: mean scaled feature value per labelled category.
df_values = profiles.drop(["profile_username"], axis=1)
fig, axes = plt.subplots(2, 2, sharey=True, figsize=(14, 7))
for i, category in enumerate(["Politics", "Sport", "Music", "Show"]):
    grid_row, grid_col = divmod(i, 2)
    ax = axes[grid_row, grid_col]

    in_category = df_values.category_1 == category_to_numerical[category]
    df_category = df_values[in_category]
    # NOTE(review): scaling is applied per category subset; if
    # preprocessing.scale standardises to zero mean the bars will be ~0 — confirm.
    scaled_values = preprocessing.scale(df_category.values)
    df_category = pd.DataFrame(scaled_values, columns=df_category.columns)
    df_category = df_category.drop(["category_1", "is_tracked"], axis=1)

    x = df_category.columns
    y = [np.mean(df_category[col].values) for col in x]

    ax.bar(x, y)
    ax.set_title(category)

    # Only the bottom row keeps its x tick labels; they are rotated to fit.
    if grid_row != 1:
        ax.set_xticks([])
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)

plt.show()

In [None]:
# Share of profiles per category as a pie chart.
legends = ["Unknown Users", "Politics", "Sport", "Music", "Show"]

# Set the font size BEFORE any artist is created: text objects pick up
# rcParams at creation time, so assigning it after plt.pie() would leave the
# first run on the default size and only affect re-runs.
rcParams["font.size"] = 14

# groupby(...).count() yields a one-column frame, so .values is (n, 1);
# plt.pie requires 1-D input, hence ravel().
values = (
    profiles[["category_1", "profile_username"]]
    .groupby("category_1")
    .count()
    .values
    .ravel()
)

plt.pie(values, labels=legends, autopct="%.2f", startangle=45, explode=[0, 0.1, 0.15, 0.2, 0.25])
plt.title("Percentage per Category", fontsize=16)
plt.show()

In [None]:
# Load the saved training histories and keep the "GCNModel" entry.
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving it open until garbage collection.
with open("../data/results/histories/models_histories_100e_2l_64u.json") as history_file:
    gcn_trace = json.load(history_file)["GCNModel"]

In [None]:
# Plot the loaded "GCNModel" history against its entry index.
steps = range(len(gcn_trace))
plt.plot(steps, gcn_trace)
plt.title("Best Model Training Error")
plt.show()