In [77]:
# Graph Convolutional networks for Tweet archetype classification
# Maciej Wójcik

# Dependencies
import tensorflow as tf
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import re
import numpy as np
from collections import OrderedDict
from itertools import combinations
from tqdm import tqdm
import math
import networkx as nx
import pickle

def save_to_pickle(obj, file_name):
    with open(file_name, "wb") as f:
        pickle.dump(obj, f)

def fcn_stub(stub):
    return stub

# Create a stemmer
stemmer = SnowballStemmer("english")

# Function for stemming and lemmatization
def stem_and_lemmatize(text:str) -> str:
    """Stems and lemmatizes a given text."""
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess_texts(text_list: pd.DataFrame) -> pd.DataFrame:
    """Processes text to remove all unwanted words and symbols."""

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        full_tweet = ''
        for word in x.split():
            word = stem_and_lemmatize(word)
            full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

def filter_tokens(tokens):
    tokens1 = []
    for token in tokens:
        if (token not in [".",",",";","&","'s", ":", "?", "!","(",")",\
            "'","'m","'no","***","--","...","[","]", " "]):
            tokens1.append(token)
    return tokens1

def word_word_edges(p_ij):
    word_word = []
    cols = list(p_ij.columns)
    cols = [str(w) for w in cols]
    
    for w1, w2 in tqdm(combinations(cols, 2), total=nCr(len(cols), 2)):
        if (p_ij.loc[w1,w2] > 0):
            word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]}))
    return word_word

def nCr(n,r):
    f = math.factorial
    return int(f(n)/(f(r)*f(n-r)))

def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]

In [78]:
# Read and process dataset
text_df = pd.read_csv('twitter_database.csv')
text_df.head()

Unnamed: 0.1,Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype
0,0,5f9f1c36b38e10f823bf2cef,"@eliostruyf So exciting, have fun! 😊",LEGO_Group,2020-10-30 18:23:50.000,,artist
1,1,5f9f1c36b38e10f823bf2ce7,These Brick-O-Lanterns are certainly all treat...,LEGO_Group,2020-10-31 09:00:28.000,,artist
2,2,5f9f1c36b38e10f823bf2d0a,@dentistescabri Nous prenons la sécurité de no...,LEGO_Group,2020-10-30 12:07:58.000,,artist
3,3,5f9f1c36b38e10f823bf2cf5,@Jasmin80212446 😍🎄🥰,LEGO_Group,2020-10-30 16:35:39.000,,artist
4,4,5f9f1c36b38e10f823bf2d07,@ashleydrixey Sounds like a perfect fit for th...,LEGO_Group,2020-10-30 13:09:14.000,,artist


In [79]:
# Preprocess text and drop empty fields
text_df = preprocess_texts(text_df)
text_df = text_df.groupby('archetype').head(1000)
save_to_pickle(text_df, "unprocessed_tweets_df.pickle")
print(len(text_df))

12000


In [88]:
# Tokenize the words
df_ta = pd.DataFrame(columns=["processed_tweet", "archetype"])
for arch in text_df["archetype"].unique():
    dummy = pd.DataFrame(columns=["processed_tweet", "archetype"])
    dummy["processed_tweet"] = text_df[text_df["archetype"] == arch].groupby("archetype").apply(lambda x: (" ".join(x["processed_tweet"])).lower())
    dummy["archetype"] = arch
    df_ta = pd.concat([df_ta, dummy], ignore_index=True)

# Tokenize the dataframe
df_ta['processed_tweet'] = df_ta['processed_tweet'].apply(lambda x: nltk.word_tokenize(x)).apply(lambda x: filter_tokens(x))

# Data vectorization
vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=fcn_stub, preprocessor=fcn_stub)
vectorizer.fit(df_ta['processed_tweet'])
df_tfidf = vectorizer.transform(df_ta['processed_tweet'])
df_tfidf = df_tfidf.toarray()

# Get feature names
vocab = vectorizer.get_feature_names()
vocab = np.array(vocab)

df_tfidf = pd.DataFrame(df_tfidf, columns=vocab)
df_tfidf.head()



Unnamed: 0,a,aa,aah,aarchi,aaron,aayush,ab,abandon,abbey,abbi,...,zombi,zone,zoom,zu,zukunft,zukunftssicherung,zum,zur,zwei,zyciora
0,0.199264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.004686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.177881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.209317,0.0,0.0,0.0,0.002159,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.292803,0.0,0.0,0.0,0.000857,0.0,0.0,0.007835,0.0,0.0,...,0.001306,0.000939,0.000939,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.221512,0.0,0.0,0.001365,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Calculate PMI between words
names = vocab
name_idx = OrderedDict((name, 0) for name in names)
word_to_index = OrderedDict((name, index) for index, name in enumerate(names))

# Get the co-occurrences
occurrences = np.zeros((len(names), len(names)), dtype=np.int32)
windows_count = 0
window = 10             # Sliding window size, for calculation PMI between words 
for l in tqdm(df_ta['processed_tweet'], total=len(df_ta['processed_tweet'])):
    for i in range(len(l) - window):
        windows_count += 1
        d = set(l[i:(i+window)])
        for w in d:
            name_idx[w] += 1
        for w1, w2 in combinations(d, 2):
            i1 = word_to_index[w1]
            i2 = word_to_index[w2]
            
            occurrences[i1][i2] = 1
            occurrences[i2][i1] = 1
            
# Convert the occurences to PMI
pmi_per_word = pd.DataFrame(occurrences, index=names, columns=names) / windows_count
pmi_index = pd.Series(name_idx, index=name_idx.keys()) / windows_count

# Free memory
del occurrences
del name_idx

for col in tqdm(pmi_per_word.columns):
    pmi_per_word[col] = pmi_per_word[col]/pmi_index[col]

for row in tqdm(pmi_per_word.index):
    pmi_per_word.loc[row, :] = pmi_per_word.loc[row, :] / pmi_index[row]
    
pmi_per_word = pmi_per_word + 1E-9
for col in tqdm(pmi_per_word.columns):
    pmi_per_word[col] = pmi_per_word[col].apply(lambda x: math.log(x))

100%|██████████| 12/12 [00:09<00:00,  1.31it/s]
100%|██████████| 11727/11727 [00:03<00:00, 3876.73it/s]
100%|██████████| 11727/11727 [00:05<00:00, 2322.67it/s]
100%|██████████| 11727/11727 [00:43<00:00, 267.74it/s]


In [6]:
# Build a graph
graph = nx.Graph()
graph.add_nodes_from(df_tfidf.index)
graph.add_nodes_from(vocab)

# Build document-word edges
document_word = [(doc,w,{"weight":df_tfidf.loc[doc,w]}) for doc in tqdm(df_tfidf.index, total=len(df_tfidf.index))\
                     for w in df_tfidf.columns]

word_word = word_word_edges(pmi_per_word)
graph.add_edges_from(document_word)
graph.add_edges_from(word_word)

100%|██████████| 12/12 [00:01<00:00,  9.07it/s]
100%|██████████| 68755401/68755401 [07:06<00:00, 161123.87it/s]


In [7]:
# Export every needed structure
save_to_pickle(graph, "text_graph.pickle")
save_to_pickle(df_ta, "tweet_archetype_df.pickle")

In [4]:
# Read data
with open('text_graph.pickle', "rb") as f:
    graph = pickle.load(f)
print("Graph loaded.")

Graph loaded.


In [91]:
# GCN - implementation and training
# Create A matrix and hat_A
A = nx.to_numpy_matrix(graph, weight="weight")
A = A + np.eye(graph.number_of_nodes())

degs = []
for deg in tqdm(graph.degree(weight=None)):
    if deg == 0:
        degs.append(0)
    else:
        degs.append(deg[1]**(-0.5))
degs = np.diag(degs)
X = np.eye(graph.number_of_nodes())
hat_A = np.matmul(np.matmul(degs, A), degs)
inp = X  # Net input

100%|██████████| 11739/11739 [00:00<00:00, 1212344.19it/s]


In [92]:
# Load the tweet pickle
with open('unprocessed_tweets_df.pickle', "rb") as f:
    df_tweet = pickle.load(f)

archetype_dict = {'archetype': 
                  {'artist': 0,
                  'caregiver': 1,
                  'everyman': 2,
                  'explorer': 3,
                  'guru': 4,
                  'hero': 5,
                  'innocent': 6,
                  'jester': 7,
                  'magician': 8,
                  'rebel': 9,
                  'ruler': 10,
                  'seducer':11}
                 }

df_tweet = df_tweet.replace(archetype_dict)
df_tweet = df_tweet.reset_index()

# Split the testing dataset
test_indices = []
for arch in tqdm(df_tweet["archetype"].unique()):
    tmp = df_tweet[df_tweet["archetype"] == arch]
    if len(tmp) >= 4:
        test_indices.extend(list(np.random.choice(tmp.index, size=round(0.1*len(tmp)), replace=False)))
print(f"Finished processing test indices: {test_indices}")

selected = []
for i in tqdm(range(len(df_ta))):
    if i not in test_indices:
        selected.append(i)
print("Finished selecting.")

100%|██████████| 12/12 [00:00<00:00, 1357.01it/s]
100%|██████████| 12/12 [00:00<00:00, 5919.28it/s]

Finished processing test indices: [74, 276, 107, 77, 401, 277, 251, 60, 430, 64, 249, 397, 38, 364, 279, 150, 682, 53, 735, 325, 373, 82, 659, 889, 178, 317, 623, 293, 925, 261, 318, 284, 597, 181, 697, 742, 470, 760, 479, 780, 749, 36, 784, 964, 974, 591, 468, 437, 492, 907, 176, 956, 687, 242, 126, 639, 387, 834, 835, 606, 485, 624, 647, 572, 620, 368, 848, 575, 13, 265, 714, 63, 928, 149, 795, 840, 455, 350, 340, 102, 940, 695, 666, 290, 844, 573, 405, 862, 264, 998, 708, 415, 577, 151, 797, 6, 333, 754, 203, 534, 1220, 1170, 1649, 1056, 1322, 1400, 1114, 1871, 1023, 1415, 1043, 1492, 1353, 1570, 1629, 1088, 1812, 1384, 1793, 1759, 1115, 1943, 1729, 1228, 1364, 1351, 1836, 1069, 1905, 1583, 1440, 1889, 1464, 1621, 1097, 1965, 1226, 1388, 1222, 1772, 1631, 1435, 1514, 1398, 1008, 1489, 1540, 1021, 1936, 1925, 1562, 1561, 1206, 1816, 1521, 1880, 1994, 1268, 1212, 1834, 1070, 1252, 1896, 1238, 1337, 1901, 1437, 1617, 1754, 1102, 1169, 1090, 1289, 1153, 1959, 1062, 1283, 1452, 1541, 118




In [93]:
# Save test indices and seleced ones
save_to_pickle(test_indices, "test_indices.pickle")
save_to_pickle(selected, "selected.pickle")

import torch
import torch.nn as nn
import torch.nn.functional as F

# Operations on selected inputs
inp_selected = inp[selected]
inp_selected = torch.from_numpy(inp_selected).float()
inp_selected = torch.tensor(inp_selected, device=torch.device('cuda'))
labels_selected = [l for idx, l in enumerate(df_tweet["archetype"]) if idx in selected]
inp_not_selected = inp[test_indices]
inp_not_selected = torch.from_numpy(inp_not_selected).float()
labels_not_selected = [l for idx, l in enumerate(df_tweet["archetype"]) if idx not in selected]
inp = torch.from_numpy(inp).float()

  inp_selected = torch.tensor(inp_selected, device=torch.device('cuda'))


IndexError: index 11742 is out of bounds for axis 0 with size 11739

In [70]:
# Model creation
device = torch.device('cuda:0')

class GCN(nn.Module):
    def __init__(self, X_size, A_hat, args, bias=True): # X_size = num features
        super(GCN, self).__init__()
        self.A_hat = torch.tensor(A_hat, requires_grad=False, device=device).float()
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, args['hidden_size_1']))
        var = 2./(self.weight.size(1)+self.weight.size(0))
        self.weight.data.normal_(0,var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(args['hidden_size_1'], args['hidden_size_2']))
        var2 = 2./(self.weight2.size(1)+self.weight2.size(0))
        self.weight2.data.normal_(0,var2)
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(args['hidden_size_1']))
            self.bias.data.normal_(0,var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(args['hidden_size_2']))
            self.bias2.data.normal_(0,var2)
        else:
            self.register_parameter("bias", None)
        self.fc1 = nn.Linear(args['hidden_size_2'], args['num_classes'])
        
    def forward(self, X): ### 2-layer GCN architecture
        X = torch.mm(X, self.weight)
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)
    
def evaluate(output, labels_e):
    _, labels = output.max(1); labels = labels.numpy()
    return sum([(e-1) for e in labels_e] == labels)/len(labels)

In [71]:
# Define additional arguments
args = {
    'hidden_size_2': 130, 
    'num_classes': 12, 
    'hidden_size_1': 330
}

net = GCN(X.shape[1], hat_A, args).to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77)

In [72]:
losses_per_epoch, accuracy_per_epoch = [], []
evaluation_trained = []
best_pred = 0.0
import os

for e in range(1000):
        optimizer.zero_grad()
        inp_selected = inp_selected.to(device)
        output = net(inp_selected)
        loss = criterion(output[selected], torch.tensor(labels_selected).long())
        losses_per_epoch.append(loss.item())
        loss.backward()
        optimizer.step()
        if e % 50 == 0:
            ### Evaluate other untrained nodes and check accuracy of labelling
            net.eval()
            with torch.no_grad():
                pred_labels = net(inp_selected)
                trained_accuracy = evaluate(output[selected], labels_selected); 
                #untrained_accuracy = evaluate(pred_labels[test_indices], labels_not_selected)
            evaluation_trained.append((e, trained_accuracy))
            #evaluation_untrained.append((e, untrained_accuracy))
            print("[Epoch %d]: Evaluation accuracy of trained nodes: %.7f" % (e, trained_accuracy))
            #print("[Epoch %d]: Evaluation accuracy of test nodes: %.7f" % (e, untrained_accuracy))
            print("Labels of trained nodes: \n", output[selected].max(1)[1])
            net.train()
            if trained_accuracy > best_pred:
                best_pred = trained_accuracy
                torch.save({
                    'epoch': e + 1,\
                    'state_dict': net.state_dict(),\
                    'best_acc': trained_accuracy,\
                    'optimizer' : optimizer.state_dict(),\
                    'scheduler' : scheduler.state_dict(),\
                }, os.path.join("./data/" ,\
                    "test_model_best_%d.pth.tar" % e))
        if (e % 250) == 0:
            save_to_pickle(losses_per_epoch, "test_losses_per_epoch_%d.pkl" % e)
            #save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_untrained)
            torch.save({
                    'epoch': e + 1,\
                    'state_dict': net.state_dict(),\
                    'best_acc': trained_accuracy,\
                    'optimizer' : optimizer.state_dict(),\
                    'scheduler' : scheduler.state_dict(),\
                }, os.path.join("./data/",\
                    "test_checkpoint_%d.pth.tar" % e))
        scheduler.step()
    

RuntimeError: mat1 dim 1 must match mat2 dim 0