In [None]:
import laserembeddings
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import langdetect
from src.Util import *
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid
import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

!python -m laserembeddings download-models
laser=laserembeddings.Laser()

# set accordingly
DEV_MODE = True

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU.')
    device = torch.device("cpu")

In [None]:
def determine_tweet_lang(x):
    """Detect the language of a text with ``langdetect``.

    Parameters
    ----------
    x : str
        Text whose language should be detected.

    Returns
    -------
    str or None
        The code returned by ``langdetect.detect``; ``'en'`` when langdetect
        raises ``LangDetectException``; ``None`` when detection fails with any
        other ordinary exception.
    """
    try:
        return langdetect.detect(x)
    except langdetect.lang_detect_exception.LangDetectException:
        # language not detectable from text (e.g., just numbers given, etc.)
        # the exact language is not important (just used for tokenization), we can assume it was English
        return 'en'
    except Exception:
        # Any other failure that makes it impossible to detect the language.
        # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can still be interrupted.
        return None

def prepare_LASER_dataset(data, inplace=True):
  """Add LASER sentence embeddings and detected languages to a tweet frame.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain a ``'text_tokens'`` column holding tab-separated token ids
      of the ``bert-base-multilingual-cased`` vocabulary, plus whatever columns
      ``transform_labels`` works on (defined outside this cell).
  inplace : bool, default True
      If True, ``data`` itself is modified and returned. If False, a deep copy
      is modified and returned and the caller's frame is left untouched.

  Returns
  -------
  pandas.DataFrame
      The frame with two added columns: ``'laser_embedding'`` (one 1-D array
      per row) and ``'lang'`` (language code used for LASER tokenization).
  """
  # Copy *before* anything touches the frame: transform_labels is called for
  # its effect on `data` (its return value is unused), so copying afterwards
  # would still leak that change into the caller's frame.
  if not inplace:
    data = copy.deepcopy(data)

  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
  transform_labels(data)

  def _decode(text_tokens):
    # token ids -> word pieces -> plain text, with the BERT sentence markers removed
    tokens = tokenizer.convert_ids_to_tokens(text_tokens.split('\t'))
    return tokenizer.convert_tokens_to_string(tokens).replace('[CLS]','').replace('[SEP]','')

  def _normalise_lang(code):
    # determine_tweet_lang may return None; the language is only used for
    # tokenization, so fall back to English as it does for undetectable text.
    if code is None:
      return 'en'
    # strip the regional suffix of Chinese codes ('zh-cn' / 'zh-tw' -> 'zh')
    return code.replace('-cn','').replace('-tw','')

  decoded_tweet_text = data['text_tokens'].apply(_decode).astype(str)
  lang = decoded_tweet_text.apply(determine_tweet_lang).apply(_normalise_lang).astype(str)

  if len(decoded_tweet_text) > 0:
    # One batched call instead of one encoder call per row; embed_sentences
    # returns a 2-D array with one row per sentence.
    embeddings = laser.embed_sentences(decoded_tweet_text.tolist(), lang.tolist())
    laser_embedding = pd.Series(list(embeddings), index=decoded_tweet_text.index)
  else:
    laser_embedding = pd.Series([], index=decoded_tweet_text.index, dtype=object)

  data['laser_embedding']=laser_embedding
  data['lang']=lang

  return data

def scatterplot_tsne(tsne_embeddings, target_labels, title):
  """Show a scatter plot of 2-D points coloured by a binary label.

  Parameters
  ----------
  tsne_embeddings : array supporting ``[:, 0]`` / ``[:, 1]`` indexing
      One row per point; column 0 is plotted on x, column 1 on y.
  target_labels : sequence indexable by position
      For each point, an integer index into the palette: 0 -> red, 1 -> green.
  title : str
      Figure title.
  """
  colors = ['r', 'g']
  n_points = len(tsne_embeddings)
  point_colors = [colors[target_labels[i]] for i in range(n_points)]

  plt.figure(figsize=(10, 10))
  plt.title(title)
  if n_points:
    # A single scatter call with a per-point colour list, instead of creating
    # one matplotlib collection per point.
    plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], c=point_colors)
  plt.show()

In [None]:
# Paths to the train / test files (fields separated by \x01, no header row);
# they have to be filled in before this cell can run.
train_path=''
test_path=''

# Rows kept per split in DEV_MODE, and the seed that makes the subsample repeatable.
DEV_SAMPLE_SIZE = 10000
DEV_SAMPLE_SEED = 0

print('Reading data.')
train_data = pd.read_csv(train_path,sep='\x01',encoding = 'utf-8',names=CONTENT_BASED_COLUMNS+LABELS,
                         header=None,usecols=CONTENT_BASED_COLUMNS + LABELS)
test_data = pd.read_csv(test_path,sep='\x01',encoding = 'utf-8',names=CONTENT_BASED_COLUMNS+LABELS,
                        header=None,usecols= CONTENT_BASED_COLUMNS + LABELS)

if DEV_MODE:
  print('Using reduced samples.')
  # Cap the sample size at the number of available rows: DataFrame.sample
  # raises when asked for more rows than the frame has (replace=False).
  train_data=train_data.sample(min(DEV_SAMPLE_SIZE, len(train_data)), random_state=DEV_SAMPLE_SEED)
  test_data=test_data.sample(min(DEV_SAMPLE_SIZE, len(test_data)), random_state=DEV_SAMPLE_SEED)

print('Preparing train and test data.')
train_data=prepare_LASER_dataset(train_data)
print('Done preparing train data.')
test_data=prepare_LASER_dataset(test_data)
print('Done')

LASER (https://github.com/facebookresearch/LASER) is short for Language-Agnostic SEntence Representations and can be used to generate embeddings for entire sentences that preserve semantic meaning across different languages.
In the example below, the statements about food preferences, although written in different languages, lie close to each other in vector space, which reflects their semantic similarity. The other phrase, which expresses a preference about animals instead, is much farther away.

In [None]:
# Four translations of the same statement plus one unrelated English sentence.
sentences= ['I love pasta.',"J'adore les pâtes.",'Ich liebe Pasta.','Amo le paste.','Dogs are better than cats.']
langs=['en', 'fr', 'de','it','en'] 
embeddings = laser.embed_sentences(sentences,langs)
print(f'The shape of the embeddings {embeddings.shape}')

#illustrates langdetect
# Note that even though language detection on short phrases is far from optimal, this does not matter as LASER only
# needs the information for tokenization (i.e., language AGNOSTIC) and tokenization should not be overly affected.
detected_languages=[langdetect.detect(phrase) for phrase in sentences]

# Pairwise Euclidean distance between every two sentence embeddings.
dist = [
    [np.linalg.norm(row_vec-col_vec) for col_vec in embeddings]
    for row_vec in embeddings
]

# Heat map of the distance matrix, with the sentences as tick labels on both axes.
index = sentences
columns = sentences
df = pd.DataFrame(dist, index=index, columns=columns)
plt.pcolor(df)
# +0.5 puts each tick at the centre of its cell
plt.yticks(np.arange(len(df.index)) + 0.5, df.index)
plt.xticks(np.arange(len(df.columns)) + 0.5, df.columns,rotation=90)
plt.colorbar()
plt.show()


We now embed the different Tweets using LASER and build algorithms to classify engagement probabilities on them.
The intuition is that these algorithms will be able to use the overall semantic meaning encoded in the embeddings to detect which Tweets are more likely to foster certain types of interaction.


In [None]:
# Neighbourhood sizes tried for the k-nearest-neighbour classifier.
knn_grid = {'n_neighbors':[10, 100, 1000]}

# The feature lists depend neither on the label nor on the hyper-parameters,
# so they are built once up front.
train_features = train_data['laser_embedding'].tolist()
test_features = test_data['laser_embedding'].tolist()

# One classifier per (label, parameter setting); metrics are reported on the test split.
for label in LABELS:
  print(f'Current label: {label}')
  for params in ParameterGrid(knn_grid):
    print(params)
    knn_cls = KNeighborsClassifier(n_neighbors=params['n_neighbors'])
    knn_cls.fit(train_features, train_data[label].tolist())
    # NOTE(review): predict() yields hard class labels; if compute_all_metrics
    # expects probabilities, predict_proba would be needed - confirm against src.Util.
    predictions = knn_cls.predict(test_features)
    compute_all_metrics(predictions, test_data[label].tolist())

In [None]:
# Stack the per-row embedding vectors into one 2-D array before handing them to
# torch: building a tensor from a Python list of ndarrays is very slow.
# TensorDataset is referenced through torch.utils.data because it is not part
# of the explicit imports at the top of the notebook.
laser_embeddings=torch.tensor(np.stack(train_data['laser_embedding'].tolist()))
labels=torch.tensor(train_data[LABELS].values.tolist())
train_dataset=torch.utils.data.TensorDataset(laser_embeddings, labels)
laser_embeddings_test=torch.tensor(np.stack(test_data['laser_embedding'].tolist()))
labels_test=torch.tensor(test_data[LABELS].values.tolist())
test_dataset=torch.utils.data.TensorDataset(laser_embeddings_test,labels_test)

We build a small neural network to classify on top of these embeddings.


In [None]:
class Net(torch.nn.Module):
    """Feed-forward classifier on top of fixed-size embeddings.

    Four linear layers (``fc0`` .. ``fc3``) with a LeakyReLU between each pair
    and a sigmoid on the final output, so every output lies in (0, 1).
    """

    def __init__(self,input_size, hidden_size1, hidden_size2,hidden_size3, output_size):
        """Create the layers; the size arguments are the widths of consecutive layers."""
        super().__init__()
        # Layers are registered in the order in which forward() applies them.
        # The notebook instantiates the network with 1024-dimensional inputs.
        self.fc0 = torch.nn.Linear(in_features=input_size, out_features=hidden_size1)
        self.relu1 = torch.nn.LeakyReLU()
        self.fc1 = torch.nn.Linear(in_features=hidden_size1, out_features=hidden_size2)
        self.relu2 = torch.nn.LeakyReLU()
        self.fc2 = torch.nn.Linear(in_features=hidden_size2, out_features=hidden_size3)
        self.relu3 = torch.nn.LeakyReLU()
        self.fc3 = torch.nn.Linear(in_features=hidden_size3, out_features=output_size)
        self.sig = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a batch of inputs to per-output sigmoid activations."""
        hidden = self.relu1(self.fc0(x))
        hidden = self.relu2(self.fc1(hidden))
        hidden = self.relu3(self.fc2(hidden))
        return self.sig(self.fc3(hidden))


# 1024 input features, three hidden layers (256 / 64 / 16) and 4 outputs
# (presumably one per entry of LABELS - confirm).
model = Net(input_size=1024, hidden_size1=256, hidden_size2=64, hidden_size3=16, output_size=4)
model.to(device)
# Binary cross-entropy on the sigmoid outputs, optimised with plain SGD.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
print(model)

We will see that the model merely learns the average probability for each class, thus effectively reaching RCE scores of 0 as it matches the naive baseline.

In [None]:
model.train()
# Number of passes over the training data. Kept separate from the loop variable
# `epoch` so the count is not overwritten while iterating.
num_epochs = 20

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=False)

print('Training started.')
for epoch in range(num_epochs):
  for step, batch in enumerate(train_loader):

    optimizer.zero_grad()
    laser_embedding_tensor=batch[0].to(device).float()
    label_tensor=batch[1].to(device).float()

    # Forward pass
    y_pred = model(laser_embedding_tensor)
    # Compute loss. Predictions and labels are both (batch, n_labels), so no
    # squeeze() is applied: it would drop the batch dimension of a final batch
    # holding a single sample and make BCELoss reject the mismatching shapes.
    loss = criterion(y_pred, label_tensor)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()
  # Reports the loss of the last batch of the epoch only.
  print('Epoch {}: train loss: {}'.format(epoch, loss.item()))

test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

#performing test run

# Put the model (not the optimizer) into evaluation mode once, before the loop.
model.eval()

# One prediction tensor per test sample, in dataset order.
total_predictions=[]
with torch.no_grad():
  for step, batch in enumerate(test_dataloader):
    laser_embedding_tensor=batch[0].to(device).float()
    y_pred = model(laser_embedding_tensor)
    # append() instead of list concatenation, which copies the list every step
    total_predictions.append(y_pred)


Since the results obtained with the different methods do not look promising, we generate t-SNE embeddings of the data points and colour them by class, hoping to reveal a relationship between class membership and embedding vector. We use varying levels of perplexity, as recommended. No clear relationship between embeddings and class membership is visible in the visualizations.

In [None]:
# randomly sample (at most) 1000 datapoints and generate T-SNE visualizations;
# the fixed seed keeps the figures comparable between runs
tsne_data=train_data.sample(min(1000, len(train_data)), random_state=0)
tsne_input=np.array(tsne_data['laser_embedding'].tolist())

# Perplexity has to be strictly positive, so the sweep starts at 5 rather than 0.
perplexities=[5, 10, 20, 30, 40, 50]

# The projection depends only on the embeddings and the perplexity, not on the
# label, so each one is fitted once and reused for every label.
tsne_by_perplexity={
    perplexity: TSNE(n_components=2,perplexity=perplexity,random_state=0).fit_transform(tsne_input)
    for perplexity in perplexities
}

for label in LABELS:
    for i in perplexities:
      scatterplot_tsne(tsne_by_perplexity[i],tsne_data[label].to_numpy(), title=f'Label {label} Perplexity {i}')
