# Introduction

.....


In [None]:
!which python

Check to see if jupyter lab uses the correct python interpreter with '!which python'.
It should be something like '/opt/anaconda3/envs/[environment name]/bin/python' (on Mac).
If not, try this: https://github.com/jupyter/notebook/issues/3146#issuecomment-352718675

# Install dependencies:

In [None]:
# Toggle for the one-off environment setup below. While this is True, every
# execution of the cell runs all eight `conda install` commands again.
install_packages = True
if install_packages:
    # Each line shells out to conda; `-y` answers the confirmation prompt.
    # NOTE(review): `!conda` runs in a subshell, so packages land in whatever
    # conda environment that shell resolves - presumably the kernel's
    # environment, but confirm with the interpreter check at the top.
    !conda install tensorflow=2 -y
    !conda install -c anaconda pandas -y
    !conda install -c conda-forge tensorflow-hub -y
    !conda install -c akode html2text -y
    !conda install -c conda-forge tqdm -y
    !conda install -c anaconda scikit-learn -y
    !conda install -c conda-forge matplotlib -y
    !conda install -c anaconda seaborn -y

# Imports

In [1]:
#imports
import pandas as pd
import numpy as np
import os
import time
import tensorflow as tf
import tensorflow_hub as hub
import zipfile
from html2text import HTML2Text
from tqdm import tqdm
import re
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import seaborn as sns

# Set pandas print options
This will improve readability of printed pandas dataframe.


In [8]:
# Lift every pandas display limit (rows, columns, total width, cell width) by
# setting each option to None, so printed frames are never truncated.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

## Set global Parameters
Set your parameters here:

data_path: In this path put the data you have downloaded with YouTube Data Tools. 
output_path: The files generated in this notebook will be saved here.

url_dict: URLs to models on Tensorflow hub are saved here. Other models are available there.
model_type: Define which model you would like to use. Choose one from url_dict

new_embeddings: If this is true, new embeddings will be generated and saved at output_path. Otherwise, embeddings are loaded from disk.




In [10]:
# --- Input / output locations -------------------------------------------
# Tab-separated comment export that is read in the "Load Data" section.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# before running elsewhere.
data_path = '/home/daniel/uni/semester2/critical-social-media-analysis/data/comments.tab'
# Directory that receives the preprocessed frame and the embeddings.
output_path = "./output/"

# --- Cache switch ---------------------------------------------------------
# True: recompute everything and save it; False: load earlier results from output_path.
new_embeddings = True

# --- Model selection ------------------------------------------------------
# TF-Hub handles of the available sentence encoders, keyed by model type.
url_dict = dict(
    Transformer="https://tfhub.dev/google/universal-sentence-encoder-large/5",
    DAN="https://tfhub.dev/google/universal-sentence-encoder/4",
    Transformer_Multilingual="https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
)

model_type = 'Transformer' #@param ['DAN','Transformer','Transformer_Multilingual']

## Create output directory
Try to create the directory defined by output_path

In [12]:
# Make sure the output directory exists. An existing directory is the normal
# case on every run after the first, so it is reported as such instead of as a
# failure. `os.makedirs` also creates missing parent directories.
if os.path.isdir(output_path):
    print("Directory %s already exists" % output_path)
else:
    try:
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(output_path, exist_ok=True)
    except OSError as error:
        # Best effort as before: report the problem rather than abort the run.
        print("Creation of the directory %s failed: %s" % (output_path, error))
    else:
        print("Successfully created the directory %s " % output_path)


Creation of the directory ./output/ failed


# Load Data
Load your data as a pandas dataframe

In [5]:
# Read the tab-separated comment export; its first row holds the column names.
# Skipped when cached results are used (new_embeddings is False).
if new_embeddings:
    data = pd.read_csv(data_path, sep='\t', header=0)
    # Jupyter only auto-renders the last *top-level* expression of a cell, so
    # a bare `data.head()` inside this `if` would be silently discarded.
    # `display` is provided by the IPython kernel.
    display(data.head())

# Preprocessing
Preprocess your data:
- Drop empty rows
- Drop unused columns

In [6]:
# Drop comments without text or author, then drop the columns that the rest of
# the notebook never reads.
if new_embeddings:
    data = data.dropna(subset=['text', 'authorName']) # drop rows with no content
    unused_columns = ['id', 'replyCount', 'likeCount', 'authorChannelUrl',
                      'authorChannelId', 'isReplyTo', 'isReplyToName']
    # errors='ignore' keeps the cell re-runnable: on a second execution the
    # columns are already gone and a plain drop would raise KeyError.
    data = data.drop(columns=unused_columns, errors='ignore') # drop unused columns
    # An expression nested inside `if` is not auto-rendered; display explicitly.
    display(data.head())

- remove HTML-tags, links and usernames

In [7]:
# Build the 'cleaned' column from 'text' in three passes: convert HTML to plain
# text, strip links, strip user names.
if new_embeddings:
    # Remove HTML tags
    tqdm.pandas()  # registers Series.progress_apply
    h = HTML2Text()
    h.ignore_links = True
    data['cleaned'] = data['text'].progress_apply(h.handle)
    print( "Removed HTML Tags.")

    # Remove links
    # regex=True is passed explicitly: without it, pandas >= 2.0 treats the
    # pattern as a literal string and no link would ever be removed.
    http_link_pattern = r'http\S+'
    bitly_link_pattern = r'bit\.ly/\S+'  # dot escaped so only a literal "bit.ly/" matches
    data['cleaned'] = data['cleaned'].str.replace(http_link_pattern, '', regex=True)
    data['cleaned'] = data['cleaned'].str.replace(bitly_link_pattern, '', regex=True)
    print( "Removed Links.")

    # Remove user names
    keep_names = ["earth", "Tide", "Geologist", "A Person", "Titanic", "adventure", "Sun", "The United States Of America"] # user names we want to keep
    user_names = [name for name in data['authorName'].unique() if (len(name) > 3 and name not in keep_names)]
    # Regex alternation takes the first alternative that matches, so longer
    # names must come first; otherwise a name that is a prefix of another
    # would leave the remainder of the longer name behind.
    user_names.sort(key=len, reverse=True)
    if user_names:
        user_name_pattern = '|'.join(map(re.escape, user_names))
        data['cleaned'] = data['cleaned'].str.replace(user_name_pattern, '', regex=True)
    print( "Removed user names.")

100%|██████████| 1124/1124 [00:00<00:00, 5626.32it/s]
Removed HTML Tags.
Removed Links.
Removed user names.


# Save or Load preprocessed data

Save your data after preprocessing, or load preprocessed data from disk.

In [9]:
# Location of the cached, preprocessed frame. os.path.join works whether or
# not output_path ends with a path separator.
preprocessed_path = os.path.join(output_path, 'data_preprocessed' + '.pkl')
if new_embeddings:
    # Fresh run: persist the frame that was just cleaned.
    data.to_pickle(preprocessed_path)
else:
    # Cached run: restore the frame written by an earlier fresh run.
    # NOTE: unpickling can execute arbitrary code - only load files that this
    # notebook wrote itself.
    data = pd.read_pickle(preprocessed_path)
data[:50]

Unnamed: 0,publishedAt,authorName,text,isReply,cleaned
0,2020-10-29 15:24:30,Fa Ri,Bei Minute 12:04 am besten Vorspulen (dankt mir später),0,Bei Minute 12:04 am besten Vorspulen (dankt mir später)\n\n
1,2020-10-29 15:54:16,Fa Ri,Und bei 40:55,1,Und bei 40:55\n\n
2,2020-06-24 19:56:24,Thomas Hoffmann,Unerträglicher Müll..,0,Unerträglicher Müll..\n\n
3,2020-04-27 16:50:46,Samuel Johanns,"&quot;Der Wettbewerb um die besten Ideen, der ist jetzt eröffnet.&quot;<br /><br />Besser kann man &quot;Wir haben wirklich absolut gar keinen Plan zu Lösung&quot; doch gar nicht euphemisieren. 😂",0,"""Der Wettbewerb um die besten Ideen, der ist jetzt eröffnet."" \n \nBesser kann man ""Wir haben wirklich absolut gar keinen Plan zu Lösung"" doch\ngar nicht euphemisieren. 😂\n\n"
4,2020-03-28 14:57:43,Aaron Lewandowski,Animechristian,0,Animechristian\n\n
5,2020-03-28 14:58:05,Aaron Lewandowski,Wow wie schlau du bist,1,Wow wie schlau du bist\n\n
6,2020-03-01 14:26:22,Emme Weckerle,Die Bürger müssen endlich mehr dagegen tun,0,Die Bürger müssen endlich mehr dagegen tun\n\n
7,2020-05-18 21:38:32,Noah Lawrence,Ja aber die Politik muss die weichen stellen,1,Ja aber die muss die weichen stellen\n\n
8,2020-02-28 05:51:18,Hans-Dieter Brune,"Ich bin für die Abschaffung des öffentlich-rechtlichen Rundfunks, weil der öffentlich-rechtliche Rundfunk ein ganz großer Umweltsünder ist. Denn der öffentlich-rechtliche Rundfunk arbeitet nur mit Strom und pustet jede Menge CO2 in die Atmosphäre. Man hätte mehrere Vorteile:<br /><br />1. Man würde jede Menge CO2 sparen und der Plan der Klimaziele würde übererfüllt.<br /><br />2. Man hätte der Bevölkerung den Zwangsbeitrag erspart und würde damit die Ausbeutung der Bevölkerung etwas verringern.",0,"Ich bin für die Abschaffung des öffentlich-rechtlichen Rundfunks, weil der\nöffentlich-rechtliche Rundfunk ein ganz großer Umweltsünder ist. Denn der\nöffentlich-rechtliche Rundfunk arbeitet nur mit Strom und pustet jede Menge\nCO2 in die Atmosphäre. Man hätte mehrere Vorteile: \n \n1\. Man würde jede Menge CO2 sparen und der Plan der Klimaziele würde\nübererfüllt. \n \n2\. Man hätte der Bevölkerung den Zwangsbeitrag erspart und würde damit die\nAusbeutung der Bevölkerung etwas verringern.\n\n"
9,2020-02-26 13:41:09,Emily L.U,Im Unterricht ist Handyverbot aber im Bundestag nicht oder was ... kann ja nicht wahr sein 📵,0,Im Unterricht ist Handyverbot aber im Bundestag nicht oder was ... kann ja\nnicht wahr sein 📵\n\n


# Produce Text Embeddings with Universal Sentence Encoder

## Load Model
Load the model from TF-hub

In [11]:
# Resolve the TF-Hub handle of the configured model. This runs unconditionally,
# so `hub_url` is defined even when the model itself is not loaded.
hub_url = url_dict[model_type]
if new_embeddings:
    # The encoder is only needed when embeddings are recomputed.
    print("Loading model. This will take some time...")
    embed = hub.load(hub_url)

Loading model. This will take some time...


## Embed Documents
Produce embeddings of your documents.

In [None]:
# File that caches the embeddings of the selected model. os.path.join works
# whether or not output_path ends with a path separator.
embeddings_path = os.path.join(output_path, 'embeddings' + model_type + '.npy')
# Number of comments passed to the encoder per call.
batch_size = 200

if new_embeddings:
    if len(data) == 0:
        raise ValueError("No comments left after preprocessing; nothing to embed.")
    # Consecutive runs of `batch_size` rows share one group id.
    batch_ids = np.arange(len(data)) // batch_size
    # Collect the per-batch results and join them once at the end; re-joining
    # the growing result inside the loop copies all earlier rows every time.
    embedded_batches = []
    for _, batch in tqdm(data.groupby(batch_ids), desc="Embedding batches"):
        # The encoder is called with a plain list of strings; its output is
        # converted to NumPy so both branches of this cell yield an ndarray.
        embedded_batches.append(np.asarray(embed(batch['cleaned'].tolist())))
    embeddings = np.concatenate(embedded_batches, axis=0)
    print("The embeddings vector is of fixed length {}".format(embeddings.shape[1]))

    # A plain numeric array needs no pickling; this matches the loader below.
    np.save(embeddings_path, embeddings, allow_pickle=False)
else:
    embeddings = np.load(embeddings_path, allow_pickle=False)

embeddings.shape

## Calculate Similarity Matrix with angular distance

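'Following Cer et al. (2018), we first compute
the sentence embeddings u, v for an STS sentence
pair, and then score the sentence pair similarity
based on the angular distance between the two
embedding vectors $d = -\arccos\left(\frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}\right)$.'

Note: the cell below computes the plain cosine similarity $\frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}$ and does not apply the $\arccos$. Because $-\arccos$ is monotonically increasing, both scores rank the neighbours of a comment in the same order.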

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def cos_sim(input_vectors):
    """Return the pairwise cosine-similarity matrix of `input_vectors`.

    Thin wrapper: forwards `input_vectors` to scikit-learn's
    `cosine_similarity` and returns its result unchanged.
    """
    return cosine_similarity(input_vectors)


# Similarity of every comment embedding with every other one.
embedding_matrix = np.array(embeddings)
cosine_similarity_matrix = cos_sim(embedding_matrix)
print(cosine_similarity_matrix)

# Plots Similarity 
Plot and print a heat map showing the semantic contextual similarity between comments.

In [None]:
import seaborn as sns
def plot_similarity(labels, features, rotation):
    """Draw a heat map of the inner products between all rows of `features`.

    labels   -- tick labels used on both axes
    features -- vectors whose pairwise inner products are plotted
    rotation -- rotation applied to the x tick labels

    The colour scale is fixed to the range 0..1. Nothing is returned; the
    heat map is drawn through seaborn.
    """
    sns.set(font_scale=1.2)
    inner_products = np.inner(features, features)
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    heatmap_axes = sns.heatmap(inner_products, **heatmap_options)
    heatmap_axes.set_xticklabels(labels, rotation=rotation)
    heatmap_axes.set_title("Semantic Textual Similarity")

# Window of comments shown in the heat map: `num_samples` rows from `off_set`.
num_samples = 5
off_set = 10000

# Keep the window inside the data. An offset past the last row would select
# nothing and leave the heat map without any cells.
max_off_set = max(len(data) - num_samples, 0)
if off_set > max_off_set:
    print("off_set {} lies beyond the last window of {} comments; using {} instead".format(
        off_set, num_samples, max_off_set))
    off_set = max_off_set

plot_similarity(data.iloc[off_set:off_set+num_samples]['cleaned'], embeddings[off_set:off_set+num_samples], 90)

# Show neighbours of a comment 
Define which comment to analyze

In [None]:
# Position (not index label) of the comment whose neighbours are shown.
comment_index = 324
# .iloc selects by position, matching the row order of `comment_list` and of
# the embeddings. Plain `[...]` would look up the index *label*, which has
# gaps once rows were dropped during preprocessing and can raise KeyError.
comment = data["cleaned"].iloc[comment_index]
comment_list = data["cleaned"].tolist()
print(comment)

Print similar comments.

In [None]:
def get_top_similar(sentence, sentence_list, similarity_matrix, topN):
    """Return the `topN` entries of `sentence_list` most similar to `sentence`.

    sentence          -- query; must be an element of `sentence_list`
    sentence_list     -- list whose positions correspond to the rows and
                         columns of `similarity_matrix`
    similarity_matrix -- 2-D array of pairwise similarities (higher = closer)
    topN              -- maximum number of results; zero or a negative value
                         yields an empty list

    Results are ordered from most to least similar. The query itself is part
    of the candidates, so it is normally the first result.

    Raises ValueError if `sentence` does not occur in `sentence_list`.
    """
    # A slice like [-0:] would select *everything*, so handle this up front.
    if topN <= 0:
        return []
    # find the index of sentence in list (first occurrence)
    index = sentence_list.index(sentence)
    # get the corresponding row in similarity matrix
    similarity_row = np.array(similarity_matrix[index, :])
    # indices sorted by descending similarity, cut to the first topN
    indices = similarity_row.argsort()[::-1][:topN]
    return [sentence_list[i] for i in indices]


# Look up the 20 nearest comments of the selected comment and print them with
# a 1-based rank.
top_similar_comments = get_top_similar(comment, comment_list, cosine_similarity_matrix, 20)
for rank, similar_comment in enumerate(top_similar_comments, start=1):
    print("Top similar comment {}: {}".format(rank, similar_comment))