In [11]:
conda list

# packages in environment at C:\Users\neema\anaconda3:
#
# Name                    Version                   Build  Channel
_ipyw_jlab_nb_ext_conf    0.1.0            py39haa95532_0  
absl-py                   1.0.0                    pypi_0    pypi
alabaster                 0.7.12             pyhd3eb1b0_0  
anaconda                  2021.11                  py39_0  
anaconda-client           1.9.0            py39haa95532_0  
anaconda-navigator        2.1.1                    py39_0  
anaconda-project          0.10.1             pyhd3eb1b0_0  
anyio                     2.2.0            py39haa95532_2  
appdirs                   1.4.4              pyhd3eb1b0_0  
argh                      0.26.2           py39haa95532_0  
argon2-cffi               20.1.0           py39h2bbff1b_1  
arrow                     0.13.1           py39haa95532_0  
asn1crypto                1.4.0                      py_0  
astroid                   2.6.6            py39haa95532_0  
astropy                   4.3.

mypy_extensions           0.4.3            py39haa95532_0  
navigator-updater         0.2.1            py39haa95532_0  
nbclassic                 0.2.6              pyhd3eb1b0_0  
nbclient                  0.5.3              pyhd3eb1b0_0  
nbconvert                 6.1.0            py39haa95532_0  
nbformat                  5.1.3              pyhd3eb1b0_0  
nest-asyncio              1.5.1              pyhd3eb1b0_0  
networkx                  2.6.3              pyhd3eb1b0_0  
nltk                      3.6.5              pyhd3eb1b0_0  
nose                      1.3.7           pyhd3eb1b0_1006  
notebook                  6.4.5            py39haa95532_0  
numba                     0.54.1           py39hf11a4ad_0  
numexpr                   2.7.3            py39hb80d3ca_1  
numpy                     1.20.3           py39ha4e8547_0  
numpy-base                1.20.3           py39hc2deb75_0  
numpydoc                  1.1.0              pyhd3eb1b0_1  
oauthlib                  3.2.0         

In [36]:
pip install tensorflow_hub

Collecting tensorflow_hubNote: you may need to restart the kernel to use updated packages.
  Using cached tensorflow_hub-0.12.0-py2.py3-none-any.whl (108 kB)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.12.0



In [37]:
import itertools
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_hub as hub
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler

In [15]:
def split_text_into_sentences(text):
    """Break ``text`` into sentences with a regular expression.

    A boundary is a run of one or more spaces that is preceded by ``.`` or
    ``?`` and followed by an uppercase ASCII letter. The character two
    positions before that punctuation mark must not be an uppercase ASCII
    letter, so a short capitalised abbreviation such as "Mr." stays attached
    to the text that follows it.

    Returns the list of pieces; the separating spaces are dropped.
    """
    sentence_boundary = re.compile(r'(?<=[^A-Z].[.?]) +(?=[A-Z])')
    return sentence_boundary.split(text)

In [16]:
def get_average_similarity(model, ps1, ps2):
    """Return the mean cosine similarity over every pairing of texts from ``ps1`` and ``ps2``.

    This function uses a Sentence Transformers model.

    Args:
        model: Object exposing ``encode(texts)`` that returns one embedding per text.
        ps1: First list of post texts.
        ps2: Second list of post texts.

    Returns:
        The mean of the pairwise cosine-similarity matrix (the median could be
        used instead). ``nan`` is returned when the embeddings cannot be
        computed or compared; the error and both inputs are printed in that case.
    """
    try:
        sembs_1 = model.encode(ps1)  # Sentence embeddings for the first list of posts
        sembs_2 = model.encode(ps2)  # Sentence embeddings for the second list of posts

        # A single call produces the full len(ps1) x len(ps2) similarity matrix,
        # so there is no need to compare one row at a time.
        pairwise_scores = cosine_similarity(sembs_1, sembs_2)
    except Exception as e:
        print(e)
        print(ps1)
        print(ps2)
        # Return NaN explicitly instead of relying on np.mean([]) and its RuntimeWarning.
        return np.nan
    return np.mean(pairwise_scores)

In [17]:
def get_median_similarity(model, ps1, ps2):
    """Return the median cosine similarity over every pairing of texts from ``ps1`` and ``ps2``.

    Using the Universal sentence encoder model.

    Args:
        model: Callable that maps a list of texts to an object whose
            ``numpy()`` method returns one embedding per text.
        ps1: First list of post texts.
        ps2: Second list of post texts.

    Returns:
        The median of all pairwise cosine similarities. ``nan`` is returned
        when the embeddings cannot be computed or compared; the error is
        printed in that case.
    """
    try:
        sembs_1 = model(ps1).numpy()  # Sentence embeddings for the first list of posts
        sembs_2 = model(ps2).numpy()  # Sentence embeddings for the second list of posts

        # A single call produces the full len(ps1) x len(ps2) similarity matrix.
        pairwise_scores = cosine_similarity(sembs_1, sembs_2)
    except Exception as e:
        print(e)
        # Return NaN explicitly instead of relying on np.median([]) and its RuntimeWarning.
        return np.nan
    return np.median(pairwise_scores)


In [18]:
def get_inverse_distance_similarity(model, ps1, ps2):
    """Score two lists of texts by the inverse distance between their embedding centroids.

    Using the Universal sentence encoder model.

    Args:
        model: Callable that maps a list of texts to an object whose
            ``numpy()`` method returns one embedding per text.
        ps1: First list of post texts.
        ps2: Second list of post texts.

    Returns:
        ``1 / d`` where ``d`` is the ``cdist`` distance between the two
        centroids, ``inf`` when the centroids coincide, and ``0`` when the
        computation fails (the error and both inputs are printed).
    """
    inverse_distance = 0
    try:
        sembs_1 = model(ps1).numpy()  # Sentence embeddings for the first list of posts
        sembs_2 = model(ps2).numpy()  # Sentence embeddings for the second list of posts

        # Treat each list of embeddings as a cluster and summarise it by its centroid.
        sembs_1_centroid = np.mean(sembs_1, axis=0)
        sembs_2_centroid = np.mean(sembs_2, axis=0)

        centroid_gap = distance.cdist([sembs_1_centroid], [sembs_2_centroid])[0][0]
        # Coincident centroids would divide by zero; report infinite similarity explicitly.
        inverse_distance = float('inf') if centroid_gap == 0 else 1 / centroid_gap
    except Exception as e:
        print(e)
        print(ps1)
        print(ps2)
    return inverse_distance

In [19]:
def get_overlap_similarity(model, ps1, ps2):
    """Estimate how much the second embedding cluster overlaps the first.

    This function uses Sentence Transformers model.

    The first list's embeddings are treated as a ball: its centre is their
    centroid and its radius is the largest distance from that centroid to any
    of them. The score is the number of second-list embeddings that fall
    inside that ball, divided by the combined number of embeddings in both
    lists, rounded to four decimals.

    Returns 0 (after printing the error and both inputs) if anything fails.
    """
    try:
        first_embs = model.encode(ps1)   # embeddings for the first list of posts
        second_embs = model.encode(ps2)  # embeddings for the second list of posts

        # Centre and radius of the first cluster.
        first_centroid = np.mean(first_embs, axis=0)
        first_radius = distance.cdist(first_embs, [first_centroid]).max()

        # How far each point of the second cluster lies from the first cluster's centre.
        gaps_to_first = distance.cdist(second_embs, [first_centroid])

        # "Intersection": second-cluster points within the first cluster's radius.
        inside_count = np.count_nonzero(gaps_to_first <= first_radius)
        # "Union": simply the total number of points in both clusters.
        total_count = len(first_embs) + len(second_embs)

        return round(inside_count / total_count, 4)
    except Exception as e:
        print(e)
        print(ps1)
        print(ps2)
    return 0

In [20]:
def get_vito_similarity(model, ps1, ps2, vito_number):
    """Return the distance between the two standardised embedding centroids, divided by ``vito_number``.

    Using the Universal sentence encoder model.

    Note that the value grows as the two sets of embeddings move apart, so a
    smaller result means the sets are closer together.

    Args:
        model: Callable that maps a list of texts to an object whose
            ``numpy()`` method returns one embedding per text.
        ps1: First list of post texts.
        ps2: Second list of post texts.
        vito_number: Normalising constant, e.g. the value returned by
            ``get_vito_number``.

    Returns:
        The normalised centroid distance, or ``0`` when the computation fails
        (the error and both inputs are printed).
    """
    similarity_score = 0
    scaler = StandardScaler()
    try:
        sembs_1 = model(ps1).numpy()  # Sentence embeddings for the first list of posts
        sembs_2 = model(ps2).numpy()  # Sentence embeddings for the second list of posts

        # Fit ONE scaler on both sets together. Standardising each set with its
        # own fit_transform centres each set at the origin, which forces both
        # centroids to ~0 and makes the distance between them meaningless.
        scaler.fit(np.vstack([sembs_1, sembs_2]))
        sembs_1 = scaler.transform(sembs_1)
        sembs_2 = scaler.transform(sembs_2)

        sembs_1_centroid = np.mean(sembs_1, axis=0)
        sembs_2_centroid = np.mean(sembs_2, axis=0)

        similarity_score = distance.cdist([sembs_1_centroid], [sembs_2_centroid])[0][0] / vito_number
    except Exception as e:
        print(e)
        print(ps1)
        print(ps2)
    return similarity_score

In [21]:
def get_vito_number(model, text_corpus):
    """Return the diagonal length of the bounding box around the standardised corpus embeddings.

    The corpus is embedded with ``model``, standardised with a freshly fitted
    ``StandardScaler``, and the per-dimension span (max minus min) is combined
    into a single Euclidean length. This serves as the largest distance
    expected between two points in that standardised space.
    """
    standardised = StandardScaler().fit_transform(model(text_corpus).numpy())
    per_dim_span = standardised.max(axis=0) - standardised.min(axis=0)
    return np.sqrt(np.sum(np.square(per_dim_span)))

In [38]:
def influencer_influencer_similarity(sentence_split=False):
    """Score every pair of influencers by the similarity of their posts and save the result.

    Reads 'preprocessed_data_inf_without_punc.csv', drops rows with missing
    values and rows whose platform is 'website', then compares each unordered
    pair of influencers with ``get_median_similarity`` using the Universal
    Sentence Encoder. One row per pair (influencer_1, influencer_2,
    similarity_score) is written to 'influencer_influencer_similarity.csv'.

    Args:
        sentence_split: When True, every post is split into sentences with
            ``split_text_into_sentences`` before it is embedded.
    """
    df_influencers = pd.read_csv('preprocessed_data_inf_without_punc.csv')
    df_influencers = df_influencers.dropna()
    df_influencers = df_influencers[df_influencers['platform'] != 'website']

    # Loading the Universal Sentence Encoder model.
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    # Getting unique influencer names as a list
    infu_list = df_influencers['influencer_username'].unique().tolist()

    # Collect (and optionally sentence-split) each influencer's texts once,
    # rather than re-filtering the frame for every pair.
    posts_by_influencer = {}
    for infu in infu_list:
        posts = df_influencers.loc[df_influencers['influencer_username'] == infu, 'text'].tolist()
        if sentence_split:
            posts = [stc for post in posts for stc in split_text_into_sentences(post)]
        posts_by_influencer[infu] = posts

    final_scores = []
    try:
        # combinations() over unique names never yields a pair with itself.
        for infu1, infu2 in itertools.combinations(infu_list, 2):
            print('Finding similarity between the influencers 1', infu1, ' and the influencers 2', infu2, '...')
            final_scores.append(
                (infu1, infu2,
                 get_median_similarity(model=use_model,
                                       ps1=posts_by_influencer[infu1],
                                       ps2=posts_by_influencer[infu2])))
    finally:
        # Write the file once instead of on every pair; the finally clause still
        # saves whatever was scored if the run is interrupted part-way.
        df_similarity = pd.DataFrame(final_scores, columns=['influencer_1', 'influencer_2', 'similarity_score'])
        df_similarity.to_csv('influencer_influencer_similarity.csv', index=False)


# Run the pairwise influencer comparison on whole posts (sentence_split defaults to False).
influencer_influencer_similarity()

Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 adamcollard ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 aliceliveing ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 brown.elle ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 charlottedawsy ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 chessieking ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 chloe.khan ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 courtneydblack ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 danosborneofficial ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 gabbydawnallen ...
Finding similarity between the influencers 1 aaroncgshore  and the influencers 2 ini.helen ...
Finding similarit

Finding similarity between the influencers 1 brown.elle  and the influencers 2 charlottedawsy ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 chessieking ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 chloe.khan ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 courtneydblack ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 danosborneofficial ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 gabbydawnallen ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 ini.helen ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 itsalwayshana ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 jamesgshore ...
Finding similarity between the influencers 1 brown.elle  and the influencers 2 jamessmithpt ...
Finding similarity between the inf

Finding similarity between the influencers 1 chloe.khan  and the influencers 2 katiepiper_ ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 korisampson ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 lucymeck1 ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 mac_griffiths_ ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 mattdoesfitness ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 ohpolly ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 oliviadbowen ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 rogersnipes ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 slimmingworld ...
Finding similarity between the influencers 1 chloe.khan  and the influencers 2 sylvijaa ...
Finding similarity between the influencers 1 chlo

Finding similarity between the influencers 1 ini.helen  and the influencers 2 korisampson ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 lucymeck1 ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 mac_griffiths_ ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 mattdoesfitness ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 ohpolly ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 oliviadbowen ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 rogersnipes ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 slimmingworld ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 sylvijaa ...
Finding similarity between the influencers 1 ini.helen  and the influencers 2 thebodycoach ...
Finding similarity between the influencers 1 ini.helen  an

Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 rogersnipes ...
Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 slimmingworld ...
Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 sylvijaa ...
Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 thebodycoach ...
Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 thefitnesschef_ ...
Finding similarity between the influencers 1 jessica_rose_uk  and the influencers 2 _jackfowler_ ...
Finding similarity between the influencers 1 jesswright77  and the influencers 2 katiepiper_ ...
Finding similarity between the influencers 1 jesswright77  and the influencers 2 korisampson ...
Finding similarity between the influencers 1 jesswright77  and the influencers 2 lucymeck1 ...
Finding similarity between the influencers 1 jesswright77  and the influencers 2 mac_griffiths_ ...
Findin

Finding similarity between the influencers 1 rogersnipes  and the influencers 2 thefitnesschef_ ...
Finding similarity between the influencers 1 rogersnipes  and the influencers 2 _jackfowler_ ...
Finding similarity between the influencers 1 slimmingworld  and the influencers 2 sylvijaa ...
Finding similarity between the influencers 1 slimmingworld  and the influencers 2 thebodycoach ...
Finding similarity between the influencers 1 slimmingworld  and the influencers 2 thefitnesschef_ ...
Finding similarity between the influencers 1 slimmingworld  and the influencers 2 _jackfowler_ ...
Finding similarity between the influencers 1 sylvijaa  and the influencers 2 thebodycoach ...
Finding similarity between the influencers 1 sylvijaa  and the influencers 2 thefitnesschef_ ...
Finding similarity between the influencers 1 sylvijaa  and the influencers 2 _jackfowler_ ...
Finding similarity between the influencers 1 thebodycoach  and the influencers 2 thefitnesschef_ ...
Finding similarity bet