In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the preprocessed trends file. Later cells index it by trend topic and
# iterate each value as a list of post texts.
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector; JSON is read as UTF-8 regardless of
# the platform's default encoding.
with open("../trends-data/processed_trends.json", encoding="utf-8") as trends_file:
    trends = json.load(trends_file)

In [4]:
# Single checkpoint id shared by the tokenizer and the model so the two
# cannot drift apart.
NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Token-classification pipeline; `nlp` is what the later cells call on each post.
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

# Smoke test on a sentence containing one person name and one location.
example = "My name is Wolfgang and I live in Berlin"
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [5]:
# Peek at the posts stored under the first trend topic.
first_topic = list(trends)[0]
trends[first_topic]

['Days until SpiderManAcrossTheSpiderVerse releases ',
 'More explosive 2D FX I animated from the recent SpiderManAcrossTheSpiderVerse trailers Out very soon Spiderman2099 2DFX ',
 'SonyPicturesUK So looking forward to this SpiderManAcrossTheSpiderVerse spiderverse',
 'SpiderMan Across the SpiderVerse Trailer  SpidermanAcrossTheSpiderverse Spiderman Marvel Sony Trailer',
 'When you have enough points to get the the Future Sense colllectors box for free Just pay shipping GFUEL SpiderMan SpiderManAcrossTheSpiderVerse GFuelEnergy GammaLabs ',
 'SpiderManAcrossTheSpiderVerse will end on a great cliffhanger I was very satisfied after The Empire Strikes Back And hopefully This is our Empire Joaquim Dos Santos CoDirector ',
 'deleted trailer shows new look at Ben Reilly aka the Scarlet Spider AcrossTheSpiderVerse SpiderManAcrossTheSpiderVerse SpiderVerse leak scarletspider benreilly itsv atsv intothespiderverse MilesMorales 3CFilmss SpiderLeaks ',
 'PENI PARKER APPEAR IN ACROSS THE SPIDERVERS

In [13]:
# Run the NER pipeline over every post of every trend topic and flatten the
# hits into one record per tagged token: the topic it came from, the token
# text, and its entity label.
treds_entities = [
    {"trend": topic, "word": hit["word"], "entity": hit["entity"]}
    for topic, posts in trends.items()
    for post in posts
    for hit in nlp(post)
]

In [25]:
# One row per tagged token: columns `trend`, `word`, `entity`.
df = pd.DataFrame(treds_entities)

# Keep person tags only (B-PER / I-PER) and discard tokens that begin with "#".
# NOTE(review): the "#" filter presumably targets "##" continuation word
# pieces; leading pieces still get through, so `word` may be a fragment
# rather than a full name (the pipeline is built without an aggregation
# strategy) - confirm this is acceptable.
is_person = df["entity"].isin(["B-PER", "I-PER"])
starts_with_hash = df["word"].astype(str).str.startswith("#")
person_mentions = df[is_person & ~starts_with_hash]

# Collapse to one row per trend holding that trend's most frequent person token.
df_for_pinterest = (
    person_mentions
    .groupby(["trend"])["word"]
    .apply(lambda words: words.value_counts().index[0])
    .reset_index()
)

In [26]:
def drop_hashtag(x: str) -> str:
    """Return ``x`` without its leading ``#``, if it has one.

    Only a single leading ``#`` is removed. Any other text, including the
    empty string, is returned unchanged.

    Args:
        x: Trend name, with or without a leading hashtag.

    Returns:
        ``x[1:]`` when ``x`` starts with ``#``, otherwise ``x`` itself.
    """
    # startswith() is safe on "", whereas indexing x[0] raises IndexError.
    if x.startswith("#"):
        return x[1:]
    return x

# Strip the leading "#" from each trend name. Mapping over the single column
# avoids building a row object per record just to read one field, and it
# always yields a Series, including when the frame has no rows.
df_for_pinterest["trend_without_hashtag"] = df_for_pinterest["trend"].map(drop_hashtag)

In [29]:
# Search query = most frequent person token, a space, then the hashtag-free
# trend name.
df_for_pinterest["pinterest_search"] = df_for_pinterest["word"].str.cat(
    df_for_pinterest["trend_without_hashtag"], sep=" "
)
df_for_pinterest

Unnamed: 0,trend,word,trend_without_hashtag,pinterest_search
0,#911LoneStar,Ta,911LoneStar,Ta 911LoneStar
1,#911onFOX,Buck,911onFOX,Buck 911onFOX
2,#AdamKutnerPowerPlay,Adam,AdamKutnerPowerPlay,Adam AdamKutnerPowerPlay
3,#AgustD_SUGA_Tour_in_LA,A,AgustD_SUGA_Tour_in_LA,A AgustD_SUGA_Tour_in_LA
4,#AllForCITY,Matt,AllForCITY,Matt AllForCITY
...,...,...,...,...
550,hanbin,De,hanbin,De hanbin
551,jack antonoff,Matt,jack antonoff,Matt jack antonoff
552,lorde,Sasha,lorde,Sasha lorde
553,luke hemmings,Luke,luke hemmings,Luke luke hemmings
