In [1]:
from k11.models.main import SourceMap, LinkStore

In [2]:
import numpy as np 
import pandas as pd 
%matplotlib inline 
import matplotlib.pyplot as plt 

In [3]:
def tag_formatter(s: str) -> str:
    """Return `s` with every "/" separator replaced by "."."""
    pieces = s.split("/")
    return ".".join(pieces)
    

In [4]:
def assumed_tag_formatter(s) -> list:
    """Normalise one or more whitespace-separated tag strings into a list of unique tags.

    Args:
        s: a single string of whitespace-separated tags, a list/tuple of such
            strings (``None`` and empty entries are skipped), or ``None``.

    Returns:
        The distinct tags, each passed through ``tag_formatter``, in the order
        they were first seen. ``None`` or blank input yields ``[]``.
    """
    if s is None:
        return []
    if isinstance(s, (list, tuple)):
        # Drop None / empty entries before merging everything into one string.
        s = " ".join(part for part in s if part)
    # str.split() with no argument splits on any whitespace run and never
    # yields empty strings, so no separate strip/filter pass is needed.
    formatted = tag_formatter(s).split()
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through set() made the result order arbitrary from run to run.
    return list(dict.fromkeys(formatted))

In [5]:
# Spot-check assumed_tag_formatter on two inputs: a list whose strings carry
# leading whitespace and several space-separated tags, and a list with a None entry.
a = assumed_tag_formatter([" art__entertainment.humor", "art__entertainment.celebrity_fan__gossip.bollywood art__entertainment.movies"])
b = assumed_tag_formatter([None, "as"])

In [6]:
# Show both spot-check results side by side.
print(a,b)

['art__entertainment.humor', 'art__entertainment.movies', 'art__entertainment.celebrity_fan__gossip.bollywood'] ['as']


In [7]:
def insert_link(array: list, source_map: SourceMap, link_store: LinkStore):
    """Append one row for `link_store` (with its parent `source_map`) to `array`.

    The row layout is
    ``[source_id, source_name, tags, content_type, is_multiple, link]``, where
    ``tags`` is ``assumed_tag_formatter`` applied to the source-level and
    link-level ``assumed_tags`` together.

    `array` is modified in place and also returned.
    """
    merged_tags = assumed_tag_formatter(
        [source_map.assumed_tags, link_store.assumed_tags]
    )
    row = [
        source_map.source_id,
        source_map.source_name,
        merged_tags,
        link_store.content_type,
        link_store.is_multiple,
        link_store.link,
    ]
    array.append(row)
    return array

In [8]:
def create_frame() -> pd.DataFrame:
    """Build a DataFrame with one row per link of every matching source.

    Queries ``SourceMap.adapter().find({"is_third_party": False})`` and, for
    each entry in every result's ``links``, appends a row via ``insert_link``.

    Returns:
        DataFrame with columns ``Source Id``, ``Source Name``, ``Assumed Tags``,
        ``Content Type``, ``Is Multiple`` and ``Link``.
    """
    # NOTE(review): the result is iterated below, so find() presumably returns
    # a cursor/list of SourceMap objects rather than a single SourceMap.
    source_maps = SourceMap.adapter().find({"is_third_party": False})
    rows = []
    for source_map in source_maps:
        for link_store in source_map.links:
            rows = insert_link(rows, source_map, link_store)
    # Column order mirrors the row layout produced by insert_link.
    # 'Content Type' previously carried a stray leading space (' Content Type'),
    # which made df['Content Type'] raise KeyError.
    columns = [
        'Source Id',
        'Source Name',
        'Assumed Tags',
        'Content Type',
        'Is Multiple',
        'Link',
    ]
    return pd.DataFrame(rows, columns=columns)


In [9]:
# Build the link table once; tags_count and the tag lookup cell further down
# read this notebook-global `df`.
df = create_frame()

In [10]:
# Preview the first 10 rows of the link table.
df.head(10)

Unnamed: 0,Source Id,Source Name,Assumed Tags,Content Type,Is Multiple,Link
0,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,[art__entertainment.humor],article,False,https://www.scoopwhoop.com/category/humor/
1,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, art__entertainment....",article,False,https://www.scoopwhoop.com/category/entertainm...
2,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, travel]",article,False,https://www.scoopwhoop.com/category/travel/
3,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, science.biology.zoo...",article,False,https://www.scoopwhoop.com/category/animals/
4,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, sports]",article,False,https://www.scoopwhoop.com/category/sports/
5,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, society.relationship]",article,False,https://www.scoopwhoop.com/category/relationsh...
6,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, technology__computi...",article,False,https://www.scoopwhoop.com/category/tech/
7,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, food__drink.food, f...",article,False,https://www.scoopwhoop.com/category/food/
8,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, society.people.mens...",article,False,https://www.scoopwhoop.com/category/men/
9,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, society.people.wome...",article,False,https://www.scoopwhoop.com/category/men/


In [11]:
def insert_tag(counter: dict, tag:str, substr: bool = True):
    """Increment the count for `tag` in `counter` and return `counter`.

    Args:
        counter: mapping of tag -> count; updated in place.
        tag: dot-separated tag such as ``"a.b.c"``.
        substr: when True (default) every dotted prefix of `tag`
            (``"a"``, ``"a.b"``, ``"a.b.c"``) is incremented; when False only
            the full tag is.

    Returns:
        The same `counter` object that was passed in.
    """
    if substr:
        levels = tag.split(".")
        # Prefixes from shortest to longest: "a", "a.b", "a.b.c", ...
        keys = [".".join(levels[:depth]) for depth in range(1, len(levels) + 1)]
    else:
        keys = [tag]
    for key in keys:
        counter[key] = counter.get(key, 0) + 1
    return counter

In [12]:
# Scratch example: split a dotted tag into its levels to try out the
# prefix-building expression used inside insert_tag.
tags = "technology__computing.consumer_electronics.smartphones.roid".split(".")

In [13]:
# A slice keeps the result a list (here just the top-level segment).
tags[0:1]

['technology__computing']

In [14]:
# Print every dotted prefix of the example tag, shortest first; this is the
# same slice-and-join insert_tag performs when substr=True.
for index in range(len(tags)):
    t = ".".join(tags[:index+1])
    print(t)

technology__computing
technology__computing.consumer_electronics
technology__computing.consumer_electronics.smartphones
technology__computing.consumer_electronics.smartphones.roid


In [15]:
def tags_count(substr=True, frame=None):
    """Count tag occurrences across the "Assumed Tags" column.

    Args:
        substr: forwarded to ``insert_tag``; when True every dotted prefix of
            a tag is counted, when False only full tags are.
        frame: object whose ``["Assumed Tags"]`` entry iterates over lists of
            tag strings (e.g. the DataFrame from ``create_frame``). Defaults to
            the notebook-global ``df`` so existing calls keep working.

    Returns:
        dict mapping tag (or tag prefix) -> number of occurrences.
    """
    if frame is None:
        # Backward-compatible fallback to the notebook-level table.
        frame = df
    tag_counter = {}
    for tag_list in frame["Assumed Tags"]:
        for tag in tag_list:
            tag_counter = insert_tag(tag_counter, tag, substr=substr)
    return tag_counter


In [16]:
# Prefix-aggregated counts: each tag also increments all of its dotted parents.
tag_counter = tags_count()

In [17]:
# Counts of full tags only (no roll-up into parent prefixes).
tag_counter_dirty = tags_count(substr=False)

In [18]:
# Display the prefix-aggregated counts.
tag_counter

{'art__entertainment': 61,
 'art__entertainment.humor': 12,
 'art__entertainment.movies__tv': 20,
 'art__entertainment.movies__tv.movies__series_trailers': 1,
 'art__entertainment.celebrity_fan__gossip': 23,
 'art__entertainment.celebrity_fan__gossip.bollywood': 21,
 'travel': 19,
 'science': 51,
 'science.biology': 4,
 'science.biology.zoology': 3,
 'sports': 2,
 'society': 65,
 'society.relationship': 7,
 'technology__computing': 46,
 'technology__computing.tech_news': 15,
 'food__drink': 14,
 'food__drink.food': 3,
 'food__drink.cooking': 1,
 'society.people': 8,
 'society.people.mens': 1,
 'society.culture': 5,
 'society.people.womens': 1,
 'misc': 78,
 'technology__computing.consumer_electronics': 15,
 'technology__computing.consumer_electronics.smartphones': 2,
 'technology__computing.consumer_electronics.smartphones.android': 2,
 'technology__computing.operating_systems': 3,
 'technology__computing.operating_systems.android': 2,
 'technology__computing.hardware': 2,
 'technology

In [19]:
def plot_map(counter: dict, figsize: tuple = (100, 100)):
    """Draw a horizontal bar chart of `counter`, labelling each bar with its value.

    Args:
        counter: mapping of label -> numeric count; keys become the bar labels
            and values the bar lengths.
        figsize: ``(width, height)`` in inches for the figure. Defaults to the
            previously hard-coded ``(100, 100)``; pass something smaller for a
            figure that renders quickly.
    """
    names = list(counter.keys())
    values = list(counter.values())
    # Explicit fig/ax interface instead of the pyplot state machine.
    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(names, values, height=0.8)
    # Bars are drawn at y = 0, 1, 2, ... in key order, so the enumerate index
    # is the y-coordinate of the matching bar.
    for index, value in enumerate(values):
        ax.text(value, index, str(value))
    plt.show()

In [20]:
# Print every row of `df` whose tag list contains exactly this tag.
# value[2] is the "Assumed Tags" column (third entry in create_frame's column
# list); because it holds a list, `in` is an exact-element match, not a
# substring match.
# NOTE(review): the recorded output includes a row carrying both
# 'automotive__vehicles.cars.luxury_cars' and 'automotive__vehicle.cars.luxury_cars';
# this looks like a spelling inconsistency in the stored tags; confirm upstream.
tag = "automotive__vehicle.cars.luxury_cars"
for value in df.values:
    if  tag in value[2]:
        print(value)

['wkfG9nK7n7HdZ4ALSghb_Q_luxury lifestyle mag' 'Luxury Lifestyle Mag'
 list(['automotive__vehicle.cars.luxury_cars']) 'article' False
 'https://www.luxurylifestylemag.co.uk/tag/luxury-cars/feed/']
['j2wVILgTZ6O5bXwmDWPMAQ_pinterest' 'Pinterest'
 list(['automotive__vehicle.cars.luxury_cars']) 'article' True
 'https://in.pinterest.com/noorhazin/luxury-cars.rss']
['j2wVILgTZ6O5bXwmDWPMAQ_pinterest' 'Pinterest'
 list(['automotive__vehicle.cars.luxury_cars', 'style__fashion.womens', 'style__fashion.fashion_designers'])
 'article' True 'https://in.pinterest.com/gmunglani/indian-fashion.']
['j2wVILgTZ6O5bXwmDWPMAQ_pinterest' 'Pinterest'
 list(['automotive__vehicle.cars.luxury_cars', 'style__fashion.jewelery', 'style__fashion.womens'])
 'article' True 'https://in.pinterest.com/gmunglani/jewellery.rss']
['j2wVILgTZ6O5bXwmDWPMAQ_pinterest' 'Pinterest'
 list(['automotive__vehicles.cars.luxury_cars', 'automotive__vehicle.cars.luxury_cars'])
 'image' True 'https://in.pinterest.com/VardaiDesign/supe