In [12]:
from k11.models.no_sql_models import SourceMap, Format
from k11.models import LinkStore

from k11.vault import connection_handler

In [3]:
import numpy as np 
import pandas as pd 
%matplotlib inline 
import matplotlib.pyplot as plt 

In [4]:
def tag_formatter(s: str) ->str:
    return s.replace("/", ".")
    

In [5]:
def assumed_tag_formatter(s) -> list:
    if isinstance(s, list):
        s = list(filter(lambda x: x is not None and len(x) > 0, s))
        s = " ".join(s)
    s = s.strip()
    return list(filter(lambda x: len(x) > 0, set(tag_formatter(s).split(" "))))

In [6]:
a = assumed_tag_formatter([" art__entertainment.humor", "art__entertainment.celebrity_fan__gossip.bollywood art__entertainment.movies"])
b = assumed_tag_formatter([None, "as"])

In [7]:
print(a,b)

['art__entertainment.movies', 'art__entertainment.humor', 'art__entertainment.celebrity_fan__gossip.bollywood'] ['as']


In [9]:
def insert_link(array: list, source_map: SourceMap, link_store: LinkStore):
    # print(assumed_tag_formatter([source_map.assumed_tags, link_store.assumed_tags]))
    tags = assumed_tag_formatter([source_map.assumed_tags, link_store.assumed_tags])
    array.append([source_map.source_id, source_map.source_name, 
    tags,
    link_store.content_type, link_store.is_multiple, link_store.link   
    ])
    return array

In [23]:
def create_frame():
    connection_handler.mount_mongo_engines()
    source_maps: SourceMap = SourceMap.objects(is_third_party= False)
    #[ source_id, source_name, assumed_tags, content_type, is_multiple, link, ]
    array = []
    for source_map in source_maps:
        for link_store in source_map.links:
            array = insert_link(array, source_map, link_store)
            # print(array)
    connection_handler.disconnect_mongo_engines()
    return pd.DataFrame(array, columns=['Source Id', 'Source Name', 'Assumed Tags', ' Content Type', 'Is Multiple', 'Link'])


In [24]:
df = create_frame()

In [25]:
df.head(10)

Unnamed: 0,Source Id,Source Name,Assumed Tags,Content Type,Is Multiple,Link
0,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,[art__entertainment.humor],article,False,https://www.scoopwhoop.com/category/humor/
1,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, art__entertainment....",article,False,https://www.scoopwhoop.com/category/entertainm...
2,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, travel]",article,False,https://www.scoopwhoop.com/category/travel/
3,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, science.biology.zoo...",article,False,https://www.scoopwhoop.com/category/animals/
4,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[sports, art__entertainment.humor]",article,False,https://www.scoopwhoop.com/category/sports/
5,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, society.relationship]",article,False,https://www.scoopwhoop.com/category/relationsh...
6,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, technology__computi...",article,False,https://www.scoopwhoop.com/category/tech/
7,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[art__entertainment.humor, food__drink.cooking...",article,False,https://www.scoopwhoop.com/category/food/
8,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[society.culture, art__entertainment.humor, so...",article,False,https://www.scoopwhoop.com/category/men/
9,WSTvVDLbD-5nbmqOxtkX3A_scoop_woop,Scoop Woop,"[society.people.womens, art__entertainment.hum...",article,False,https://www.scoopwhoop.com/category/men/


In [26]:
def insert_tag(counter: dict, tag:str, substr: bool = True):
    if not substr:
        if tag not in counter:
            counter[tag] = 0
        counter[tag] += 1
        return counter
    all_tags = tag.split(".")
    for index in range(len(all_tags)):
        t = ".".join(all_tags[:index+1])
        if t not in counter:
            counter[t] = 0
        counter[t] += 1
    return counter

In [27]:
def tags_count(substr=True):
    tag_couter = {}
    for tag_list in df["Assumed Tags"].values:
        for tag in tag_list:
            tag_couter = insert_tag(tag_couter, tag, substr=substr)
    return tag_couter


In [28]:
tag_counter = tags_count()

In [29]:
tag_counter_dirty = tags_count(substr=False)

In [30]:
sorted_counter = sorted(tag_counter_dirty, key= lambda x: tag_counter_dirty[x])
print("\n".join([f'{key} : {tag_counter_dirty[key]}' for key in sorted_counter if tag_counter_dirty[key] < 10]))

society.people.womens : 1
technology__computing.consumer_electronics.games__console : 1
technology__computing.hardware.computer : 1
technology__computing.consumer_electronics : 1
technology__computing.internet_technology : 1
technology__computing.operating_systems : 1
technology__computing.gaming : 1
technology__computing.hardware : 1
society.welfare.social_service : 1
art__entertainment.celebrity_fan__gossip : 1
style__fashion.clothing : 1
science.enviroment : 1
style__fashion.fashion_designers : 1
style__fashion.jewelery : 1
society.life.inspiraion : 1
society.life.happiness : 1
style__fashion.celebs : 1
science.physics : 1
business__industrial.agriculture__forestry : 1
science.biology.botany : 1
technology__computing.consumer_electronics.game_systems__consoles : 1
hobbies__interests.games.video__computer_games : 1
business__industrial.aerospace__defence.space_tehnology : 1
science.physics.atomic_physics : 1
automotive__vehicles : 1
business__industrial.business_news : 1
finance : 1


In [31]:
threshold = 6
filtered = [key for key,value in tag_counter_dirty.items() if value >= threshold]

In [32]:
filtered

['art__entertainment.humor',
 'art__entertainment.celebrity_fan__gossip.bollywood',
 'art__entertainment.movies__tv.movies__series_trailers',
 'travel',
 'society.relationship',
 'technology__computing.tech_news',
 'food__drink.cooking',
 'food__drink.food',
 'society.people.mens',
 'hobbies__interests.arts__crafts.photography',
 'technology__computing.consumer_electronics.camera__photo_equipments',
 'society.life',
 'art__entertainment.movies__tv',
 'technology__computing',
 'art__entertainment.movies__tv.reviews__interviews',
 'hobbies__interests.games',
 'misc.facts',
 'science.physics.space__astronomy',
 'science',
 'misc.weird',
 'hobbies__interests.paranormal',
 'misc.anomaly',
 'misc.mystery',
 'health__fitness',
 'science.social_science.history',
 'automotive__vehicle.cars.luxury_cars',
 'style__fashion.womens',
 'travel.adventure',
 'society.lifestyle',
 'health__fitness.disorders',
 'health__fitness.mental_health',
 'food__drink.healthy_eating',
 'society.people',
 'science.e

In [33]:
def plot_map(counter: dict):
    names = list(counter.keys())
    values = list(counter.values())
    fig = plt.figure(figsize=(100,100))
    plt.barh(names, values, height=0.8)
    for index, value in enumerate(values):
        plt.text(value, index, str(value))
    plt.show()

In [36]:
len(tag_counter_dirty)

101

In [1]:
import pickle

In [39]:
with open("tag_counter.bin", "wb") as file:
    pickle.dump({
        "tag_counter": tag_counter,
        "tag_counter_dirty": tag_counter_dirty
    }, file)

In [2]:
obj = pickle.load(open("tag_counter.bin", "rb"))

In [5]:
dirty = obj['tag_counter_dirty']

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame(dirty.keys(), columns=['categories'])

In [9]:
df.head()

Unnamed: 0,categories
0,art__entertainment.humor
1,art__entertainment.celebrity_fan__gossip.bolly...
2,art__entertainment.movies__tv.movies__series_t...
3,travel
4,science.biology.zoology


In [12]:
df.to_csv('categories.csv')