In [4]:
import pandas as pd
import praw
import json

# Subreddits to harvest, grouped by the platform they discuss. Each key becomes
# the prefix of one output file: '<key>_reddit.json'.
reddit_post_mapping = {
    'CharacterAI': ['CharacterAI', 'CharacterAi_NSFW'],
    'HuggingChat': ['HuggingChat'],
    'OpenAI': ['GPTStore', 'GPTStoreFR', 'ChatGPTStore'],
    'Poe': ['Poe_AI', 'PoeAI_NSFW'],
}

# Load the PRAW keyword arguments and release the file handle immediately; the
# scrape below can run for a long time and does not need the file any more.
with open('.reddit.json') as f:
    config = json.load(f)
reddit = praw.Reddit(**config)

for forum, subreddits in reddit_post_mapping.items():
    # Collect plain dicts for every subreddit of this forum and build the
    # DataFrame once, instead of re-concatenating a frame per subreddit.
    posts = []
    for subreddit in subreddits:
        # limit=None is passed through to PRAW's `hot` listing
        # (presumably "no explicit cap" -- confirm against the PRAW docs).
        for submission in reddit.subreddit(subreddit).hot(limit=None):
            posts.append({
                'Title': submission.title,
                'Question': submission.selftext,
                'subreddit': submission.subreddit.display_name,
                'Link': "https://www.reddit.com" + submission.permalink,
                # NOTE(review): PRAW documents `created_utc`; confirm that
                # `created` is the timestamp intended here.
                'Creation Time': submission.created,
                'Reply Count': submission.num_comments,
                'Upvote Ratio': submission.upvote_ratio
            })
    df = pd.DataFrame(posts)
    df.to_json(f'{forum}_reddit.json', orient='records', indent=4)

In [2]:
from selenium.webdriver.common.by import By
from selenium import webdriver
import pandas as pd
import numpy as np
import requests

# Scrape community.openai.com: find every tag whose name contains "store",
# collect the topic links listed under those tags, then download each topic as
# JSON and save one record per topic to 'OpenAI_official_website.json'.
tag_store_links = []
post_store_links = set()

driver = webdriver.Chrome()
try:
    driver.implicitly_wait(5)

    url = 'https://community.openai.com/tags'
    driver.get(url)

    for tag in driver.find_elements(By.XPATH, '//div[@class="tag-box"]/a'):
        # get_attribute returns None when the attribute is absent; treat that
        # as an empty name instead of crashing on `.lower()`.
        tag_name = tag.get_attribute('data-tag-name') or ''
        if 'store' in tag_name.lower():
            tag_store_links.append(tag.get_attribute('href'))

    for tag_link in tag_store_links:
        driver.get(tag_link)
        for post in driver.find_elements(By.XPATH, '//tbody[@class="topic-list-body"]/tr'):
            post_link = post.find_element(By.XPATH, './/a[@role="heading"]').get_attribute('href')
            # A set de-duplicates topics that appear under several tags.
            post_store_links.add(post_link)
finally:
    # The browser is only needed for link discovery; always release it, even
    # when a page fails to load or an element is missing.
    driver.quit()

post_records = []
for post_link in post_store_links:
    # The topic URL with '.json' appended is fetched and parsed as JSON.
    # A timeout keeps one stalled request from hanging the whole cell.
    response = requests.get(post_link + '.json', timeout=30)
    response.raise_for_status()
    json_data = response.json()

    stream_posts = json_data['post_stream']['posts']
    # First reply flagged as the accepted answer, or NaN when there is none.
    # `.get` tolerates replies that carry no 'accepted_answer' field.
    accepted_answer = next(
        (reply['cooked'] for reply in stream_posts[1:] if reply.get('accepted_answer')),
        np.nan,
    )

    post_records.append({
        'Title': json_data['title'],
        'Creation Time': json_data['created_at'],
        'View Count': json_data['views'],
        'Reply Count': json_data['reply_count'],
        'Like Count': json_data['like_count'],
        'Link': post_link,
        'Tags': json_data['tags'],
        # The first entry of the post stream is used as the question body.
        'Question': stream_posts[0]['cooked'],
        'Accepted Answer': accepted_answer,
    })

# Build the frame once from the collected records rather than growing it
# with one concat per topic.
posts = pd.DataFrame(post_records)
posts.to_json('OpenAI_official_website.json', orient='records', indent=4)


In [9]:
from huggingface_hub import HfApi
import pandas as pd

# Record id, like count and creation date of every Space returned by
# `list_spaces()` that is not flagged private, then dump the table to
# 'HuggingFaceSpaces.json'.
public_spaces = (entry for entry in HfApi().list_spaces() if not entry.private)
space_info_list = [
    {
        'id': entry.id,
        '#like': entry.likes,
        'creation date': entry.created_at,
    }
    for entry in public_spaces
]
spaces_frame = pd.DataFrame(space_info_list)
spaces_frame.to_json('HuggingFaceSpaces.json', orient='records', indent=4)

In [7]:
from huggingface_hub import HfApi
import pandas as pd

# Keep only the Spaces from 'HuggingFaceSpaces.json' that have at least one
# like, whose runtime stage is RUNNING or SLEEPING, and that have at least one
# discussion. The result is written to 'HuggingFaceSpaces_filtered.json'.
api = HfApi()
df = pd.read_json('HuggingFaceSpaces.json')

kept_rows = []
for index, row in df.iterrows():
    # Checkpoint every 100 rows so a crash does not lose the work done so far.
    # (The previous `if index % 100:` was inverted and rewrote the file on
    # every row that was NOT a multiple of 100.)
    if index % 100 == 0:
        pd.DataFrame(kept_rows).to_json('HuggingFaceSpaces_filtered.json', orient='records', indent=4)
    space_id = row['id']
    if not row['#like']:
        continue
    try:
        if api.get_space_runtime(space_id).stage not in ['RUNNING', 'SLEEPING']:
            continue
        # Only the existence of a discussion matters, so pull a single item
        # from the iterator instead of materialising every discussion.
        discussions = api.get_repo_discussions(repo_id=space_id, repo_type="space")
        if next(iter(discussions), None) is None:
            continue
    except Exception as e:
        # Best effort: report the failure for this Space and move on.
        print(e)
        continue
    kept_rows.append(row)
    print(index, space_id)

# Build the final frame once from the collected rows.
df_filter = pd.DataFrame(kept_rows).reset_index(drop=True)
df_filter.to_json('HuggingFaceSpaces_filtered.json', orient='records', indent=4)

5 52Hz/HWMNet_lowlight_enhancement
79 Bagus/speaker-verification-demo
129 Cropinky/gpt2-rap-songs
151 Detomo/Japanese_OCR
152 Detomo/Lighten_dark_image
 (Request ID: Root=1-66450cd3-130a4dcd5ef07d6b7fab8075;6440cb93-409e-4196-bd1a-1412642e5e66)

403 Forbidden: Discussions are disabled for this repo.
Cannot access content at: https://huggingface.co/api/spaces/Flux9665/IMS-Toucan/discussions?p=0.
If you are trying to create or update content,make sure you have a token with the `write` role.
215 Harveenchadha/hindi-speech-recognition-vakyansh-wav2vec2
218 Hellisotherpeople/HF-BERTopic
220 Hellisotherpeople/HF-SHAP
223 Hellisotherpeople/Unsupervised_Extractive_Summarization
228 Huertas97/Inpaint_Me
245 Jacobo/syntax
340 NeuML/articlesummary
415 Sa-m/manifesto-explainer
469 ThePixOne/open_domain_qa
478 ThomasSimonini/SnowballFight
627 akhaliq/AnimeGANv2
629 akhaliq/ArcaneGAN
645 akhaliq/GFPGAN
646 akhaliq/GPEN
661 akhaliq/Real-ESRGAN
809 aubmindlab/Arabic-NLP
849 bluebalam/paper-rec
893 chi

In [37]:
from huggingface_hub import get_repo_discussions, get_discussion_details
import pandas as pd

# For every Space URL listed in 'valid_repos_HF.txt', download its discussions
# and store title, creation time, first comment body and (if any) closing time
# in 'HuggingFace_discussions.jsonl'.
df = pd.DataFrame()

# Read the list up front so the file is not held open during the crawl.
with open('valid_repos_HF.txt') as f:
    repos = f.read().splitlines()

space_url_prefix = 'https://huggingface.co/spaces/'
records = []
saved_count = 0
for repo in repos:
    # Skip blank or malformed lines instead of failing with IndexError.
    if space_url_prefix not in repo:
        continue
    repo = repo.split(space_url_prefix)[1]
    try:
        discussions = get_repo_discussions(repo_id=repo, repo_type="space", discussion_type="discussion")
        for discussion in discussions:
            info = {
                'Title': discussion.title,
                'Created At': discussion.created_at
            }
            discussion_details = get_discussion_details(repo_id=repo, repo_type="space", discussion_num=discussion.num)
            for index, event in enumerate(discussion_details.events):
                # The body is taken from the very first event, when that
                # event is a comment.
                if not index and event.type == 'comment':
                    info['Body'] = event.content
                # Stop at the first status change to 'closed'.
                if event.type == 'status-change' and event.new_status == 'closed':
                    info['Closed At'] = event.created_at
                    break
            records.append(info)
    except Exception:
        # Best effort: report the Space that failed and continue. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt the crawl.
        print(repo)
    # Persist once per Space that contributed new rows, rather than rewriting
    # the whole file after every single discussion.
    if len(records) > saved_count:
        df = pd.DataFrame(records)
        df.to_json('HuggingFace_discussions.jsonl', orient='records', lines=True)
        saved_count = len(records)

Bagus/speaker-verification-demo
Harveenchadha/hindi-speech-recognition-vakyansh-wav2vec2
aubmindlab/Arabic-NLP
docs-demos/t5-base
echolee/faceanime4u
flax-community/dalle-mini
gorkemgoknar/moviechatbot
kaushalya/medclip-roco
merve/GPT-2-story-gen
nielsr/vilt-vqa
obi/Medical-Note-Deidentification
prithivida/Gramformer
pyannote/pretrained-pipelines
ronvolutional/ai-pokemon-card
sunwaee/MT5-Questions-Answers-Generation-Extraction
un-index/textgen6b
valhalla/glide-text2im
SebastianEnger/AI-TextGenerator
microsoft/document-image-transformer
sarulab-speech/UTMOS-demo
huggingface/hf-speech-bench
smajumdar/nemo_conformer_rnnt_large
abidlabs/streaming-asr
abidlabs/streaming-asr-paused
givkashi/SwinIR-Super-resolution
awacke1/Image-to-Multilingual-OCR
multimodalart/latentdiffusion
dalle-mini/dalle-mini
yangheng/PyABSA-APC
aware-ai/german-asr
CVMX-jaca-tonos/Identificar-lenguas-y-frases
nickmuchi/semantic-search-with-retrieve-and-rerank
radames/edit-video-by-editing-text
bigscience/bloom-book
Gra

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-665818c5-4bfc676d7842f935151d4a43;d14c743b-b0c7-48d5-9145-81c62344d673)

Repository Not Found for url: https://huggingface.co/api/spaces/Vishwas1/BloomDemo2/discussions?type=discussion&p=0.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [38]:
from huggingface_hub import get_repo_discussions, get_discussion_details
import pandas as pd

# Resume the discussion crawl: skip every Space in 'valid_repos_HF.txt' that
# comes before 'Vishwas1/BloomDemo2', then process that Space and everything
# after it, appending to the rows already in 'HuggingFace_discussions.jsonl'.
temp_lock = True  # True while still skipping Spaces before the resume point

df = pd.read_json('HuggingFace_discussions.jsonl', lines=True)

# Read the list up front so the file is not held open during the crawl.
with open('valid_repos_HF.txt') as f:
    repos = f.read().splitlines()

space_url_prefix = 'https://huggingface.co/spaces/'
for repo in repos:
    # Skip blank or malformed lines instead of failing with IndexError.
    if space_url_prefix not in repo:
        continue
    repo = repo.split(space_url_prefix)[1]
    if temp_lock:
        if repo != 'Vishwas1/BloomDemo2':
            continue
        temp_lock = False

    new_records = []
    try:
        discussions = get_repo_discussions(repo_id=repo, repo_type="space", discussion_type="discussion")
        for discussion in discussions:
            info = {
                'Title': discussion.title,
                'Created At': discussion.created_at
            }
            discussion_details = get_discussion_details(repo_id=repo, repo_type="space", discussion_num=discussion.num)
            for index, event in enumerate(discussion_details.events):
                # The body is taken from the very first event, when that
                # event is a comment.
                if not index and event.type == 'comment':
                    info['Body'] = event.content
                # Stop at the first status change to 'closed'.
                if event.type == 'status-change' and event.new_status == 'closed':
                    info['Closed At'] = event.created_at
                    break
            new_records.append(info)
    except Exception:
        # Best effort: report the Space that failed and continue. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt the crawl.
        print(repo)
    # Append and persist once per Space that produced rows, rather than
    # rewriting the whole file after every single discussion.
    if new_records:
        df = pd.concat([df, pd.DataFrame(new_records)], ignore_index=True)
        df.to_json('HuggingFace_discussions.jsonl', orient='records', lines=True)

Vishwas1/BloomDemo2
Manjushri/Manju-Dream-Booth-A10G
DEEMOSTECH/3D-Avatar-Generator
openbmb/viscpm-chat
Writer/instruct-palmyra-20b
Ami001/Merkava
FuseAI/FuseChat-7B
KingNish/WizardLM-2-7B
heisenberg3376/cascaded-speech-to-speech-translation


In [47]:
import tiktoken

# Report the mean number of cl100k_base tokens in title + body across all
# rows of 'HuggingFace_discussions.jsonl'.
tokenizer = tiktoken.get_encoding('cl100k_base')

df = pd.read_json('HuggingFace_discussions.jsonl', lines=True)

# A discussion stored without a body (or title) comes back as NaN; treat it as
# empty text so the string concatenation below cannot raise TypeError.
texts = df['Title'].fillna('').astype(str) + df['Body'].fillna('').astype(str)

# disallowed_special=() is passed so special-token text inside a discussion
# is encoded rather than rejected.
total_tokens = sum(
    len(tokenizer.encode(text, disallowed_special=()))
    for text in texts
)

if len(df):
    print(f'Averge number of tokens: {total_tokens / len(df)}')
else:
    # Avoid ZeroDivisionError when the file holds no discussions.
    print('No discussions found; average number of tokens is undefined.')

Averge number of tokens: 204.06081320812734
