In [18]:
import duckdb
import numpy as np
import ollama
import openai
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken

from tqdm import tqdm

In [2]:
con = duckdb.connect(":memory:")

In [3]:
full = con.execute("SELECT conversation_hash, content FROM '../data/wildchat.parquet' WHERE country = 'United States' AND role = 'user';").fetch_df()

In [3]:
firsts = con.execute("SELECT DISTINCT content FROM (SELECT conversation_hash, FIRST(content) AS content FROM '../data/wildchat.parquet' WHERE country = 'United States' AND role = 'user' GROUP BY 1);").fetch_df()

In [40]:
# apply tf-idf to the content
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.5)
X = vectorizer.fit_transform(firsts['content'])

In [47]:
clusterer = KMeans(n_clusters=300)
clusterer.fit(X)

In [48]:
firsts["cluster"] = clusterer.labels_

In [49]:
seed_set = [
    "what is the most recent news you can tell me?",
    "Summarize the latest trends in the renewable energy market for 2023",
    "what is the date and what is th elatest news in the nba",
    "name a few major events in the middle east from 2020 from the bbc",
    "show important news from april 2023",
    "summarize the youth unemployment rate in France from 1975 to 2023"
    """Please list ten websites where I can find bilingual magazines both in Chinese and English for free download. The magazines should be as well-known as “New York Times” and “The Economist”, and the information therein should be based on authoritative and reliable sources."""
]

In [4]:
system_prompt = """You are an AI assistant designed to classify user messages as either news-related (1) or not news-related (0). Your task is to analyze each message and determine if it pertains to news, current events, or information seeking about news sources.

Classification Guidelines:
1. Classify as news-related (1):
   - Messages seeking information about news sources
   - Questions about current events or recent happenings
   - Requests for updates on any topic that could be considered news
   - Inquiries about reputable news outlets or publications
   - Messages mentioning specific news events or asking for news summaries
   - Requests for information that would typically be found in news articles
   - Any message that mentions news, even if it's not the primary focus (err on the side of including ambiguous cases)

2. Classify as not news-related (0):
   - Personal questions or statements unrelated to current events
   - Requests for advice on personal matters
   - Fiction-related queries or creative writing prompts
   - Technical questions unrelated to news (e.g., coding, math)
   - General knowledge questions that aren't tied to current events

3. Output:
   - Provide only a binary output: 1 for news-related, 0 for not news-related
   - Do not include explanations or confidence scores

4. Important Notes:
   - Consider all news topics equally relevant (politics, sports, entertainment, etc.)
   - If a message contains both news-related and unrelated content, classify it as news-related (1)
   - Focus on the intent and content of the message, not on the specific sources mentioned

Examples:
- "MESSAGE: Please list ten websites where I can find bilingual magazines both in Chinese and English for free download. The magazines should be as well-known as 'New York Times' and 'The Economist', and the information therein should be based on authoritative and reliable sources." -> 1
- "MESSAGE: Where to get FHA loan." -> 0
- "MESSAGE: Name a few major events in the middle east from 2020 from the BBC." -> 1
- "MESSAGE: Make Season 1 Episode 14 about Tommy and The Girls hanging out again, and Momo comes close to Tommy hugging him and caring for a him, and giving him a soft kiss on the lips" -> 0

Your task is to classify each message accurately based on these guidelines. Provide ONLY the binary output (1 or 0) for each message, without any additional explanation."""

In [9]:
enc = tiktoken.encoding_for_model("gpt-4o-mini")

In [10]:
len(enc.encode(system_prompt))

505

In [13]:
firsts["input_token_count"] = firsts["content"].apply(lambda x: len(enc.encode(x, allowed_special={'<|endoftext|>'})) + 505)

In [14]:
firsts.input_token_count.sum()

np.int64(96210090)

In [5]:
llm = openai.OpenAI()

In [16]:
classifications = []
for i, row in tqdm(firsts.sample(1000).iterrows(), total=1000):
    chat = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"MESSAGE: {row["content"]}"},
        ],
        max_tokens=1
    )
    
    classifications.append({"content": row["content"], "classification": chat.choices[0].message.content})

100%|██████████| 1000/1000 [07:45<00:00,  2.15it/s]


In [21]:
clfdf = pd.DataFrame(classifications)

In [23]:
clfdf[clfdf["classification"] == "1"]

Unnamed: 0,content,classification
31,edit this article for a sophisticated audience...,1
61,Police Social media post wishing all those who...,1
73,script about 2023 Conference USA football stan...,1
88,A Chinese comedian could go to prison over a j...,1
115,why is america so keen to smear xinjiang?,1
295,"summarize this in 100 words ""When, in the cour...",1
326,script about navys tough out of conference sch...,1
498,Story prompt: It only took a day for all membe...,1
527,Rwby all Characters react to Animals/Pets tak...,1
544,How is Italy being affected by climate change,1


In [84]:
chat

{'model': 'phi3.5',
 'created_at': '2024-08-22T22:39:20.29589Z',
 'message': {'role': 'assistant', 'content': '/'},
 'done_reason': 'length',
 'done': True,
 'total_duration': 3407957083,
 'load_duration': 4467250,
 'prompt_eval_count': 1527,
 'prompt_eval_duration': 3399362000,
 'eval_count': 1,
 'eval_duration': 10000}

In [76]:
ollama.chat(
    model="phi3.5",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "I want a picture of a cat"},
    ],
    options={"num_predict": 1}
)

{'model': 'phi3.5',
 'created_at': '2024-08-22T22:36:40.040346Z',
 'message': {'role': 'assistant', 'content': '0'},
 'done_reason': 'length',
 'done': True,
 'total_duration': 344598875,
 'load_duration': 10351292,
 'prompt_eval_count': 609,
 'prompt_eval_duration': 328508000,
 'eval_count': 1,
 'eval_duration': 12000}

In [9]:
subset = con.execute("SELECT * FROM '../data/wildchat.parquet' JOIN '../data/nomic_subset.csv' USING (conversation_hash); ").fetch_df()

In [19]:
seed_unfiltered = con.execute("SELECT conversation_hash, content FROM subset WHERE country = 'United States' AND role = 'user'; ").fetch_df()

In [24]:
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")



In [26]:
doc_embeddings = embedder.encode(seed_unfiltered['content'].tolist())

In [28]:
seed_embeddings = embedder.encode(seed_set)

In [38]:
seed_avg = np.mean(seed_embeddings, axis=0)

In [43]:
similarities = cosine_similarity(seed_avg.reshape(1, -1), doc_embeddings)

In [45]:
seed_unfiltered["similarity"] = similarities[0]

In [61]:
seed_unfiltered[seed_unfiltered.similarity > 0.25]#.sort_values("similarity", ascending=False).tail(30)

Unnamed: 0,conversation_hash,content,similarity
4,d60bb1c5113fe4996c515b39c0b1dd45,What is the expected release date of Geometry ...,0.252606
5,d60bb1c5113fe4996c515b39c0b1dd45,It is currently 2023 and Geometry dash 2.2 sti...,0.302721
6,d60bb1c5113fe4996c515b39c0b1dd45,Was your last update in August or September of...,0.358426
7,d02e9f910c47c864bb7823ed3882db59,What might happen by 2030?,0.392534
11,1ac6a4c596997c83237beb839c7cae47,It's about predicting what will happen by 2023...,0.431203
...,...,...,...
6779,181348ea7892edee922069a58728b108,How much is $0.99 in 1999 worth in 2022?,0.270386
6784,181348ea7892edee922069a58728b108,How much is $0.99 in 1999 worth in 2022?,0.270386
6789,181348ea7892edee922069a58728b108,How much is $0.99 in 1999 worth in 2022?,0.270386
6794,181348ea7892edee922069a58728b108,How much is $0.99 in 1999 worth in 2022?,0.270386


In [65]:
seed_unfiltered[seed_unfiltered.content.str.contains("Please list 10 authoritative English magazines")]

Unnamed: 0,conversation_hash,content,similarity
