In [2]:
from sklearn.cluster import KMeans
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import numpy as np

import datetime
import random


client = OpenAI()
tqdm.pandas()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [17]:
questions = [
    "What's your favorite game to stream and why?",
    "Any tips for new streamers just starting out?",
    "What setup do you use for streaming?",
    "How do you deal with trolls in chat?",
    "Can you explain your stream schedule?",
    "What was your most memorable streaming moment?",
    "How do you choose what games to play on stream?",
    "What's the best way to support your channel?",
    "How long have you been streaming?",
    "What do you enjoy most about streaming?",
    "Do you have any streaming rituals or routines?",
    "How do you stay motivated to stream regularly?",
    "What's the hardest part about being a streamer?",
    "Any advice on building a community on Twitch?",
    "What are your thoughts on the latest game update?",
    "How do you balance streaming with personal life?",
    "What's your favorite streaming moment with viewers?",
    "How did you come up with your streamer name?",
    "What games are you looking forward to streaming next?",
    "Do you collaborate with other streamers?"
]

# Example output
for question in questions:
    print(question)


What's your favorite game to stream and why?
Any tips for new streamers just starting out?
What setup do you use for streaming?
How do you deal with trolls in chat?
Can you explain your stream schedule?
What was your most memorable streaming moment?
How do you choose what games to play on stream?
What's the best way to support your channel?
How long have you been streaming?
What do you enjoy most about streaming?
Do you have any streaming rituals or routines?
How do you stay motivated to stream regularly?
What's the hardest part about being a streamer?
Any advice on building a community on Twitch?
What are your thoughts on the latest game update?
How do you balance streaming with personal life?
What's your favorite streaming moment with viewers?
How did you come up with your streamer name?
What games are you looking forward to streaming next?
Do you collaborate with other streamers?


# split questions and the rest

In [18]:
def get_questions(chat_samples):
    questions = [sentence for sentence in chat_samples if '?' in sentence]
    return questions

questions = get_questions(questions)

# embeddings

In [19]:
def embed_sentence(sentence, model):
    response = client.embeddings.create(
        input=sentence,
        model=model
    )
    return response.data[0].embedding

df = pd.DataFrame(questions, columns=["question"])
df["embedding"] = df["question"].progress_apply(lambda x: embed_sentence(x, "text-embedding-3-small"))
matrix = np.vstack(df.embedding.values)

100%|██████████| 20/20 [00:11<00:00,  1.70it/s]


# cluster

In [20]:
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
df["Cluster"] = labels



In [32]:
df['Cluster'].value_counts()

Cluster
1    11
2     7
0     2
Name: count, dtype: int64

In [31]:
rev_per_cluster = 2

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    reviews = "\n".join(
        df[df.Cluster == i].sample(rev_per_cluster, random_state=42).question.values
    )

    messages = [
        {"role": "user", "content": f'What does these questions from a stream chat does have in common? \n\n{reviews} \n \n create a summary of the questions. \n \n summary :'},
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0)
    print(response.choices[0].message.content.strip())

    # Assuming 'Score' and 'Summary' columns exist in your DataFrame, adjust accordingly
    sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
    
    for j in range(rev_per_cluster):
        print(sample_cluster_rows.iloc[j]['question'][:70])  # Adjust based on your actual column names

    print("-" * 100)


Cluster 0 Theme: The questions are both related to strategies for improving and supporting a Twitch channel.
Any advice on building a community on Twitch?
What's the best way to support your channel?
----------------------------------------------------------------------------------------------------
Cluster 1 Theme: The questions are both related to strategies for improving and supporting a Twitch channel.
How do you stay motivated to stream regularly?
Any tips for new streamers just starting out?
----------------------------------------------------------------------------------------------------
Cluster 2 Theme: The questions are both seeking advice on improving and supporting a Twitch channel.
What's your favorite game to stream and why?
What setup do you use for streaming?
----------------------------------------------------------------------------------------------------


# VIZ

In [16]:
import plotly.graph_objects as go
import numpy as np

from sklearn.manifold import TSNE
import numpy as np


# Initialize and fit t-SNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_embedded = tsne.fit_transform(matrix)

# 'X_embedded' is the transformed data matrix with reduced dimensions (n_samples, n_components)


tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# Splitting the transformed data
x = [x for x, y in vis_dims2]
y = [y for x, y in vis_dims2]

# Creating a Plotly figure
fig = go.Figure()

# Colors for each category/cluster
colors = ["purple", "green", "red", "blue"]

for category, color in enumerate(colors):
    # Filter points belonging to the current category/cluster
    xs = np.array(x)[df.Cluster == category]
    ys = np.array(y)[df.Cluster == category]
    
    # Add scatter plot for points in the cluster
    fig.add_trace(go.Scatter(x=xs, y=ys, mode='markers', name=f'Cluster {category}',
                             marker=dict(color=color, opacity=0.3, size=5)))
    
    # Calculate the average position for the cluster center
    avg_x = np.mean(xs)
    avg_y = np.mean(ys)
    
    # Add scatter plot for the cluster center
    fig.add_trace(go.Scatter(x=[avg_x], y=[avg_y], mode='markers', name=f'Center {category}',
                             marker=dict(color=color, size=10, symbol="x")))

# Updating the layout of the figure
fig.update_layout(title="Clusters identified visualized in 2D using t-SNE with Plotly",
                  xaxis_title="t-SNE Dimension 1",
                  yaxis_title="t-SNE Dimension 2",
                  legend_title="Clusters")

# Show the figure
fig.show()



Mean of empty slice.


invalid value encountered in divide



In [33]:
from sklearn.cluster import KMeans
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

# Load environment variables from .env file
load_dotenv()

# Enable tqdm pandas
tqdm.pandas()

OPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

def get_questions(chat_samples):
    questions = [sentence for sentence in chat_samples if '?' in sentence]
    return questions

def initialize_df(questions):
    df = pd.DataFrame(questions, columns=["question"])
    return df

def embed_sentence(sentence, model):
    response = client.embeddings.create(
        input=sentence,
        model=model
    )
    return response.data[0].embedding

def k_means_clustering(matrix, n_clusters = 5):
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    kmeans.fit(matrix)
    labels = kmeans.labels_
    return labels
    

def create_cluster_name(df, n_clusters, rev_per_cluster=5):
    cluster_themes = []



    for i in range(n_clusters):

        cluster_size = df[df.Cluster == i].shape[0]
        if cluster_size < rev_per_cluster:
            rev_per_cluster = cluster_size

        questions = "\n".join(
            df[df.Cluster == i].sample(rev_per_cluster, random_state=42).question.values
        )

        messages = [
            {"role": "user", "content": f'What do the following questions in a stream channel have in common?\n\nUser question:\n"""\n{questions}\n"""\n\nTheme:'},
        ]

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0)

        theme = response.choices[0].message.content.strip()
        questions_list = questions.split("\n")

        cluster_themes.append({
            "cluster": i,
            "theme": theme,
            "questions": questions_list
        })

    return json.dumps(cluster_themes, ensure_ascii=False, indent=2)

input = [
    "What's your favorite game to stream and why?",
    "Any tips for new streamers just starting out?",
    "What setup do you use for streaming?",
    "How do you deal with trolls in chat?",
    "Can you explain your stream schedule?",
    "What was your most memorable streaming moment?",
    "How do you choose what games to play on stream?",
    "What's the best way to support your channel?",
    "How long have you been streaming?",
    "What do you enjoy most about streaming?",
    "Do you have any streaming rituals or routines?",
    "How do you stay motivated to stream regularly?",
    "What's the hardest part about being a streamer?",
    "Any advice on building a community on Twitch?",
    "What are your thoughts on the latest game update?",
    "How do you balance streaming with personal life?",
    "What's your favorite streaming moment with viewers?",
    "How did you come up with your streamer name?",
    "What games are you looking forward to streaming next?",
    "Do you collaborate with other streamers?"
    ]

# get the last n_samples from the input
input = input[-100:]

# get the questions from the input
questions = get_questions(input)

# initialize the dataframe
df = initialize_df(questions)

# embed the questions
df["embedding"] = df["question"].progress_apply(lambda x: embed_sentence(x, "text-embedding-3-small"))



100%|██████████| 20/20 [00:38<00:00,  1.91s/it]


In [35]:
df

Unnamed: 0,question,embedding
0,What's your favorite game to stream and why?,"[0.024319134652614594, -0.05626320838928223, -..."
1,Any tips for new streamers just starting out?,"[0.03445218876004219, -0.010319043882191181, -..."
2,What setup do you use for streaming?,"[-0.0057601905427873135, -0.04952836036682129,..."
3,How do you deal with trolls in chat?,"[-0.020598677918314934, -0.03679736703634262, ..."
4,Can you explain your stream schedule?,"[0.022722115740180016, -0.011309947818517685, ..."
5,What was your most memorable streaming moment?,"[0.037777986377477646, -0.07231466472148895, 0..."
6,How do you choose what games to play on stream?,"[0.0034936275333166122, -0.03109895810484886, ..."
7,What's the best way to support your channel?,"[0.021202312782406807, -0.02188274636864662, -..."
8,How long have you been streaming?,"[0.02557915262877941, -0.05364077165722847, 0...."
9,What do you enjoy most about streaming?,"[0.04390798136591911, -0.036281563341617584, 0..."


In [36]:
# cluster the questions
matrix = np.vstack(df["embedding"].values)
labels = k_means_clustering(matrix, n_clusters)
df["Cluster"] = labels



In [39]:
df['Cluster'].value_counts()

Cluster
1    11
2     7
0     2
Name: count, dtype: int64

In [38]:
# create the cluster names and return the result
result = create_cluster_name(df, n_clusters, rev_per_cluster = 5)

print(result)

[
  {
    "cluster": 0,
    "theme": "Growing and Supporting a Twitch Channel",
    "questions": [
      "Any advice on building a community on Twitch?",
      "What's the best way to support your channel?"
    ]
  },
  {
    "cluster": 1,
    "theme": "Streaming Advice/Tips",
    "questions": [
      "How do you stay motivated to stream regularly?",
      "Any tips for new streamers just starting out?"
    ]
  },
  {
    "cluster": 2,
    "theme": "Streaming Preferences and Setup",
    "questions": [
      "What's your favorite game to stream and why?",
      "What setup do you use for streaming?"
    ]
  }
]


In [42]:
cluster_themes = []



for i in range(n_clusters):

    cluster_size = df[df.Cluster == i].shape[0]
    print(cluster_size)
    if cluster_size < rev_per_cluster:
        rev_per_cluster = cluster_size

    print(rev_per_cluster)

2
2
11
2
7
2
