<a href="https://colab.research.google.com/github/Maspie/Ranking-Clustering-Enhancing-Experience-of-Micro-Blogging-Sites/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install openai



In [22]:
!pip install langchain



# Loading the saved embeddings and the cleaned tweet dataset

In [23]:
import numpy as np
import pandas as pd
import requests
from sklearn.metrics.pairwise import cosine_similarity

# Function to download a file from Google Drive
def download_file_from_google_drive(url, local_dest, timeout=60):
    """Download the file behind ``url`` and write it to ``local_dest``.

    Args:
    url (str): Direct-download URL of the file.
    local_dest (str): Path of the local file to create or overwrite.
    timeout (float): Seconds to wait for the server to connect or to send
        data before giving up.

    Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeouts.
    """
    # Stream the body so a large embeddings file is never held in memory at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of silently saving an HTTP error page to disk.
        response.raise_for_status()
        with open(local_dest, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)

# Direct-download links for the stored embedding matrix and the tweet table
npy_url = 'https://drive.google.com/uc?id=1AeAVk_UX9drv3yt2ooGOq5ODed2YlQgZ&export=download'
csv_url = 'https://drive.google.com/uc?id=1yh9s5SJ5SgEdoVbA_6OjCOFwUs5AQySX&export=download'

# Fetch the embeddings to a local file first, then read them back with NumPy
npy_local_path = 'temp_file.npy'
download_file_from_google_drive(npy_url, npy_local_path)
loaded_embeddings = np.load(npy_local_path)

# Read the tweet table straight from its URL and attach the i-th embedding
# to the i-th row as a new 'embeddings' column
data = pd.read_csv(csv_url).assign(embeddings=list(loaded_embeddings))


# Recommendation and Ranking of tweets

In [24]:
def recommend_tweets(cluster_data, selected_tweet_embedding, selected_tweet_index, top_n=5):
    """Recommend the tweets closest to the selected tweet, excluding the selected tweet itself.

    Args:
    cluster_data (DataFrame): Candidate tweets with an 'embeddings' column
        holding one vector per row. The frame passed in is not modified.
    selected_tweet_embedding (np.array): Embedding of the selected tweet.
    selected_tweet_index: Index label of the selected tweet; rows carrying
        this label are left out of the result.
    top_n (int): Maximum number of recommendations to return (default 5).

    Returns:
    DataFrame: Up to ``top_n`` rows of ``cluster_data`` with an added
    'distance' column (Euclidean distance to the selected embedding),
    smallest distance first. Empty when there are no other candidates.
    """
    cluster_data = cluster_data.copy()
    # Calculate the distance for each tweet in the cluster
    distances = cluster_data['embeddings'].apply(
        lambda emb: np.linalg.norm(emb - selected_tweet_embedding)
    )
    # On an empty frame ``apply`` returns an object-dtype Series, which
    # ``nsmallest`` rejects with a TypeError; force a numeric dtype.
    cluster_data['distance'] = distances.astype(float)
    # Exclude the currently selected tweet from the recommendations
    filtered_data = cluster_data[cluster_data.index != selected_tweet_index]
    return filtered_data.nsmallest(top_n, 'distance')



# Custom Retriever

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def retrieve_similar_tweets(cluster_data, selected_tweet_embedding, threshold=0.8):
    """Retrieve the tweets whose embedding is similar enough to the selected tweet.

    Args:
    cluster_data (DataFrame): DataFrame containing tweets and their embeddings
        (one vector per row in the 'embeddings' column).
    selected_tweet_embedding (np.array): Embedding of the selected tweet.
    threshold (float): Minimum cosine similarity (inclusive) a tweet must
        reach to be kept. Defaults to 0.8.

    Returns:
    DataFrame: The rows of ``cluster_data`` whose cosine similarity to the
    selected embedding is at least ``threshold``; an empty frame when
    ``cluster_data`` itself is empty.
    """
    # cosine_similarity cannot be called without any candidate vectors, so an
    # empty cluster short-circuits to an empty result instead of raising.
    if len(cluster_data) == 0:
        return cluster_data.copy()

    # Calculate cosine similarity between the selected tweet embedding and all embeddings in the cluster
    similarities = cosine_similarity([selected_tweet_embedding], cluster_data['embeddings'].tolist())[0]

    # Filter tweets based on the similarity threshold
    return cluster_data[similarities >= threshold]


# Get main words from cluster/tweets

In [26]:
from wordcloud import WordCloud

def create_word_cloud_string(tweets):
    """Create a single string from the most common words in the tweets using a word cloud.

    Args:
    tweets (Series): Series (or any iterable) containing the text of the
        tweets. Entries that are not strings, such as NaN for a missing
        tweet, are skipped.

    Returns:
    str: The words kept by the word cloud, joined by single spaces. An empty
    string when there is no text to analyse.
    """
    # Keep only real strings: a missing value (float NaN) would make str.join raise.
    texts = [tweet for tweet in tweets if isinstance(tweet, str)]

    # Combine all tweets into one large text
    combined_text = ' '.join(texts)

    # WordCloud raises ValueError when it is given no words at all, so blank
    # input is answered directly.
    if not combined_text.strip():
        return ''

    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400).generate(combined_text)

    # Return a single string composed of the words the cloud extracted
    return ' '.join(wordcloud.words_.keys())


In [27]:
import os
from getpass import getpass

# Never store the key in the notebook: use the OPENAI_API_KEY environment
# variable when it is already set, otherwise ask for it interactively
# (getpass does not echo what is typed).
# NOTE: a key that was ever committed in a notebook should be treated as
# leaked and revoked with the provider.
open_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = open_api_key

# Summary Generation

In [28]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

def generate_summary(text):
    """Summarise ``text`` with an OpenAI chat model called through LangChain.

    Args:
    text (str): Text to summarize.

    Returns:
    str: The content of the model's reply.
    """
    # The system message sets the assistant's role; the human message carries the text.
    system_prompt = SystemMessage(
        content='You are an expert assistant with expertise in summarizing what different words say'
    )
    user_prompt = HumanMessage(
        content=f'Please provide a short and concise summary of the following text:\n TEXT: {text}'
    )

    chat_model = ChatOpenAI(model_name='gpt-3.5-turbo')
    reply = chat_model([system_prompt, user_prompt])
    return reply.content


# Iterative K-Means, Ranking, Recommendation, Retrieval-Augmented Generation

In [30]:
import textwrap
from sklearn.cluster import KMeans

def main_process(data, max_depth=3, n_clusters=10, random_state=42):
    """Cluster the tweets with K-Means, pick a random starting tweet and explore from it.

    Args:
    data (DataFrame): Tweets with 'text' and 'embeddings' columns. A
        'cluster' column holding the K-Means label is added to this frame
        in place.
    max_depth (int): Depth limit handed to ``process_tweet_selection``.
    n_clusters (int): Requested number of clusters (default 10). It is
        capped at the number of rows, because K-Means cannot form more
        clusters than there are samples.
    random_state (int): Seed for the K-Means initialisation (default 42).

    Raises:
    ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("data must contain at least one tweet")

    kmeans = KMeans(n_clusters=min(n_clusters, len(data)), random_state=random_state)
    data['cluster'] = kmeans.fit_predict(list(data['embeddings']))

    # The starting tweet comes from NumPy's global random state, so it differs
    # between runs unless the caller seeds it with np.random.seed beforehand.
    initial_index = np.random.choice(data.index)
    selected_tweet = data.loc[initial_index]
    process_tweet_selection(data, selected_tweet, 0, max_depth)

def print_summary(summary):
    """Print a 'Cluster Summary:' heading followed by the summary wrapped at 100 columns."""
    # Wrap first so nothing is printed if the text cannot be wrapped.
    body = textwrap.fill(summary, width=100)
    print("\nCluster Summary:", body, sep="\n")


def _prompt_for_choice(n_options):
    """Ask until the user enters a whole number in 1..n_options; return it as a 0-based position."""
    while True:
        raw = input(f"Select a tweet (1-{n_options}): ")
        try:
            choice = int(raw)
        except ValueError:
            choice = 0
        # Reject 0 and negatives explicitly: as a 0-based position they would
        # silently wrap around to the end of the recommendation list.
        if 1 <= choice <= n_options:
            return choice - 1
        print(f"Please enter a whole number between 1 and {n_options}.")


def process_tweet_selection(data, selected_tweet, depth, max_depth):
    """Summarise the tweets similar to ``selected_tweet``, list recommendations and follow the user's pick.

    Args:
    data (DataFrame): All tweets, with 'text', 'embeddings' and 'cluster' columns.
    selected_tweet (Series): The row of ``data`` currently being explored.
    depth (int): Number of selections made so far.
    max_depth (int): Stop asking for a further selection once ``depth`` reaches this value.

    The function prints the selected tweet, an LLM summary of the tweets
    similar to it and the recommended tweets. While the tweet's cluster holds
    more than 30 tweets and ``depth`` is below ``max_depth`` it asks the user
    to choose one recommendation and recurses on it.

    NOTE(review): the follow-up tweet is taken from the current tweet's own
    cluster, so the cluster-size check appears to give the same answer at
    every depth; confirm whether re-clustering was intended.
    """
    print("\nSelected Tweet:", selected_tweet['text'])
    cluster_data = data[data['cluster'] == selected_tweet['cluster']].copy()
    similar_tweets = retrieve_similar_tweets(cluster_data, selected_tweet['embeddings'])
    word_cloud_string = create_word_cloud_string(similar_tweets['text'])
    cluster_summary = generate_summary(word_cloud_string)
    print_summary(cluster_summary)

    recommended_tweets = recommend_tweets(similar_tweets, selected_tweet['embeddings'], selected_tweet.name)
    print("\nTop 5 Recommended Tweets:")
    for i, tweet in enumerate(recommended_tweets['text'], 1):
        print(f"{i}. {tweet}")

    if len(cluster_data) <= 30 or depth >= max_depth:
        print("\nProcess finished, fewer than 30 tweets left in the cluster or maximum depth reached.")
        return

    # Nothing to choose from: stop instead of prompting for an impossible pick.
    if len(recommended_tweets) == 0:
        print("\nProcess finished, no similar tweets left to recommend.")
        return

    selected_option = _prompt_for_choice(len(recommended_tweets))
    new_selected_tweet_index = recommended_tweets.iloc[selected_option].name
    new_selected_tweet = data.loc[new_selected_tweet_index]
    process_tweet_selection(data, new_selected_tweet, depth + 1, max_depth)




# Run the pipeline on the loaded tweets; it may prompt for a tweet choice on stdin.
main_process(data)





Selected Tweet: Musician Kalle Mattson Recreates 34 Classic Album Covers in Clever Music Video for Û÷AvalancheÛª http://t.co/VBSwhz4s2V

Cluster Summary:
The text mentions a music video by musician Kalle Mattson that recreates a classic album cover, a
NHL hockey shirt, a blog post about a little piece of music, and various music references including
artists like BTS, Erykah Badu, and Smashing Pumpkins. There are also mentions of a video game and a
podcast episode called "The Piano Entertainer Ep – Collide."

Top 5 Recommended Tweets:
1. Musician Kalle Mattson Recreates 34 Classic Album Covers in Clever Music Video for 'Avalanche' http://t.co/yDJpOpH1DW
2. I liked a @YouTube video http://t.co/TNXQuOr1wb Kalle Mattson - 'Avalanche' (Official Video)
3. What a feat! Watch the #BTS of @kallemattson's incredible music video for #Avalanche: https://t.co/3W6seA9tuv ????
4. Great one time deal on all Avalanche music and with purchase get a Neal Rigga shirt http://t.co/4VIRXkgMpC
5. Avalanche