In [None]:
import psycopg2, os
from dotenv import load_dotenv

# Load environment variables from the .env file so the DB_* settings read
# by connect_to_database() are available through os.getenv
load_dotenv()

def connect_to_database():
    """Open a PostgreSQL connection configured from environment variables.

    Reads DB_HOST, DB_PORT, DB_NAME, DB_USER and DB_PASSWORD with ``os.getenv``
    and connects using ``psycopg2.extras.DictCursor`` as the cursor factory, so
    rows fetched through this connection can be indexed by column name.

    Returns:
        The open psycopg2 connection, or ``None`` when psycopg2 reports an
        error while connecting (the error is printed).
    """
    # ``import psycopg2`` alone does not load the ``extras`` submodule, so
    # referring to ``psycopg2.extras.DictCursor`` raised AttributeError (which
    # the ``except psycopg2.Error`` below would not catch). Import it explicitly.
    from psycopg2.extras import DictCursor

    try:
        return psycopg2.connect(
            host=os.getenv("DB_HOST"),
            port=os.getenv("DB_PORT"),
            database=os.getenv("DB_NAME"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            cursor_factory=DictCursor,
        )
    except psycopg2.Error as e:
        print(f"Error connecting to database: {e}")
        return None

In [None]:
from datetime import datetime, timedelta

def fetch_articles(db_connection, hours=24):
    """Return the ``data`` field of every scraped article from the last `hours` hours.

    Args:
        db_connection: Open database connection. Rows are indexed by the key
            ``'data'``, so its cursors must yield mapping-style rows
            (e.g. a connection created with ``DictCursor``).
        hours (int): Size of the look-back window, counted back from
            ``datetime.now()``.
            NOTE(review): ``datetime.now()`` is naive local time - confirm it
            matches how ``publish_date`` is stored.

    Returns:
        list | None: One ``data`` value per matching row of ``scraped_articles``,
        or ``None`` if psycopg2 reported an error (the error is printed).
    """
    # Bind the name before the try block: if ``db_connection.cursor()`` itself
    # raises, the ``finally`` clause must not fail with UnboundLocalError.
    cursor = None
    try:
        cutoff = datetime.now() - timedelta(hours=hours)

        cursor = db_connection.cursor()
        # The cutoff is passed as a bound parameter rather than formatted into the SQL.
        cursor.execute(
            "SELECT * FROM scraped_articles WHERE publish_date >= %s",
            (cutoff,),
        )
        return [row['data'] for row in cursor.fetchall()]

    except psycopg2.Error as e:
        print(f"Error executing query: {e}")
        return None
    finally:
        if cursor is not None:
            cursor.close()

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` produces under the tiktoken encoding `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [None]:
from openai import OpenAI
from enum import Enum
from dotenv import load_dotenv
import os

# Read the OpenAI API key from the environment (populated from the .env file)
load_dotenv()
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")

# Module-level OpenAI client built with that key
client = OpenAI(api_key=OPEN_AI_API_KEY)

# Embedding models to compare; each member's ``.name`` is the model identifier string
Embedding = Enum(
    "Embedding",
    [
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large",
    ],
)

def get_embeddings(text: str, model: "Embedding | str"):
    """Request an embedding vector for `text` from the OpenAI embeddings endpoint.

    Args:
        text (str): Text to embed.
        model: Either a member of the ``Embedding`` enum or the model
            identifier string itself (e.g. ``"text-embedding-3-small"``).

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # The API expects the model identifier string. The enum members carry that
    # identifier in ``.name`` (their values are just auto-numbered), so unwrap
    # members here instead of forwarding the Enum object to the client.
    model_name = model.name if isinstance(model, Enum) else model

    res = client.embeddings.create(
        model=model_name,
        input=text,
        encoding_format="float"
    )
    return res.data[0].embedding

In [None]:
from sklearn.cluster import AgglomerativeClustering

def make_clusters(similarity_matrix, similarity_threshold, articles):
    """
    Groups articles into clusters based on a similarity matrix and a similarity threshold.

    The pairwise similarities are converted to distances (``1 - similarity``) and
    clustered with average-linkage agglomerative clustering on that precomputed
    distance matrix. Two groups keep merging while their average similarity stays
    at or above `similarity_threshold`.

    Args:
        similarity_matrix (ndarray): Square matrix of pairwise similarities between
            articles (e.g. cosine similarity); row/column i corresponds to articles[i].
        similarity_threshold (float): Minimum average similarity for articles to
            share a cluster.
        articles (list): List of articles

    Returns:
        list: List of clusters, where each cluster is a list of articles.
    """
    # Local import keeps this block self-contained; numpy is already a
    # dependency of scikit-learn.
    import numpy as np

    # AgglomerativeClustering needs at least two samples, so handle the
    # degenerate inputs without calling it.
    if len(articles) == 0:
        return []
    if len(articles) == 1:
        return [[articles[0]]]

    # Previously the similarity matrix was passed as if each row were a feature
    # vector (default euclidean metric) and the similarity threshold was used as
    # a distance cutoff. Convert to a real distance matrix instead.
    distance_matrix = 1.0 - np.asarray(similarity_matrix, dtype=float)
    # Floating-point noise can leave tiny negatives / a non-zero diagonal.
    np.clip(distance_matrix, 0.0, None, out=distance_matrix)
    np.fill_diagonal(distance_matrix, 0.0)

    # n_clusters=None lets the threshold decide how many clusters are produced.
    # NOTE: ``metric`` is the scikit-learn >= 1.2 spelling (older releases used ``affinity``).
    clustering_model = AgglomerativeClustering(
        n_clusters=None,
        metric="precomputed",
        distance_threshold=1.0 - similarity_threshold,
        linkage="average",
    )

    # One cluster label per article, in the same order as `articles`.
    labels = clustering_model.fit_predict(distance_matrix)

    # Bucket every article under its cluster label.
    clustered_articles = [[] for _ in range(len(set(labels)))]
    for article, label in zip(articles, labels):
        clustered_articles[int(label)].append(article)

    return clustered_articles

In [None]:
def summarize_cluster(cluster):
    """Summarize all articles of one cluster with a single chat-completion request.

    The articles are joined with spaces into one string, which is sent to the
    ``gpt-4`` model together with a fixed system prompt. The completion is capped
    at roughly 10% of the input's ``cl100k_base`` token count.

    Args:
        cluster (list[str]): Article texts belonging to the same cluster.

    Returns:
        The chat completion response object returned by the client; the summary
        text is at ``.choices[0].message.content``.
    """
    # Merge the cluster into one string so it is summarized in a single request.
    cluster_string = " ".join(cluster)
    num_tokens = num_tokens_from_string(cluster_string, "cl100k_base")
    # Inputs shorter than 10 tokens would otherwise yield max_tokens=0, which
    # the API is expected to reject; always allow at least one token.
    max_summary_tokens = max(1, int(num_tokens * 0.1))

    summary = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": '''   Please summarize the following Hebrew article,
                                                focusing on the main points, arguments, and conclusions.
                                                Highlight any significant data, quotes, or statistics mentioned,
                                                and note the context in which they are presented. 
                                                If the article discusses multiple perspectives or debates,
                                                please outline these distinctly. Additionally,
                                                if there are any implications or recommendations made by the author,
                                                include these in the summary.
                                                Finally, if the article references specific events, individuals, or sources, 
                                                please identify these and their relevance to the article's overall message.
                                                pay attention to correct syntax and grammar in Hebrew'''},
            {"role": "user", "content": f"Article: {cluster_string}"},
        ],
        max_tokens=max_summary_tokens,
    )
    return summary

In [None]:
####### Develop from here...
################################################################
################################################################

def insert_summarized_article(db_connection, title, article, category):
    """Insert one summarized article into the ``merged_articles`` table.

    The insert is committed on success. On a psycopg2 error the transaction is
    rolled back and the error is printed; nothing is raised to the caller.

    Args:
        db_connection: Open database connection.
        title: Value for the ``title`` column.
        article: Value for the ``article`` column.
        category: Value for the ``category`` column.

    Returns:
        None
    """
    # Bind the name before the try block: if ``db_connection.cursor()`` itself
    # raises, the ``finally`` clause must not fail with UnboundLocalError.
    cursor = None
    try:
        cursor = db_connection.cursor()

        # Values are passed as bound parameters rather than formatted into the SQL.
        insert_query = """
        INSERT INTO merged_articles (title, article, category) 
        VALUES (%s, %s, %s)
        """
        cursor.execute(insert_query, (title, article, category))

        db_connection.commit()
        print("Article inserted successfully into merged_articles table.")

    except psycopg2.Error as e:
        # Undo the failed transaction so the connection stays usable.
        db_connection.rollback()
        print(f"Error inserting article into database: {e}")
    finally:
        if cursor is not None:
            cursor.close()

In [None]:
# Connect to the database
db_connection = connect_to_database()

# Start from an empty list so `articles` is always defined for the cells
# below, even when the connection or the query fails.
articles = []

if db_connection:
    try:
        # Fetch the articles published within the default look-back window.
        # fetch_articles returns None on a query error, so fall back to [].
        articles = fetch_articles(db_connection) or []

        for article in articles:
            print(article)
    finally:
        # Close the connection even if fetching or printing raised
        db_connection.close()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# For every embedding model: embed all articles, then print the pairwise
# cosine-similarity matrix. `similarities` is rebound on each pass, so after
# the loop it holds the matrix of the last model in `Embedding`.
for model in Embedding:
    embeddings = [get_embeddings(text, model.name) for text in articles]
    similarities = cosine_similarity(embeddings)
    print(f"-----Model: {model.name}-----")
    print(similarities.round(3), end="\n\n")

In [None]:
# Cluster the articles using the similarity matrix left by the previous cell
# and a threshold of 0.6, then print each cluster's articles.
clusters = make_clusters(similarities, 0.6, articles)
for i, cluster in enumerate(clusters):
    print(f"-----Cluster {i}-----")
    for article in cluster:
        # One blank line after every article
        print(article, end="\n\n")
    # Extra blank line between clusters
    print("")

In [None]:
# Summarize every cluster (one chat-completion request per cluster), then
# print each summary split on "." with a blank line after every piece.
summaries = [summarize_cluster(cluster) for cluster in clusters]
for i, summary in enumerate(summaries):
    print(f"-----Cluster {i}-----")
    for line in summary.choices[0].message.content.split("."):
        print(line, end="\n\n")
    # Extra blank line between clusters
    print("")