In [1]:
import random
import openai
from semantic_text_splitter import TextSplitter
import json
import os
from dotenv import load_dotenv
from openai import OpenAI

client = OpenAI()


def split_and_merge_text(text, min_size=2500, max_size=3000):
    """
    Splits the text using a TextSplitter and merges smaller chunks to meet size constraints.

    Args:
        text (str): The input text to be split.
        min_size (int): The minimum size for each chunk.
        max_size (int): The maximum size for each chunk.

    Returns:
        list: A list of merged chunks that meet the size constraints.
    """
    # Initialize the splitter with the chunk size range
    splitter = TextSplitter((min_size, max_size))

    # Split the text into initial chunks
    initial_chunks = splitter.chunks(text)

    # Merge chunks smaller than the minimum size
    merged_chunks = []
    current_chunk = ""
    for chunk in initial_chunks:
        if len(current_chunk) + len(chunk) <= max_size:
            current_chunk += chunk
        else:
            if current_chunk:
                merged_chunks.append(current_chunk)
            current_chunk = chunk

    if current_chunk:
        merged_chunks.append(current_chunk)

    # Ensure all chunks meet the minimum size requirement
    final_chunks = []
    current_chunk = ""
    for chunk in merged_chunks:
        if len(chunk) < min_size:
            current_chunk += chunk
        else:
            if current_chunk:
                final_chunks.append(current_chunk)
                current_chunk = ""
            final_chunks.append(chunk)

    if current_chunk:
        final_chunks.append(current_chunk)

    return final_chunks

def extract_title_and_author(text):
    """
    Extracts the title and author from the first few lines of a given text.

    Args:
        text (str): The input text containing title and author information.

    Returns:
        tuple: A tuple containing the title and author as strings.
    """
    title = None
    author = None

    # Split the text into lines and check the first few lines
    lines = text.splitlines()
    for line in lines[:5]:  # Look at the first few lines for these fields
        if line.startswith("Titre :"):
            title = line.split(":", 1)[1].strip()
        elif line.startswith("Auteur :"):
            author = line.split(":", 1)[1].strip()

    return title, author

# Function to retrieve a specific insight from a chunk
def retrieve_insight(chunk, author, title):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": (
                "You are an analytical and thought-provoking assistant specializing in extracting deep insights from social science texts. "
                "Your goal is to identify nuanced perspectives, uncover underlying themes, and present information in an engaging and intellectually stimulating manner. "
                "Focus on creating clear, concise, and structured breakdowns of ideas, avoiding mainstream entertainment references."
            )},
            {
                "role": "user",
                "content": (
                    f"""Le texte suivant est extrait de "{title}", écrit par {author}.
                    En mettant l'accent sur la perspective de {author}, identifie l'idée générale qu'il exprime selon son point de vue.
                    Ensuite, décompose cette idée en étapes successives, chacune correspondant à une phrase courte.
                    Chaque phrase doit décrire une étape claire et concise pour arriver à l'idée générale, tout en s'appuyant sur des détails du texte.
                    Introduit chaque étape par "1.", "2."...
                    Si une phrase est trop longue, divise-la en deux phrases courtes.
                    Voici le texte :\n\n {chunk}\n\n
                    Étapes :"""
                )
            }
        ],
        max_tokens=500,
        temperature=0.7
    )
    return completion.choices[0].message.content

def get_unique_filename(base_name):
    # Split the base name into name and extension
    name, ext = os.path.splitext(base_name)
    counter = 0
    new_name = base_name
    # Increment counter until a non-existing filename is found
    while os.path.exists(new_name):
        counter += 1
        new_name = f"{name}_{counter}{ext}"
    return new_name

def save_all_insights_to_json(output_file, insights):
    # Get a unique filename for the output file
    output_file = get_unique_filename(output_file)

    # Load existing data if the file exists
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {}

    # Update the existing data with the new insights
    data.update(insights)

    # Save the updated data back to the file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    # Update the pointer file with the path to the latest JSON file
    pointer_file = "config.json"
    with open(pointer_file, "w", encoding="utf-8") as f:
        json.dump({"latest_file": output_file}, f, ensure_ascii=False, indent=4)



# Load environment variables from .env file
load_dotenv()

# Get the API key
openai.api_key = os.getenv("OPENAI_API_KEY")


# Initialize the "all_chunks" variable to store data
all_chunks = []

# Path to the folder containing text files
folder_path = "Book"

# Loop through all .txt files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        # Read the text file
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Create the chunks for this file
        chunks = split_and_merge_text(text, min_size=1500, max_size=3000)

        # Get the title and author from the text
        title, author = extract_title_and_author(text)

        # Add chunks to all_chunks with their associated title and author
        for chunk in chunks:
            all_chunks.append({
                "title": title,
                "author": author,
                "chunk": chunk,
                "size": len(chunk),
            })


print("Lenght of all chunk : " + str(len(all_chunks)))


# Randomly select 20 chunks from "all_chunks"
random_chunks = random.sample(all_chunks, min(25, len(all_chunks)))

# Assuming all_chunks is already defined
titles_and_authors = [
    {"title": chunk["title"], "author": chunk["author"], "size": chunk["size"]}
    for chunk in random_chunks
]

# Display the unique titles, authors, and sizes
for item in titles_and_authors:
    print(f"Title: {item['title']}, Author: {item['author']}, Size: {item['size']} characters")

Lenght of all chunk : 2136
Title: La question anthropologique (Cours 1954-1955), Author: Michel Foucault, Size: 2308 characters
Title: Sainte Beuve, Biographie, Author: A. J. Pons, Size: 2360 characters
Title: Histoire du structuralisme, Tome 2, Author: François Dosse, Size: 2083 characters
Title: Franz Kafka, Author: Bernard Lahire, Size: 1516 characters
Title: La question anthropologique (Cours 1954-1955), Author: Michel Foucault, Size: 1613 characters
Title: Franz Kafka, Author: Bernard Lahire, Size: 2968 characters
Title: Histoire du structuralisme, Tome 1, Author: François Dosse, Size: 1955 characters
Title: La question anthropologique (Cours 1954-1955), Author: Michel Foucault, Size: 2083 characters
Title: Franz Kafka, Author: Bernard Lahire, Size: 2482 characters
Title: Franz Kafka, Author: Bernard Lahire, Size: 2365 characters
Title: La fabrique des sciences sociales, Author: Johann Michel, Size: 2872 characters
Title: Sainte Beuve, Biographie, Author: A. J. Pons, Size: 2044 ch

In [2]:
print(random_chunks[0])

{'title': 'La question anthropologique (Cours 1954-1955)', 'author': 'Michel Foucault', 'chunk': '3. Mais il faut tenir compte [, aussi,] du fait que les esprits ont dans le monde une pesanteur ontologique particulière (due au fait qu’ils peuvent « entrer en conversation ou du moins en société », Discours de métaphysique…, 35104) et que le monde le plus parfait ne peut être que le monde où ils peuvent acquérir le plus de perfection. Donc la subordination ontologique de la matière à l’esprit a pour suite la subordination de la perfection et du bonheur de la matière au bonheur et à la perfection de l’esprit. « La république générale des esprits est la plus noble partie de l’univers » (Leibniz, Correspondance avec Arnauld105).\n\n\n\nCe qui a pour conséquences que :\n\n– « De toutes les créatures qui nous environnent, il n’y a que l’esprit de l’homme qui soit susceptible d’un vrai bonheur » (Dialogue entre Polidore et Théophile106). Les autres (les âmes sans esprit) ne sont capables ni de

### Retrieve list of insights from chunks

In [32]:
# Process each chunk and collect all insights
all_insights = {}
for i, chunk in enumerate(random_chunks, start=1):
    # Retrieve an insight from the chunk
    insight = retrieve_insight(chunk['chunk'], chunk['author'], chunk['title'])

    # Parse the insight into individual points
    points = [line.strip() for line in insight.strip().split('\n') if line]

    # Construct the JSON structure for this entry
    all_insights[str(i)] = {
        "Title and author": f"{chunk['title']}, {chunk['author']}",
        "Content": points
    }

# Save all collected insights into a single JSON file
save_all_insights_to_json("Insights/insights.json", all_insights)

### Retrieve questions from chunks

In [None]:
# Function to retrieve a specific insight from a chunk
def retrieve_question(chunk, author, title):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": (
                "You are an analytical and thought-provoking assistant specializing in extracting deep insights from social science texts avoiding mainstream entertainment references. "
                "Your goal is to identify nuanced perspectives, uncover underlying themes, and present information in an engaging and intellectually stimulating manner. "
            )},
            {
                "role": "user",
                "content": (
                    f"""Le texte suivant est extrait de "{title}", écrit par {author}.
                    Résume en une phrase une des questions que pose ce texte.
                    Voici le texte :\n\n {chunk}\n\n
                    Étapes :"""
                )
            }
        ],
        max_tokens=500,
        temperature=0.7
    )
    return completion.choices[0].message.content

In [25]:
import os

# Initialize the "all_chunks" variable to store data
all_chunks = []

# Path to the folder containing text files
folder_path = "Book"

# Loop through all .txt files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        # Read the text file
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Create the chunks for this file
        chunks = split_and_merge_text(text, min_size=1500, max_size=3000)

        # Get the title and author from the text
        title, author = extract_title_and_author(text)

        # Add chunks to all_chunks with their associated title and author
        for chunk in chunks:
            all_chunks.append({
                title,
                author,
                chunk
            })

# Check the first few entries to confirm structure
all_chunks[:3]


[{'Bernard Lahire',
  'Franz Kafka, Chapitre 1',
  'Titre : Franz Kafka, Chapitre 1\nAuteur : Bernard Lahire\n\n\nChapitre 1\nL’enfermement dans le champLa seule manière de se défaire réellement de problèmes scientifiques, si l’on considère qu’une théorie sociologique est essentiellement un univers cohérent de problèmes-solutions articulés, c’est de les affronter, de les faire travailler, de les soumettre à examens, pour finalement les dépasser en découvrant leurs limites de validité et leur champ de pertinence. Partant d’une telle conception de la pratique scientifique, on ne peut qu’être d’accord avec l’analyse de Thomas S. Kuhn selon laquelle «\u2005seules les investigations fermement enracinées dans la tradition scientifique contemporaine ont une chance de briser cette tradition et de donner naissance à une nouvelle1\u2005». Et l’épistémologue ajoutait à la suite cette proposition d’une grande justesse : «\u2005Le savant productif doit être un traditionaliste qui aime à s’adonner à

In [12]:
# Extract title and author from the text
with open('Book/LOrdre matériel du savoir.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

title = None
author = None

# Extract title and author
for line in lines[:5]:  # Look at the first few lines for these fields
    if line.startswith("Titre :"):
        title = line.split(":", 1)[1].strip()
    elif line.startswith("Auteur :"):
        author = line.split(":", 1)[1].strip()
        
print(author)
print(title)


Françoise Waquet
L'ordre matériel du savoir
