In [45]:
# Read the Groq API key from the environment instead of embedding it in the
# notebook, so the secret never ends up in version control or shared outputs.
# Set it before starting Jupyter, e.g. `export GROQ_API_KEY=...`.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY is not set; the Groq client will not be able to authenticate.")

# Cleaned-Up Code

In [46]:
# %%
# Install necessary packages (uncomment if needed in a notebook)
# %pip install -qqq groq==0.13.0
# %pip install fpdf -q

import os
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

import httpx
import pandas as pd
from fpdf import FPDF
from groq import Groq

# Tokens that are read as missing data when a dataset is loaded.
MISSING_VALUE_INDICATORS = [
    "?",
    "NA",
    "N/A",
    "NaN",
    "NULL",
    "",
    "Unknown",
    "unknown",
    "missing",
    "Missing",
]

# Name of the chat model requested from Groq.
LLM_MODEL_NAME = "llama3-70b-8192"

In [47]:
# Groq API client used for the chat-completion requests in this notebook.
# WARNING: TLS certificate verification is switched off (verify=False) for
# demonstration purposes only; this is not recommended in production.
_unverified_http_client = httpx.Client(verify=False)
groq_client = Groq(http_client=_unverified_http_client, api_key=GROQ_API_KEY)

In [48]:
# Formatting rules handed to the model as the system message, so that every
# generated section follows the same structure.
_SYSTEM_RULES = """
    You are an AI assistant helping generate structured and concise data science sections.
    Adhere to these rules:
    1. Use headings: # for main sections and ## or ### for subsections.
    2. Keep each section concise and to the point.
    3. Avoid repeating dataset overviews or the word 'report' in each section.
    4. Do not add lines of ========= or unnecessary lines before or after headings.
    """

# Chat message in the {"role": ..., "content": ...} form expected by the API.
SYSTEM_PROMPT = dict(role="system", content=_SYSTEM_RULES)

In [49]:
def load_dataset_from_file(
    file_path: str,
    na_values: Optional[list] = None,
) -> Optional[pd.DataFrame]:
    """
    Load a dataset from a CSV, Excel, or JSON file into a DataFrame.

    The format is chosen from the file extension, compared case-insensitively
    (so ``DATA.CSV`` is accepted). Tokens listed in ``na_values`` are treated
    as missing values.

    :param file_path: Path to a ``.csv``, ``.xlsx``/``.xls`` or ``.json`` file.
    :param na_values: Strings to treat as missing. When omitted, the
        module-level ``MISSING_VALUE_INDICATORS`` list is used.
    :return: The loaded DataFrame, or ``None`` when the extension is not
        supported, the file cannot be read, or the resulting frame is empty.
        The error is printed rather than raised, so callers must check the
        result for ``None``.
    """
    try:
        missing_tokens = MISSING_VALUE_INDICATORS if na_values is None else list(na_values)
        # Lower-case the extension so ".CSV" / ".Json" etc. are recognised.
        extension = os.path.splitext(str(file_path))[1].lower()

        if extension == ".csv":
            df = pd.read_csv(file_path, na_values=missing_tokens)
        elif extension in (".xlsx", ".xls"):
            df = pd.read_excel(file_path, na_values=missing_tokens)
        elif extension == ".json":
            # read_json is not given the tokens, so map them to NA afterwards.
            df = pd.read_json(file_path).replace(missing_tokens, pd.NA)
        else:
            raise ValueError("Unsupported file format. Use CSV, Excel, or JSON.")

        if df.empty:
            raise ValueError("The dataset is empty.")

        return df
    except Exception as e:
        # Deliberately broad: any failure is reported and signalled via None,
        # which is the contract the calling cells rely on.
        print(f"Error loading dataset: {e}")
        return None


In [50]:
def generate_user_prompt_with_dataset(df: pd.DataFrame, goal: str) -> str:
    """
    Build the user prompt: a compact dataset overview followed by the task.

    The overview contains the first five rows, the ``describe()`` statistics,
    the per-column missing-value counts (only columns that have any) and the
    column dtypes.

    :param df: Dataset to summarise.
    :param goal: Task text appended under the ``Task:`` heading.
    :return: The assembled prompt with leading/trailing whitespace removed.
    """

    def _as_table(frame: pd.DataFrame) -> str:
        # to_markdown relies on the optional `tabulate` package; fall back to
        # the plain-text rendering when it is not installed.
        try:
            return frame.to_markdown()
        except ImportError:
            return frame.to_string()

    dataset_head = _as_table(df.head())
    dataset_describe = _as_table(df.describe())

    missing_vals = df.isnull().sum()
    columns_with_missing = missing_vals[missing_vals != 0]
    # An empty Series renders as "Series([], )" rather than an empty string,
    # so emptiness has to be checked on the Series, not on its text.
    if columns_with_missing.empty:
        missing_values_summary = "No missing values"
    else:
        missing_values_summary = columns_with_missing.to_string()

    column_types = df.dtypes.to_string()

    prompt = f"""
    Dataset Head (first 5 rows):
    {dataset_head}

    Dataset Statistics (describe):
    {dataset_describe}

    Missing Values:
    {missing_values_summary}

    Column Data Types:
    {column_types}

    Task:
    {goal}
    """
    return prompt.strip()

In [51]:
def preprocess_data(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Data Preprocessing & Cleaning" section.

    Per-column missing-value percentages, the duplicate-row count and the
    per-column unique-value counts are computed here and embedded in the
    request text.

    :param df: Dataset to describe.
    :return: Message content of the first completion choice.
    """
    row_total = df.shape[0]
    null_counts = df.isnull().sum()
    # Percentage of missing entries per column, rendered like "12.5%".
    null_share = (null_counts / row_total * 100).round(2).astype(str) + "%"
    null_share_text = null_share.to_string()
    duplicate_rows = df.duplicated().sum()
    distinct_text = df.nunique().to_string()

    # Request text, including the section heading the model should use.
    user_goal = f"""
    # **Data Preprocessing & Cleaning**
    1. Show the first five rows of the dataset with all columns in a table-like format (no extra introduction text).
    2. Summarize missing values and duplicates (Missing values: {null_share_text}, Duplicates: {duplicate_rows}).
    3. Summarize unique values per column: {distinct_text}.
    4. Explain any cleaning steps that might be needed and how they affect the dataset.
    """

    chat_messages = [
        SYSTEM_PROMPT,
        {"role": "user", "content": generate_user_prompt_with_dataset(df, user_goal)},
    ]
    completion = groq_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=chat_messages,
    )
    return completion.choices[0].message.content


In [52]:
def create_statistical_summary(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Statistical Analysis" section.

    :param df: Dataset whose overview is attached to the request.
    :return: Message content of the first completion choice.
    """
    user_goal = """
    # **Statistical Analysis**
    1. Show the numeric summary table (df.describe()) in a markdown/table format.
    2. Interpret the mean, std, min, max, etc.
    3. Suggest next steps based on distributions.
    """
    user_message = {
        "role": "user",
        "content": generate_user_prompt_with_dataset(df, user_goal),
    }
    completion = groq_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[SYSTEM_PROMPT, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [53]:
def eda(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Exploratory Data Analysis" section.

    :param df: Dataset whose overview is attached to the request.
    :return: Message content of the first completion choice.
    """
    task_text = "# **Exploratory Data Analysis**\nSuggest EDA techniques (visualization ideas, correlation checks, outlier detection)."
    conversation = [
        SYSTEM_PROMPT,
        dict(role="user", content=generate_user_prompt_with_dataset(df, task_text)),
    ]
    reply = groq_client.chat.completions.create(model=LLM_MODEL_NAME, messages=conversation)
    return reply.choices[0].message.content

In [54]:
def ml_suggestions(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Machine Learning Suggestions" section.

    :param df: Dataset whose overview is attached to the request.
    :return: Message content of the first completion choice.
    """
    task_text = "# **Machine Learning Suggestions**\nDiscuss supervised and unsupervised methods relevant to this dataset."
    prompt_body = generate_user_prompt_with_dataset(df, task_text)
    request_kwargs = {
        "model": LLM_MODEL_NAME,
        "messages": [SYSTEM_PROMPT, {"role": "user", "content": prompt_body}],
    }
    completion = groq_client.chat.completions.create(**request_kwargs)
    return completion.choices[0].message.content

In [55]:
def feature_egr(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Feature Engineering" section.

    :param df: Dataset whose overview is attached to the request.
    :return: Message content of the first completion choice.
    """
    task_text = "# **Feature Engineering**\nDiscuss feature creation, transformation, and selection approaches."
    user_turn = {"role": "user", "content": generate_user_prompt_with_dataset(df, task_text)}
    answer = groq_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[SYSTEM_PROMPT, user_turn],
    )
    return answer.choices[0].message.content

In [56]:
def model_deployment(df: pd.DataFrame) -> str:
    """
    Ask the model to write the "Model Deployment & Data Drift" section.

    :param df: Dataset whose overview is attached to the request.
    :return: Message content of the first completion choice.
    """
    task_text = "# **Model Deployment & Data Drift**\nPropose strategies for deploying models and handling data drift."
    dialogue = [SYSTEM_PROMPT]
    dialogue.append({"role": "user", "content": generate_user_prompt_with_dataset(df, task_text)})
    result = groq_client.chat.completions.create(messages=dialogue, model=LLM_MODEL_NAME)
    top_choice = result.choices[0]
    return top_choice.message.content

In [57]:
def conclusion(df: pd.DataFrame, combined: str) -> str:
    """
    Ask the model to write the closing "Conclusion" section.

    :param df: Dataset whose overview is attached to the request.
    :param combined: Text of the previously generated sections; it is appended
        to the request under "Sections combined:".
    :return: Message content of the first completion choice.
    """
    user_goal = """
    # **Conclusion**
    Provide a concluding section referring to the overall insights. Keep it succinct.
    """
    # The instructions come first, followed by the earlier sections.
    task_with_context = f"{user_goal}\nSections combined:\n{combined}"
    chat_messages = [
        SYSTEM_PROMPT,
        {"role": "user", "content": generate_user_prompt_with_dataset(df, task_with_context)},
    ]
    completion = groq_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=chat_messages,
    )
    return completion.choices[0].message.content

In [58]:
def generate_report(df: pd.DataFrame, file_name: str = "automated_report.md") -> str:
    """
    Build the full multi-section document for a dataset and write it to disk.

    The six body sections are requested through a thread pool and joined in
    the listed order; the conclusion is generated afterwards from the joined
    text and appended.

    :param df: Dataset to write about.
    :param file_name: Destination path of the output file (UTF-8 text).
    :return: Absolute path of the written file.
    """
    section_builders = (
        preprocess_data,
        create_statistical_summary,
        eda,
        ml_suggestions,
        feature_egr,
        model_deployment,
    )

    def _build(section_fn):
        # Every builder takes the same DataFrame and returns its section text.
        return section_fn(df)

    # map() yields results in the order of `section_builders`.
    with ThreadPoolExecutor() as pool:
        sections = list(pool.map(_build, section_builders))

    body = "\n\n".join(sections)
    closing = conclusion(df, body)
    document = body + "\n\n" + closing

    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.write(document)

    return os.path.abspath(file_name)

In [66]:
# Generate the report for the Automobile dataset.
file_path = "../../Data Files/Automobile/Automobile_data.csv"
df = load_dataset_from_file(file_path)
if df is not None:
    # Name the report after the dataset's parent folder ("Automobile") rather
    # than a fixed index into the split path, which breaks as soon as the
    # directory depth of file_path changes.
    dataset_name = os.path.basename(os.path.dirname(file_path))
    report_path = generate_report(df, f"{dataset_name}_2_Clean_Up.md")
    print(f"Report generated at: {report_path}")

Report generated at: /Users/ghizlanerehioui/Downloads/Capstone/Code_files/Ghizlane/Automobile_2_Clean_Up.md


In [37]:
# Generate the report for the Customer Experience (car reviews) dataset.
file_path = "../../Data Files/Customer Experience/Car_Reviews_Database.csv"
df = load_dataset_from_file(file_path)
if df is not None:
    # Name the report after the dataset's parent folder ("Customer Experience")
    # rather than a fixed index into the split path, which breaks as soon as
    # the directory depth of file_path changes.
    dataset_name = os.path.basename(os.path.dirname(file_path))
    report_path = generate_report(df, f"{dataset_name}_2_Clean_Up.md")
    print(f"Report generated at: {report_path}")

Report generated at: /Users/ghizlanerehioui/Downloads/Capstone/Code_files/Ghizlane/Customer Experience_2_Clean_Up.md


# Change md to pdf

In [63]:
def _markdown_line_to_pdf_text(line: str):
    """Map one non-blank Markdown line to a (text, font_style, font_size) triple."""
    # Only the leading marker is removed; any later "# ", "- " or "* " inside
    # the line is part of the text and must be kept.
    for prefix, size in (("# ", 16), ("## ", 14), ("### ", 13)):
        if line.startswith(prefix):
            return line[len(prefix):].strip(), "B", size
    if line.startswith(("- ", "* ")):
        return "• " + line[2:].strip(), "", 12
    return line, "", 12


# Function that will generate a PDF report from the markdown file
def generate_pdf_report_from_markdown(md_file: str, pdf_file: str, ttf_font_path: str) -> None:
    """
    Reads a Markdown file and generates a PDF using FPDF with an embedded TrueType font.
    This allows printing Unicode characters like the bullet (•).

    Only a small Markdown subset is recognised, and only at the very start of
    a line: ``#``/``##``/``###`` headings (sizes 16/14/13, style "B") and
    ``-``/``*`` bullets (rendered with a leading "•"). Blank lines are skipped
    and every other line is written unchanged at size 12.

    :param md_file: Path to the input Markdown file.
    :param pdf_file: Path where the output PDF will be saved.
    :param ttf_font_path: Path to a TrueType font file with broad Unicode support.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Register/embed the TTF font. "DejaVu" is just a label; ttf_font_path
    # should point to the TTF file, e.g. "fonts/DejaVuSansCondensed.ttf".
    pdf.add_font("DejaVu", fname=ttf_font_path, uni=True)
    # Headings select style "B". FPDF only accepts family/style pairs that were
    # registered, so the same file is registered for the bold style as well;
    # without this, the first heading fails with an undefined-font error.
    pdf.add_font("DejaVu", style="B", fname=ttf_font_path, uni=True)

    with open(md_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue

            text, style, size = _markdown_line_to_pdf_text(line)
            pdf.set_font("DejaVu", style=style, size=size)
            # multi_cell wraps the text; ln(1) adds a little spacing after it.
            pdf.multi_cell(0, 7, text)
            pdf.ln(1)

    pdf.output(pdf_file)
    print(f"PDF successfully created at: {pdf_file}")

In [None]:
# Convert the most recently generated Markdown report into a PDF with the same
# base name, written relative to the current working directory.
# The converter requires a TrueType font file as its third argument; adjust
# this path to a Unicode-capable font available on your machine.
TTF_FONT_PATH = "fonts/DejaVuSansCondensed.ttf"
pdf_name = os.path.splitext(os.path.basename(report_path))[0] + ".pdf"
generate_pdf_report_from_markdown(report_path, pdf_name, TTF_FONT_PATH)

# Additions

In [None]:
# TODO: Replicate this code using pydantic-based prompting.
# TODO: Track token usage and speed - so far, generating a report takes at least 3-5 and up to 6-10 seconds.