In [1]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq groq==0.13.0 --progress-bar off
!pip install -qqq python-dotenv==1.0.1 --progress-bar off

In [2]:
!pip install fpdf -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone


In [3]:
import os
from groq import Groq
import pandas as pd

In [4]:
# Never embed the API key in the notebook: notebooks are shared and committed,
# and any key that has ever been stored in this cell must be treated as leaked
# and rotated in the Groq console.
# Provide the key through the environment instead, e.g. `export GROQ_API_KEY=...`
# before starting Jupyter.
Groq_API_key = os.environ.get("GROQ_API_KEY")
if not Groq_API_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment before running this cell."
    )

client = Groq(api_key=Groq_API_key)


In [5]:
# Placeholder tokens that are converted to missing values when a dataset is loaded.
MISSING_VALUE_INDICATORS = [
    "?", "NA", "N/A", "NaN", "NULL", "", "Unknown", "unknown", "missing", "Missing"
]


def load_dataset(file_path):
    """Load a CSV, Excel (.xlsx) or JSON file into a pandas DataFrame.

    Every token listed in MISSING_VALUE_INDICATORS is converted to a missing
    value. The file type is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the dataset, as a string or a path-like object.

    Returns:
        The loaded DataFrame, or None when the format is unsupported, the
        dataset is empty, or reading fails. The reason is printed in that case.
    """
    try:
        # Compare against a lower-cased string so that "data.CSV" and
        # pathlib.Path arguments are accepted as well.
        lowered_name = str(file_path).lower()

        if lowered_name.endswith('.csv'):
            df = pd.read_csv(file_path, na_values=MISSING_VALUE_INDICATORS)
        elif lowered_name.endswith('.xlsx'):
            df = pd.read_excel(file_path, na_values=MISSING_VALUE_INDICATORS)
        elif lowered_name.endswith('.json'):
            # No na_values is passed to read_json, so the placeholders are
            # swapped for pd.NA after loading.
            df = pd.read_json(file_path).replace(MISSING_VALUE_INDICATORS, pd.NA)
        else:
            raise ValueError("Unsupported file format. Please upload a CSV, Excel, or JSON file.")

        if df.empty:
            raise ValueError("The dataset is empty.")

        return df
    except Exception as e:
        # Best-effort contract: callers test the result against None instead of
        # handling exceptions, so report the problem and return None.
        print(f"Error loading dataset: {e}")
        return None


In [6]:
def generate_prompt_with_dataset(df):
    """Build the LLM prompt describing `df` and the workflow questions to answer.

    The prompt embeds four plain-text views of the DataFrame (first rows,
    `describe()` summary, per-column missing-value counts and column dtypes)
    followed by a fixed list of questions covering preprocessing, EDA, model
    selection, optimization and deployment.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        The complete prompt as a single string.
    """
    # Plain-text snapshots of the dataset that are interpolated into the prompt.
    dataset_info = df.describe().to_string()
    dataset_head = df.head().to_string()
    missing_values = df.isnull().sum().to_string()
    column_types = df.dtypes.to_string()

    prompt = f"""
  Given the following dataset, describe the entire data science workflow that can be applied to it. Provide data-specific suggestions with detailed reasoning for each step. Cover the following aspects in detail:

  Dataset Overview:
  {dataset_head}

  Dataset Summary:
  {dataset_info}

  Missing Values:
  {missing_values}

  Column Data Types:
  {column_types}

  1. Data Preprocessing & Cleaning
      - Identify specific columns with missing values and suggest appropriate handling techniques for each. For example:
          - For numeric columns (e.g., "normalized-losses"), suggest techniques like mean/median imputation or dropping rows, and explain why the chosen technique is suitable.
          - For categorical columns (e.g., "num-of-doors"), suggest techniques like mode imputation or creating a "missing" category, and explain why the chosen technique is suitable.
      - Identify and handle duplicate records, if any.
      - Suggest techniques for handling inconsistent data (e.g., standardizing units, correcting typos).

  2. Exploratory Data Analysis (EDA)
      - For each column, suggest specific EDA techniques and visualizations. For example:
          - For numeric columns (e.g., "horsepower"), suggest histograms, box plots, or scatter plots, and explain why these are suitable.
          - For categorical columns (e.g., "fuel-type"), suggest bar charts, count plots, or pie charts, and explain why these are suitable.
      - Identify relationships between columns and suggest techniques like correlation matrices or pair plots.
      - Suggest dimensionality reduction techniques (e.g., PCA) if applicable, and explain why.

  3. Machine Learning Algorithm Selection
      - Based on the dataset's structure and target variable (if any), suggest specific machine learning algorithms. there could be multiple target variables and multiple ways to use Machine learning on this dataset. For example:
          - If the task is classification (e.g., predicting "fuel-type"), suggest algorithms like logistic regression, decision trees, or random forests, and explain why they are suitable.
          - If the task is regression (e.g., predicting "price"), suggest algorithms like linear regression, decision trees, or gradient boosting, and explain why they are suitable.
      - Compare supervised vs. unsupervised approaches and suggest which is more appropriate for this dataset.
      - Discuss model evaluation metrics (e.g., accuracy, precision, recall, RMSE, F1-score) and explain which metrics are most relevant for this dataset.

  4. Model Optimization & Feature Engineering
      - Suggest specific feature engineering techniques for this dataset. For example:
          - For numeric columns, suggest applicable techniques for example, normalization, standardization, or log transformation, and explain why.
          - For categorical columns, suggest techniques like one-hot encoding, label encoding, or target encoding, and explain why.
      - Suggest hyperparameter tuning techniques (e.g., GridSearchCV, RandomizedSearchCV) and explain how they can improve model performance.
      - Discuss the benefits of ensemble learning (e.g., bagging, boosting, stacking) for this dataset.

  5. Deployment & Real-World Considerations
      - Suggest how models trained on this dataset can be deployed in real-world applications (e.g., APIs, cloud services, edge devices).
      - Discuss strategies for handling data drift, model monitoring, and retraining to maintain model performance over time.

  Provide detailed reasoning for each suggestion, explaining why it is suitable for this specific dataset. Use examples from the dataset (e.g., column names, data types) to make your recommendations as specific as possible.
  """
    return prompt


In [7]:
from fpdf import FPDF
import re

class PDF(FPDF):
    """FPDF subclass that draws a report title header and a page-number footer."""

    def header(self):
        """Set the title in the header of the PDF"""
        self.set_font("Arial", "B", 14)
        # A width of 0 stretches the cell to the right margin, so align="C"
        # centers the title between the page margins. A fixed width of 200
        # ignores the right margin and leaves the title off-center.
        self.cell(0, 10, "Data Science Workflow Report", ln=True, align="C")
        self.ln(10)

    def footer(self):
        """Page footer with page number"""
        # A negative y is measured from the bottom of the page.
        self.set_y(-15)
        self.set_font("Arial", "I", 10)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")

def clean_text(text):
    """Make text safe for FPDF's Latin-1 core fonts and strip Markdown bold markers.

    Common typographic characters are mapped to ASCII equivalents, `**bold**`
    markers are removed (the wrapped text is kept), and any character that
    still cannot be encoded as Latin-1 is replaced by "?" so that writing the
    PDF does not fail with a UnicodeEncodeError.

    Args:
        text: Text to clean, typically the raw LLM response.

    Returns:
        The cleaned text, guaranteed to be encodable as Latin-1.
    """
    replacements = {
        "\u2022": "-",    # Bullet points
        "\u2013": "-",    # En dash
        "\u2014": "-",    # Em dash
        "\u201c": '"',    # Left quote
        "\u201d": '"',    # Right quote
        "\u2018": "'",    # Left single quote
        "\u2019": "'",    # Right single quote
        "\u2026": "...",  # Horizontal ellipsis
    }
    for key, value in replacements.items():
        text = text.replace(key, value)

    # Drop the ** markers but keep the emphasized text itself.
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)

    # Anything outside Latin-1 that was not mapped above (arrows, math symbols,
    # emoji, ...) becomes "?" instead of breaking PDF generation later on.
    return text.encode("latin-1", "replace").decode("latin-1")

def markdown_to_pdf(text, output_file):
    """Write `text` to `output_file` as a PDF, one input line at a time.

    The text is first passed through clean_text(). Each line is then stripped
    and rendered according to its shape:
      - empty line: a small vertical gap;
      - a line made only of letters/whitespace that starts with a capital
        letter: treated as a section heading and printed in bold;
      - a line starting with "* " or "- ": printed as an indented bullet;
      - anything else: printed as a wrapped paragraph.

    Args:
        text: The text to render.
        output_file: Path of the PDF file to write.
    """
    heading_pattern = re.compile(r"^[A-Z][a-zA-Z\s]+$")
    bullet_pattern = re.compile(r"^(\*|-)\s+(.*)")

    document = PDF()
    document.set_auto_page_break(auto=True, margin=15)
    document.add_page()
    document.set_font("Arial", size=12)

    for raw_line in clean_text(text).split("\n"):
        content = raw_line.strip()

        if content == "":
            # Blank input line -> vertical spacing only
            document.ln(5)
        elif heading_pattern.match(content):
            # Section heading: bold, then restore the regular font
            document.set_font("Arial", "B", 12)
            document.cell(0, 10, content, ln=True)
            document.set_font("Arial", size=12)
            document.ln(2)
        else:
            bullet = bullet_pattern.match(content)
            if bullet is None:
                # Ordinary paragraph text
                document.multi_cell(0, 8, content)
            else:
                document.cell(5)  # Indentation for bullets
                document.set_font("Arial", "B", 12)  # Bold for bullet label
                document.cell(5, 10, "-", ln=False)  # Use ASCII-compatible bullet
                document.set_font("Arial", size=12)  # Normal text for content
                document.multi_cell(0, 8, f" {bullet.group(2)}")

    # Write the finished document to disk
    document.output(output_file, "F")
    print(f"PDF saved as {output_file}")




In [9]:
from fpdf import FPDF

def main(file_path="Automobile_data.csv",
         output_file="formatted_llm_output.pdf",
         model="llama3-70b-8192"):
    """Load a dataset, ask the LLM for a workflow report and save it as a PDF.

    Args:
        file_path: Dataset to analyze (CSV, Excel or JSON).
        output_file: Path of the PDF report to write.
        model: Name of the Groq chat model used to generate the report.

    Returns:
        None. Nothing is written when the dataset cannot be loaded or the
        model returns no content.
    """
    df = load_dataset(file_path)
    if df is None:
        # load_dataset prints the reason for the failure before returning None.
        return

    # Generating the prompt
    prompt = generate_prompt_with_dataset(df)

    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )

    llm_output = chat_completion.choices[0].message.content
    if not llm_output:
        # Missing or empty content would fail inside the text clean-up step.
        print("The model returned an empty response; no PDF was written.")
        return

    # saving the response to pdf
    markdown_to_pdf(llm_output, output_file)


# main function
if __name__ == "__main__":
    main()

PDF saved as formatted_llm_output.pdf
