In [154]:
# Install the groq package (pinned to 0.13.0); it provides the Groq client used to query the LLM
%pip install -qqq groq==0.13.0
# Install the fpdf package for creating the PDF report
# NOTE(review): fpdf is not version-pinned and is not imported in the cells shown here -
# confirm it is still needed and pin it for reproducibility.
%pip install fpdf -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [155]:
# Import Groq and Pandas packages
import os
import httpx
from groq import Groq
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

In [156]:
# Define all constants to be used

# The Groq API key is read from the environment so that the secret never lives
# in the notebook or in version control. Set it before starting Jupyter, e.g.
# `export GROQ_API_KEY=...`. The value is None when the variable is not set.
# NOTE(review): a real key was previously committed in this cell; it should be
# revoked and replaced.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Strings that are treated as missing values when a dataset is loaded
MISSING_VALUE_INDICATORS = [
    "?", "NA", "N/A", "NaN", "NULL", "", "Unknown", "unknown", "missing", "Missing"
]

# Define the model name to be used in chat completions
LLM_MODEL_NAME = "llama3-70b-8192"

# We can define a threshold for when to drop the values and when not
#   For example: 20% is too much of a loss, so we can suggest replacing with the mean
#   Anything between 0 and 5% is not too much of a loss and can be dropped.
# NOTE(review): neither threshold is referenced by the functions visible in this
# section of the notebook - confirm whether they are still needed.
PREPROCESSING_THRES_HIGH_MISS = 20  # Replace by mean if >= 20% missing values
PREPROCESSING_THRES_LOW_VAR = 5  # Drop if less or eq 5% missing values

In [157]:
# Create a groq client
# TLS certificate verification is left enabled: the client sends the API key and
# dataset contents over the network, so disabling verification would expose both
# to man-in-the-middle interception. If a corporate proxy with a private CA is in
# the way, pass an ssl.SSLContext that trusts that CA as `verify` instead of
# turning verification off.
grop_client = Groq(
    api_key=GROQ_API_KEY,
    http_client=httpx.Client(verify=True),
)

## Separate Functions

### Introduction
This section introduces all functions that should be distributed throughout our codebase for ease of use and readability.

### Functions
1. `load_dataset_from_file`: Load a dataset from a file.
2. `generate_prompt_with_dataset`: Generate a prompt with a dataset.
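3. `preprocess_data`: Ask the LLM to write the data preprocessing and cleaning section (the dataset itself is not modified).
4. `create_statistical_summary`: Create a statistical summary.
5. `eda`: Ask the LLM for exploratory data analysis suggestions.
6. `ml_suggestions`: Provide machine learning suggestions.
7. `feature_egr`: Ask the LLM for feature engineering suggestions.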
8. `model_deployment`: Analyze possible model deployments for data.
9. `conclusion`: Provide a conclusion.

10. `generate_report`: Generate a report by calling all the aforementioned functions.

In [158]:
def load_dataset_from_file(file_path: str, na_values=None) -> "pd.DataFrame | None":
    """
    Load a dataset from a file and return a Pandas DataFrame.

    The file type is chosen from the file extension, compared
    case-insensitively, so "data.CSV" is handled like "data.csv".

    Args:
    - file_path (str): The path to a CSV (.csv), Excel (.xlsx/.xls) or JSON (.json) file.
    - na_values (list, optional): Strings to treat as missing values.
      Defaults to MISSING_VALUE_INDICATORS.

    Returns:
    - pd.DataFrame: The loaded dataset, or None when loading fails (unsupported
      extension, unreadable file, empty dataset). The error is printed in that
      case, so callers must check for None before using the result.
    """
    if na_values is None:
        na_values = MISSING_VALUE_INDICATORS

    try:
        # Lower-case the extension so upper/mixed-case file names are accepted.
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.csv':
            df = pd.read_csv(file_path, na_values=na_values)
        elif extension in ('.xlsx', '.xls'):
            df = pd.read_excel(file_path, na_values=na_values)
        elif extension == '.json':
            # read_json has no na_values option, so the markers are replaced afterwards.
            df = pd.read_json(file_path).replace(list(na_values), pd.NA)
        else:
            raise ValueError(
                "Unsupported file format. Please upload a CSV, Excel, or JSON file.")

        if df.empty:
            raise ValueError("The dataset is empty.")

        return df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading dataset: {e}")
        return None

In [159]:
def generate_prompt_with_dataset(df: pd.DataFrame, goal: str) -> str:
    """
    Build the LLM prompt text: a description of the dataset followed by the task.

    The prompt contains the first rows (df.head()), the df.describe() summary,
    the per-column count of missing values, the column dtypes and the goal.

    Args:
    - df (pd.DataFrame): The dataset to be described.
    - goal (str): The task description placed in the "Your Task" section.

    Returns:
    - str: The formatted prompt.
    """

    # Extracting basic dataset info
    dataset_describe = df.describe().to_string()
    dataset_head = df.head().to_string()
    column_types = df.dtypes.to_string()

    # Keep only the columns that actually have missing values. An empty Series
    # would otherwise render as "Series([], )", so say it explicitly instead.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts != 0]
    if null_counts.empty:
        missing_values = "No missing values."
    else:
        missing_values = null_counts.to_string()

    # Format the prompt
    prompt = f"""
      Dataset Overview:
      ################
      {dataset_head}
      ################

      Dataset Statistics Summary:
      ################
      {dataset_describe}
      ################

      Missing Values:
      ################
      {missing_values}
      ################

      Column Data Types:
      ################
      {column_types}
      ################

      Your Task:
      ################
      {goal}
      ################

    """

    return prompt

In [160]:
# Shared system message holding the formatting/structure instructions for a report section.
# It is passed as the first chat message by the section functions that list it in `messages`.
system_prompt = {"role": "system", "content": """
            - Make this section of the report as structured and concise as possible.
            - Provide detailed reasoning for each suggestion, explaining why it is suitable for this specific dataset.
            - Refer to the dataset's structure and target variable in your suggestions.
            - Each main title should be formatted as a markdown headings appropriately.
            - Give this section an appropriate and descriptive title.
            - Do not summarize the dataset again unless it is about changing the dataformat to help run the EDA technique.
            - Assume that there are complementary sections that were created before this prompt to help the reader understand the context of 
            the dataset and there is no need to reiterate the same information.      
            - Have a consistent structure for the report: heading 1 first, then heading 2, then numbers 1,2,3, etc. then bullet points. 
                 """}

In [161]:
def preprocess_data(df: pd.DataFrame) -> str:
    """
    Ask the LLM to write the "Data Preprocessing & Cleaning" report section.

    The dataset itself is not modified. The function gathers the per-column
    unique-value counts, the percentage of missing values per column and the
    number of duplicate rows, and sends them to the model as part of the prompt.

    Args:
    - df (pd.DataFrame): The dataset to be described.

    Returns:
    - str: The model's response text for this section.
    """

    # Number of distinct values per column
    unique_values = df.nunique()

    # Share of missing values, only for the columns that have any
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    missing_values_pct = (missing_values / df.shape[0] * 100).round(2).astype(str) + "%"

    # Render as plain text; interpolating the Series directly would leak its
    # repr (including the "dtype: object" footer) into the prompt.
    if missing_values_pct.empty:
        missing_summary = "none"
    else:
        missing_summary = missing_values_pct.to_string()

    duplicate_count = df.duplicated().sum()

    data_preproc_clean = f"""
    - You must first display the head of the dataset in your response in table format including all the rows.
    - The dataset initially contains {df.shape[1]} columns and {df.shape[0]} rows.
    - {len(missing_values)} columns with missing values such that {missing_summary}.
    - {duplicate_count} duplicate rows found.
    - Unique values for each column: {unique_values.to_string()}
    - This section of the report is entitled Data Preprocessing & Cleaning.
    - Compare the dataset before and after cleaning.
    - Mention how the dataset was affected after cleaning.
    - Make this section of the report as structured and concise as possible.
    - Provide detailed reasoning for each suggestion, explaining why it is suitable for this specific dataset.
    - Each main title should be formatted as a markdown headings appropriately.
    - Have a consistent structure for the report: heading 1 first, then heading 2, then numbers 1,2,3, etc. then bullet points. 
"""

    prompt = generate_prompt_with_dataset(df, goal=data_preproc_clean)

    # Only a user message is sent here: the formatting rules are already part
    # of the goal text above, so the shared system prompt is not added.
    chat_completion = grop_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=LLM_MODEL_NAME
    )

    return chat_completion.choices[0].message.content

In [162]:
def create_statistical_summary(df: pd.DataFrame) -> str:
    """
    Ask the LLM to write the "Statistical Analysis" report section.

    The df.describe() table is rendered to markdown once and embedded in the
    task description, which is then sent together with the shared system prompt.

    Args:
    - df (pd.DataFrame): The dataset to be analyzed.

    Returns:
    - str: The model's response text for this section.
    """

    # Render the statistics table once; it appears twice in the goal below.
    describe_md = df.describe().to_markdown()

    goal = f"""
        - You must first display all the rows {describe_md} in table format.
        - Summarize the following findings under the Statistical Analysis section:
        - Make this section of the report as structured and concise as possible.
        - Provide detailed reasoning for each suggestion, explaining why it is suitable for this specific dataset.
        - Each main title should be formatted as a markdown headings appropriately.
        - Interpret the statistical analysis results and suggest next steps and say something about the variables distributions to help decide on what to do next using the {describe_md}
        - Have a consistent structure for the report: heading 1 first, then heading 2, then numbers 1,2,3, etc. then bullet points. 
        """

    prompt = generate_prompt_with_dataset(df, goal=goal)

    chat_completion = grop_client.chat.completions.create(
        messages=[system_prompt, {"role": "user", "content": prompt}],
        model=LLM_MODEL_NAME
    )

    return chat_completion.choices[0].message.content

In [163]:
def eda(df: pd.DataFrame) -> str:
    """
    Request the Exploratory Data Analysis (EDA) section of the report from the LLM.

    Args:
    - df (pd.DataFrame): The dataset to be analyzed.

    Returns:
    - str: The model's response text for the EDA section.
    """
    # Dataset description plus the EDA task, sent as the user turn.
    user_message = {
        "role": "user",
        "content": generate_prompt_with_dataset(df, goal="generate Exploratory Data Analysis"),
    }

    response = grop_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[system_prompt, user_message],
    )

    return response.choices[0].message.content

In [164]:
def ml_suggestions(df: pd.DataFrame) -> str:
    """
    Generate a summary of the Machine Learning Algorithm Selection suggestions for the dataset.

    The dataset description and the task are sent to the LLM together with the
    shared system prompt.

    Args:
    - df (pd.DataFrame): The dataset to be analyzed.

    Returns:
    - str: The model's response text with the supervised and unsupervised
      machine learning suggestions.
    """

    prompt = generate_prompt_with_dataset(df, goal="""generate Supervised and Unsupervised Machine Learning suggestions based on the dataset provided.""")

    chat_completion = grop_client.chat.completions.create(
        messages=[system_prompt, {"role": "user", "content": prompt}],
        model=LLM_MODEL_NAME
    )

    return chat_completion.choices[0].message.content

In [165]:
def feature_egr(df: pd.DataFrame) -> str:
    """
    Request the Feature Engineering section of the report from the LLM.

    Args:
    - df (pd.DataFrame): The dataset to be analyzed.

    Returns:
    - str: The model's response text with the feature engineering suggestions.
    """
    task = "generate Feature Engineering suggestions based on the dataset provided."
    conversation = [
        system_prompt,
        {"role": "user", "content": generate_prompt_with_dataset(df, goal=task)},
    ]

    reply = grop_client.chat.completions.create(model=LLM_MODEL_NAME, messages=conversation)

    # The text of the first choice is the section content.
    return reply.choices[0].message.content

In [166]:
def model_deployment(df: pd.DataFrame) -> str:
    """
    Request the Model Deployment Strategies and Handling Data Drift section from the LLM.

    Args:
    - df (pd.DataFrame): The dataset to be analyzed.

    Returns:
    - str: The model's response text with the deployment and data-drift suggestions.
    """
    deployment_goal = (
        "generate Model Deployment Strategies and Handling Data Drift "
        "suggestions based on the dataset provided."
    )
    user_prompt = generate_prompt_with_dataset(df, goal=deployment_goal)

    completion = grop_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[
            system_prompt,
            {"role": "user", "content": user_prompt},
        ],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [167]:
def conclusion(df: pd.DataFrame, report: str) -> str:
    """
    Request the closing "Conclusion" section of the report from the LLM.

    The text of the report written so far is embedded in the task description
    so the model can conclude from it.

    Args:
    - df (pd.DataFrame): The dataset used in the report.
    - report (str): The report content generated so far.

    Returns:
    - str: The model's response text for the conclusion.
    """
    closing_goal = f"""
        - This is the last section of the report and the goal from this part of the report is to generate a conclusion based on the dataset provided and the data report file.
        - Here is the report: 
            ############
            {report}
            ############
        - Make it as structured and concise as possible.
        - Give this section an appropriate and descriptive title.
        - Consider that there are complementary sections that were created before this prompt to help the reader understand the context of the dataset and there is no need to reiterate the same information.
        - Name this section Conclusion.
    """

    user_turn = {
        "role": "user",
        "content": generate_prompt_with_dataset(df, goal=closing_goal),
    }

    response = grop_client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[system_prompt, user_turn],
    )

    return response.choices[0].message.content

In [168]:
def generate_report(df: pd.DataFrame, file_name: str = "automated_report.md") -> str:
    """
    Generate an AI report for a given dataset and write it to a markdown file.

    The six section functions are run concurrently in a thread pool; their
    outputs are joined in a fixed order, a conclusion based on the joined text
    is appended, and the result is written to `file_name`.

    Args:
    df (pd.DataFrame): The input dataset.
    file_name (str): Path of the markdown file to write. Defaults to
        "automated_report.md". An existing file is overwritten.

    Returns:
    str: The absolute path of the generated markdown file.

    Raises:
    ValueError: If `df` is None, e.g. because loading the dataset failed.
    """
    # Fail early with a clear message instead of an AttributeError raised
    # from inside a worker thread.
    if df is None:
        raise ValueError("No dataset provided: df is None (did the dataset fail to load?).")

    section_builders = [
        preprocess_data,
        create_statistical_summary,
        eda,
        ml_suggestions,
        feature_egr,
        model_deployment,
    ]

    # executor.map keeps the input order, so sections appear in the order above.
    with ThreadPoolExecutor() as executor:
        sections = list(executor.map(lambda build: build(df), section_builders))

    report = "\n\n".join(sections)
    report += "\n\n" + conclusion(df, report)

    # Write as UTF-8 explicitly: model output may contain non-ASCII characters
    # that the platform default encoding cannot represent.
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(report)

    # Return file's absolute path
    return os.path.abspath(file_name)

In [169]:
# Generate the report for the Automobile dataset and write it to Auto.md.
# The path is relative to the notebook's working directory.
file_path = "../../Data Files/Automobile/Automobile_data.csv"
# load_dataset_from_file prints an error and returns None when loading fails.
df = load_dataset_from_file(file_path)
# The cell output is the absolute path of the written markdown file.
generate_report(df, "Auto.md")

'/Users/ghizlanerehioui/Downloads/Capstone/Code_files/Ghizlane/Auto.md'

In [170]:
# Generate the report for the Car Reviews dataset and write it to Reviews.md.
# The path is relative to the notebook's working directory.
file_path = "../../Data Files/Customer Experience/Car_Reviews_Database.csv"
# load_dataset_from_file prints an error and returns None when loading fails.
df = load_dataset_from_file(file_path)
# The former mid-cell `df.head()` was dropped: only a cell's last expression
# is displayed, so it had no effect.
generate_report(df, "Reviews.md")

'/Users/ghizlanerehioui/Downloads/Capstone/Code_files/Ghizlane/Reviews.md'