In [40]:
# Installing required dependencies 
!pip install -Uqqq pip --progress-bar off
!pip install -qqq groq==0.13.0 --progress-bar off
!pip install -qqq python-dotenv==1.0.1 --progress-bar off
!pip install -qqq fpdf pandas matplotlib seaborn requests --progress-bar off

 

4729.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4735.95s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4741.76s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4747.55s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [41]:
# Importing libraries
 
import os
import re
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from groq import Groq
from fpdf import FPDF

In [42]:
 # Initializing Groq API
 
# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): the key that was previously hardcoded here must be treated as
# leaked and revoked in the Groq console.
Groq_API_key = os.environ.get("GROQ_API_KEY")
if not Groq_API_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before running this notebook."
    )
client = Groq(api_key=Groq_API_key)

# Placeholder strings that are passed to pandas as `na_values` when the CSV is read.
MISSING_VALUE_INDICATORS = ["?", "NA", "N/A", "NaN", "NULL", "", "Unknown", "missing"]

In [43]:
# Loading dataset with missing-value detection
 
# Relative path of the automobile CSV dataset.
dataset_path = "Automobile_data.csv"

In [50]:
 
# Load the CSV from the configured path; every string listed in
# MISSING_VALUE_INDICATORS is parsed as NaN.
df = pd.read_csv(dataset_path, na_values=MISSING_VALUE_INDICATORS)

print(" Dataset Loaded Successfully!")
print("\nDataset Overview:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values:\n", df.isnull().sum())


 Dataset Loaded Successfully!

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 no

In [48]:
#  Prompting
def generate_ai_prompt(df):
    """Build the recommendation prompt for the language model.

    The per-column dtypes and per-column null counts of ``df`` are rendered
    as text and embedded into a fixed instruction template.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        str: The fully formatted prompt.
    """
    dtype_listing = df.dtypes.to_string()
    null_counts = df.isnull().sum().to_string()

    return f""" 
    Analyze the dataset and provide **recommendations** for the data science workflow.

    **Dataset Summary**
    - Column Data Types: {dtype_listing}
    - Missing Values Summary: {null_counts}

    1 **Handling Missing Values**
       - Should we drop rows/columns?
       - Should we fill missing values with mean, median, or mode?
       - Should we create a 'Missing' category?

    2 **Recommended Visualizations** (DO NOT GENERATE, JUST SUGGEST)
       - Suggest histograms, scatter plots, box plots, or correlation heatmaps based on dataset type.
       - Explain why each visualization is relevant.

    3 **Machine Learning Model Recommendations**
       - Is this dataset suitable for classification, regression, or clustering?
       - Suggest appropriate ML models (e.g., Logistic Regression, Decision Trees, Random Forest, K-Means, etc.).
       - Justify why each model is suitable.

    4 **Model Evaluation Techniques**
       - Identify which **evaluation metrics** are best suited for this dataset.
       - If the dataset is **classification**, recommend: Accuracy, Precision, Recall, F1-score, AUC-ROC, and explain why.
       - If the dataset is **regression**, recommend: RMSE, MAE, R²-score, and explain why.
       - If the dataset is **clustering**, recommend: Silhouette Score, Davies-Bouldin Index, Adjusted Rand Index, and explain why.
    """

# Send the prompt to the LLaMA 3 model hosted by Groq
ai_prompt = generate_ai_prompt(df)

try:
    chat_completion = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": ai_prompt}],
    )

    # Pull out the first choice's text; treat "no choices" and "empty text" alike
    choices = chat_completion.choices
    reply_text = choices[0].message.content if choices else None

    if reply_text:
        ai_response = reply_text
        print("\n AI-Powered Data Science Recommendations:\n", ai_response)
    else:
        print("\n ERROR: AI Response is empty!")
        ai_response = "No AI response received. Please check your API key or try again."

except Exception as e:
    # Best effort: report the failure and continue with a placeholder response
    print("\n ERROR: API call failed!", str(e))
    ai_response = "No AI response received due to an error."

# Persist the response (or the placeholder) so later steps can read it from disk
with open("ai_response.txt", "w", encoding="utf-8") as response_file:
    response_file.write(ai_response)



 AI-Powered Data Science Recommendations:
 Based on the dataset summary, I provide the following recommendations for the data science workflow:

**1. Handling Missing Values**

* For columns with a small number of missing values (less than 5%), I recommend filling them with the median or mean for numerical columns (e.g., `bore`, `stroke`, `horsepower`, `peak-rpm`, and `price`) and the mode for categorical columns (e.g., `num-of-doors`).
* For columns with a larger number of missing values (e.g., `normalized-losses`), I recommend creating a 'Missing' category, as this might be a relevant feature in the analysis.
* Dropping rows or columns with missing values might not be the best approach, as it could lead to loss of valuable information and biased results.

**2. Recommended Visualizations**

* Histograms for numerical columns (e.g., `engine-size`, `horsepower`, `city-mpg`, `highway-mpg`, and `price`) to understand the distribution of values.
* Scatter plots for numerical columns (e.g.

In [49]:
import os
from fpdf import FPDF

# Reload the saved model response from disk, falling back to a placeholder
if not os.path.exists("ai_response.txt"):
    ai_response = " No AI response found. Please check Step 4."
else:
    with open("ai_response.txt", "r", encoding="utf-8") as saved_response:
        ai_response = saved_response.read()

class PDF(FPDF):
    """FPDF subclass that defines the report's page header and footer."""

    def header(self):
        """Draw the report title and subtitle, centred between the page margins."""
        self.set_font("Arial", "B", 16)
        # Width 0 makes the cell span up to the right margin, so align="C"
        # centres the text on the printable area (a fixed width of 200 is wider
        # than that area and shifts the text off-centre). Matches footer().
        self.cell(0, 10, "Data Science Recommendations", ln=True, align="C")
        self.ln(10)
        self.set_font("Arial", "I", 12)
        self.cell(0, 10, "A comprehensive analysis and recommendations report", ln=True, align="C")
        self.ln(20)

    def footer(self):
        """Draw the centred page number 15 units above the bottom of the page."""
        self.set_y(-15)
        self.set_font("Arial", "I", 10)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")

def generate_pdf_report(text, filename):
    """Render ``text`` into a PDF report and write it to ``filename``.

    Args:
        text: Report body to place under the "Your Report" heading.
        filename: Path of the PDF file to write.
    """
    # The built-in Arial font only covers Latin-1. LLM output often contains
    # characters outside that range (curly quotes, bullets, emoji), which would
    # otherwise raise an encoding error while rendering; replace them with "?".
    printable_text = text.encode("latin-1", errors="replace").decode("latin-1")

    pdf = PDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Extra vertical gap between the page header and the report body
    pdf.ln(10)

    # Section heading followed by the report body
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "Your Report", ln=True)
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, printable_text)

    # Save the PDF
    pdf.output(filename)
    print(f"\n AI-Generated Report saved as: {filename}")

# Generate the PDF report from the AI response loaded above and save it as Report.pdf
generate_pdf_report(ai_response, "Report.pdf")



 AI-Generated Report saved as: Report.pdf
