In [None]:
# Batch Prediction and EDA
''' 
This notebook performs batch predictions using the FastAPI model API and then conducts exploratory data analysis (EDA) 
on the predictions.
'''
## Imports and Setup
import os
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from model.data_ingestion import load_json_files

# API endpoints -- both routes live under the same local base address
API_BASE_URL = "http://localhost:8000"
TRAIN_API_URL = f"{API_BASE_URL}/train"
PREDICT_API_URL = f"{API_BASE_URL}/predict"

# Input data directories and the CSV file the batch predictions are written to
TRAIN_DATA_DIR, PREDICTION_DATA_DIR = "data/cs-train", "data/cs-production"
OUTPUT_PREDICTION_FILE = "batch_predictions.csv"


## Train models

In [None]:
# Function to send training request to the API
def train_models(data_dir=None, test=True):
    """Ask the model API to train and report whether the request succeeded.

    Sends a POST request to ``TRAIN_API_URL`` whose JSON body contains the
    keys ``data_dir`` and ``test``, then prints a one-line status message.

    Args:
        data_dir: Value sent as ``data_dir``. When omitted, ``TRAIN_DATA_DIR``
            is used, which matches calling the function with no arguments.
        test: Value sent as the ``test`` flag (defaults to ``True``).
            NOTE(review): presumably selects a test mode on the server --
            confirm against the API implementation.

    Returns:
        bool: ``True`` if the API answered with HTTP 200, ``False`` otherwise.

    Raises:
        requests.RequestException: If the request cannot be sent at all
            (e.g. the API is not reachable). This is deliberately not caught
            so that a missing server stops the run here.
    """
    payload = {
        "data_dir": TRAIN_DATA_DIR if data_dir is None else data_dir,
        "test": test,
    }
    response = requests.post(TRAIN_API_URL, json=payload)

    if response.status_code == 200:
        print("Training completed successfully.")
        return True

    # .text is the decoded body; .content would be printed as a bytes repr (b'...').
    print(f"Training failed: {response.text}")
    return False

# Train the models. This POSTs to TRAIN_API_URL, so the API at that address
# must be reachable before this cell is run.
train_models()

## Data Ingestion

In [None]:
# Ingest data from JSON files
def ingest_data(data_dir):
    """Load the JSON files found in ``data_dir``.

    Thin wrapper that delegates to ``load_json_files`` from
    ``model.data_ingestion`` and hands its result back unchanged.

    Args:
        data_dir: Directory passed straight through to ``load_json_files``.

    Returns:
        Whatever ``load_json_files`` returns. The rest of this notebook
        treats it as a pandas DataFrame (it calls ``.head()`` and
        ``.iterrows()`` on it).
    """
    return load_json_files(data_dir)

# Ingest the prediction data from PREDICTION_DATA_DIR
prediction_data = ingest_data(PREDICTION_DATA_DIR)
# Last expression of the cell: rendered as a preview of the first rows
prediction_data.head()

## Batch prediction

In [None]:
# Function to send prediction requests to the API
def get_predictions(df):
    """Request one prediction per row of ``df`` from the prediction API.

    For every row, a POST request is sent to ``PREDICT_API_URL`` with a JSON
    body holding the row's ``country``, ``year``, ``month`` and ``day`` plus
    ``"test": True``. A successful (HTTP 200) JSON response is treated as a
    dict and extended with the row's ``invoice``, ``country`` and
    ``total_price`` and a ``date`` string of the form ``year-month-day``
    (values are inserted as-is, without zero padding).

    Rows whose request fails -- either a non-200 answer or a network-level
    error -- are reported with ``print`` and skipped, so one bad request does
    not discard the predictions already collected.

    Args:
        df: Frame with the columns ``invoice``, ``country``, ``year``,
            ``month``, ``day`` and ``total_price``.

    Returns:
        list[dict]: One dict per successfully predicted row, in row order.
    """
    results = []

    # One session for the whole batch reuses the underlying connection
    # instead of opening a new one for every row.
    with requests.Session() as session:
        for _, invoice in df.iterrows():
            date_str = f"{invoice['year']}-{invoice['month']}-{invoice['day']}"

            data = {
                "country": invoice['country'],
                "year": invoice['year'],
                "month": invoice['month'],
                "day": invoice['day'],
                "test": True
            }

            try:
                response = session.post(PREDICT_API_URL, json=data)
            except requests.RequestException as exc:
                # Network-level failure for this row only: report it and keep going.
                print(f"Failed to get prediction for invoice {invoice['invoice']}: {exc}")
                continue

            if response.status_code != 200:
                # .text is the decoded body; .content would be printed as a bytes repr (b'...').
                print(f"Failed to get prediction for invoice {invoice['invoice']}: {response.text}")
                continue

            prediction = response.json()
            # Attach the identifying fields of the source row to the API answer.
            prediction['invoice'] = invoice['invoice']
            prediction['country'] = invoice['country']
            prediction['total_price'] = invoice['total_price']
            prediction['date'] = date_str
            results.append(prediction)

    return results

# Get predictions (get_predictions sends one API request per row of prediction_data)
all_predictions = get_predictions(prediction_data)

# Collect the predictions into a DataFrame and write them to OUTPUT_PREDICTION_FILE
# as CSV, without the index column
predictions_df = pd.DataFrame(all_predictions)
predictions_df.to_csv(OUTPUT_PREDICTION_FILE, index=False)

# Display the first rows of the predictions DataFrame as the cell output
predictions_df.head()

In [None]:
# Load predictions data if needed
#predictions_df = pd.read_csv("batch_predictions.csv")

# Sum total_price per country, largest first, and expose it as a 'revenue' column
revenue_by_country = (
    predictions_df[['country', 'total_price']]
    .groupby('country')
    .sum()
    .sort_values('total_price', ascending=False)
    .rename(columns={'total_price': 'revenue'})
    .reset_index()
)
# Only the last expression of a cell is rendered automatically, so a bare
# .head() in the middle of the cell would be discarded; display it explicitly.
display(revenue_by_country.head())

# Display basic statistics
predictions_df.describe()

## EDA

### Distribution of total price

In [None]:
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with a density-normalised histogram plus a KDE curve is its
# documented replacement.
ax = sns.histplot(predictions_df['total_price'], bins=10, kde=True, stat='density')
ax.set_title('Distribution of Total Price')
ax.set_xlabel('Total Price')
plt.show()


### Revenue by country

In [None]:
# Bar chart of the mean y_pred value for each country
fig, ax = plt.subplots(figsize=(12, 6))
mean_pred_by_country = predictions_df.groupby('country')['y_pred'].mean()
mean_pred_by_country.plot(kind='bar', ax=ax)
ax.set_title('Average Predictions by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Average Prediction')
plt.show()


In [None]:
# Bar chart of revenue for the first five rows of revenue_by_country
first_five_countries = revenue_by_country.iloc[:5]
sns.catplot(data=first_five_countries, x='country', y='revenue', kind='bar')
plt.show()

### Predictions over time

In [None]:
# Plot predictions over time
# Parse the 'date' strings so the x-axis is a real datetime axis
predictions_df['date'] = pd.to_datetime(predictions_df['date'])

fig, ax = plt.subplots(figsize=(12, 6))
daily_mean_prediction = predictions_df.groupby('date')['y_pred'].mean()
daily_mean_prediction.plot(ax=ax)
ax.set_title('Predictions Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Prediction')
plt.show()

### Correlation Analysis

In [None]:
# Correlation Analysis
# 'country' presumably holds country names rather than numbers. DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 (older versions silently
# dropped them), so restrict the selection to numeric columns first.
numeric_columns = predictions_df[['country', 'total_price', 'y_pred']].select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## Convert Notebook to HTML Report

In [None]:
# Convert the saved notebook to slides (code inputs hidden via --no-input) and serve them locally
!jupyter nbconvert batch_prediction_and_eda.ipynb --to slides --no-input --post serve