In [None]:
# Batch Prediction and EDA
''' 
This notebook performs batch predictions using the FastAPI model API and then conducts exploratory data analysis (EDA) 
on the predictions.
'''
## Imports and Setup
import os
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define API endpoints.
# Both URLs are derived from a single base address. It defaults to the local
# development server, and can be overridden with the MODEL_API_BASE_URL
# environment variable (e.g. when the API runs on another host or port), so
# the notebook does not have to be edited to point at a different server.
API_BASE_URL = os.environ.get("MODEL_API_BASE_URL", "http://localhost:8000").rstrip("/")
TRAIN_API_URL = f"{API_BASE_URL}/train"
PREDICT_API_URL = f"{API_BASE_URL}/predict"


## Train models

In [None]:
# Helper that asks the API to train the models
def train_models():
    """POST a training request to TRAIN_API_URL and print the outcome.

    The request body points the API at the "data/cs-train" directory with
    "test" set to False. A 200 status is reported as success; any other
    status prints the raw response content. Nothing is returned.
    """
    payload = {"data_dir": "data/cs-train", "test": False}
    resp = requests.post(TRAIN_API_URL, json=payload)
    if resp.status_code != 200:
        print(f"Training failed: {resp.content}")
        return
    print("Training completed successfully.")

# Kick off training
train_models()

## Batch prediction

In [None]:
# Folder containing the production JSON files
DATA_DIR = os.path.join("data", "cs-production")

# Function to check if a date is within the training data range
def is_date_in_range(date_str, start_date, end_date):
    """Return whether ``date_str`` falls between ``start_date`` and ``end_date``.

    All three arguments are parsed with ``pd.to_datetime`` before being
    compared; both ends of the range are inclusive.
    """
    # Parse in the order date, start, end
    target, lower, upper = (
        pd.to_datetime(value) for value in (date_str, start_date, end_date)
    )
    return lower <= target and target <= upper

# Function to send prediction requests to the API
def get_predictions(file_path):
    """Request one prediction per invoice stored in a JSON file.

    Args:
        file_path: Path to a JSON file that loads to an iterable of invoice
            records. Each record is read for the keys 'year', 'month', 'day',
            'country', 'invoice' and 'total_price'.

    Returns:
        A list with one entry per successful request: the decoded JSON
        response, extended with the invoice's 'invoice', 'country' and
        'total_price' values and a 'date' string built as "year-month-day".
        Requests that do not return status 200 are reported with print()
        and left out of the list.
    """
    with open(file_path, 'r') as handle:
        invoices = json.load(handle)

    collected = []
    for record in invoices:
        year, month, day = record['year'], record['month'], record['day']
        payload = {
            "country": record['country'],
            "year": year,
            "month": month,
            "day": day,
            "test": False
        }

        resp = requests.post(PREDICT_API_URL, json=payload)
        if resp.status_code != 200:
            print(f"Failed to get prediction for invoice {record['invoice']}: {resp.content}")
            continue

        # Attach the invoice fields to the API response so every result row
        # carries the data it was requested for
        row = resp.json()
        row['invoice'] = record['invoice']
        row['country'] = record['country']
        row['total_price'] = record['total_price']
        row['date'] = f"{year}-{month}-{day}"
        collected.append(row)

    return collected

# Path to the prediction data folder. Reuse DATA_DIR (defined at the top of
# this cell) so the location is configured in exactly one place.
data_dir = DATA_DIR

# Collect all JSON files in the production data directory. os.listdir returns
# entries in arbitrary order, so sort them to make the request order (and the
# row order of the saved CSV) reproducible from run to run.
json_files = sorted(
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if name.endswith('.json')
)

# Get predictions for every file and flatten them into one list of records
all_predictions = []
for json_path in json_files:
    all_predictions.extend(get_predictions(json_path))

# Build the DataFrame once from the collected records and save it to CSV
predictions_df = pd.DataFrame(all_predictions)
predictions_df.to_csv("batch_predictions.csv", index=False)

# Display the first rows of the predictions DataFrame
predictions_df.head()

## EDA

In [None]:
# Load predictions data
predictions_df = pd.read_csv("batch_predictions.csv")

# Total revenue per country, highest first. After the groupby/sum the only
# value column is 'total_price', so the sort and the rename must use that
# name; sorting by 'price' raised a KeyError because no such column exists
# in this two-column selection.
revenue_by_country = (
    predictions_df[['country', 'total_price']]
    .groupby('country')
    .sum()
    .sort_values('total_price', ascending=False)
    .rename(columns={'total_price': 'revenue'})
    .reset_index()
)

# Only the last expression of a cell is rendered automatically, so show this
# intermediate table explicitly (display is provided by IPython in notebooks).
display(revenue_by_country.head())

# Display basic statistics
predictions_df.describe()


### Distribution of price

In [None]:
# Histogram of the invoice totals with a KDE curve overlaid.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# replacement and keeps the density-scaled y axis. The column added to the
# predictions by this notebook is 'total_price' (not 'price').
sns.histplot(predictions_df['total_price'], bins=10, kde=True, stat='density')
plt.title('Distribution of Total Price')
plt.xlabel('Total Price')
plt.ylabel('Density')
plt.show()


### Revenue by country

In [None]:
# Bar chart of revenue for the first five rows of revenue_by_country
top_countries = revenue_by_country.head(5)
sns.catplot(data=top_countries, x='country', y='revenue', kind='bar')
plt.show()

### Predictions by country

In [None]:
# Mean predicted value per country, drawn as a bar chart
fig, ax = plt.subplots(figsize=(12, 6))
mean_pred_by_country = predictions_df.groupby('country')['y_pred'].mean()
mean_pred_by_country.plot(kind='bar', ax=ax)
ax.set_title('Average Predictions by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Average Prediction')
plt.show()


### Predictions over time

In [None]:
# Parse the date column in place, then chart the mean prediction per date
predictions_df['date'] = pd.to_datetime(predictions_df['date'])
fig, ax = plt.subplots(figsize=(12, 6))
daily_mean_pred = predictions_df.groupby('date')['y_pred'].mean()
daily_mean_pred.plot(ax=ax)
ax.set_title('Predictions Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Prediction')
plt.show()

### Distribution of Predictions


In [None]:
# Histogram of the predicted values with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(predictions_df['y_pred'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Predictions')
ax.set_xlabel('Prediction')
ax.set_ylabel('Frequency')
plt.show()

### Correlation Analysis

In [None]:
# Correlation Analysis
# Restrict the frame to numeric columns first: it also holds text columns
# such as 'country', and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently dropping it.
numeric_df = predictions_df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Convert Notebook to HTML Report

In [None]:
# Export this notebook with nbconvert: `--to slides` writes a slideshow,
# `--no-input` leaves the code cells' input out of the result, and
# `--post serve` serves the output once the conversion is done.
# NOTE(review): the section title says "HTML Report" but the target format is
# slides — confirm which is intended. The serve step presumably keeps this
# cell running until it is interrupted.
!jupyter nbconvert batch_prediction_and_eda.ipynb --to slides --no-input --post serve