In [3]:
import os
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from model.data_ingestion import load_json_files

# --- Service endpoints exposed by the locally running API server ---
PREDICT_API_URL = "http://localhost:8000/predict"
TRAIN_API_URL = "http://localhost:8000/train"

# --- Filesystem locations (relative paths, resolved against the working directory) ---
OUTPUT_PREDICTION_FILE = "predictions.csv"
PREDICTION_DATA_DIR = "data/cs-production"
TRAIN_DATA_DIR = "data/cs-train"

ModuleNotFoundError: No module named 'model'

In [1]:
def initiate_model_training():
    """
    Ask the training API to train the models on the training data directory.

    Posts a JSON payload with ``data_dir`` set to ``TRAIN_DATA_DIR`` and
    ``test`` set to ``True`` to ``TRAIN_API_URL``, then prints whether the
    request succeeded. Nothing is returned.
    """
    # Use the names the configuration cell actually defines. The previously
    # referenced TRAINING_DATA_PATH / TRAIN_API_ENDPOINT do not exist there,
    # so calling this function raised NameError.
    payload = {
        "data_dir": TRAIN_DATA_DIR,
        "test": True
    }
    response = requests.post(TRAIN_API_URL, json=payload)

    # `response.ok` is False for 4xx/5xx status codes.
    if response.ok:
        print("Model training completed successfully.")
    else:
        print(f"Model training failed with status code {response.status_code}: {response.text}")
# Send the training request defined above; the outcome is printed, not returned.
initiate_model_training()

In [None]:
def load_prediction_data(data_directory):
    """
    Read the JSON files found under ``data_directory``.

    This is a thin wrapper that delegates entirely to ``load_json_files``.

    Parameters:
    - data_directory (str): Directory that holds the JSON files.

    Returns:
    - DataFrame: Whatever ``load_json_files`` produces for that directory.
    """
    loaded_frame = load_json_files(data_directory)
    return loaded_frame


# Load the data to predict on. The configuration cell names this directory
# PREDICTION_DATA_DIR; the previously used PREDICTION_DATA_PATH is undefined
# and raised NameError.
prediction_data_frame = load_prediction_data(PREDICTION_DATA_DIR)
prediction_data_frame.head()

In [None]:
def perform_batch_predictions(data_frame):
    """
    Request one prediction per row of ``data_frame`` from the prediction API.

    Each row is sent as a JSON payload (``country``, ``year``, ``month``,
    ``day`` and ``test=True``) to ``PREDICT_API_URL``. On a successful
    response the decoded JSON body is extended with the row's invoice,
    country, total price and a ``year-month-day`` date string. Failed
    requests are reported with ``print`` and skipped.

    Parameters:
    - data_frame (DataFrame): Rows to predict for; must provide the columns
      ``country``, ``year``, ``month``, ``day``, ``invoice`` and ``total_price``.

    Returns:
    - List[Dict]: One dictionary per successful request, in row order.
    """
    prediction_results = []

    for _, record in data_frame.iterrows():
        payload = {
            "country": record['country'],
            "year": record['year'],
            "month": record['month'],
            "day": record['day'],
            "test": True
        }
        # PREDICT_API_URL is the name defined in the configuration cell; the
        # previously referenced PREDICT_API_ENDPOINT is undefined (NameError).
        response = requests.post(PREDICT_API_URL, json=payload)

        if response.ok:
            result = response.json()
            # Attach the source-row fields so each prediction can be traced
            # back to its invoice and analysed alongside the actual amount.
            result.update({
                'invoice_id': record['invoice'],
                'country_code': record['country'],
                'total_amount': record['total_price'],
                'transaction_date': f"{record['year']}-{record['month']}-{record['day']}"
            })
            prediction_results.append(result)
        else:
            print(f"Prediction failed for invoice {record['invoice']}: {response.text}")

    return prediction_results

In [None]:
# Request a prediction for every loaded record.
# (The variable was previously misspelled `redicted_results`, which left
# `predicted_results` undefined on the next statement.)
predicted_results = perform_batch_predictions(prediction_data_frame)

# Save predictions to CSV. The configuration cell names the target file
# OUTPUT_PREDICTION_FILE; OUTPUT_PREDICTIONS_FILE is undefined.
predictions_df = pd.DataFrame(predicted_results)
predictions_df.to_csv(OUTPUT_PREDICTION_FILE, index=False)

# Display the first few predictions
predictions_df.head()

## Exploratory Data Analysis (EDA)
# Total revenue per country, largest first.
country_revenue = (
    predictions_df
    .groupby('country_code')['total_amount']
    .sum()
    .reset_index()
    .rename(columns={'total_amount': 'total_revenue'})
    .sort_values('total_revenue', ascending=False)
)
country_revenue.head()

# Summary statistics of the numeric columns in the prediction results.
predictions_df.describe()

# Histogram (with KDE overlay) of the `total_amount` column, i.e. the
# `total_price` value copied from each input record.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(predictions_df['total_amount'], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Total Amounts')
ax.set_xlabel('Total Amount')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Mean predicted value per country, drawn as bars in ascending order.
fig, ax = plt.subplots(figsize=(12, 6))
average_predictions_by_country = (
    predictions_df
    .groupby('country_code')['y_pred']
    .mean()
    .sort_values()
)
average_predictions_by_country.plot(kind='bar', ax=ax)
ax.set_title('Average Predictions by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Average Prediction')
plt.show()

In [None]:
# Bar plot of the first five rows of `country_revenue` (sorted by revenue, descending).
fig, ax = plt.subplots(figsize=(12, 6))
top_five_countries = country_revenue.head(5)
sns.barplot(x='country_code', y='total_revenue', data=top_five_countries, ax=ax)
ax.set_title('Top 5 Countries by Total Revenue')
ax.set_xlabel('Country')
ax.set_ylabel('Total Revenue')
plt.show()

In [None]:
# Parse the "year-month-day" strings into datetimes so the x-axis is a real
# time axis. The column is overwritten on `predictions_df` itself.
parsed_dates = pd.to_datetime(predictions_df['transaction_date'])
predictions_df['transaction_date'] = parsed_dates

# Line plot of the mean prediction for each transaction date.
fig, ax = plt.subplots(figsize=(12, 6))
daily_mean_prediction = predictions_df.groupby('transaction_date')['y_pred'].mean()
daily_mean_prediction.plot(ax=ax)
ax.set_title('Predictions Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Prediction')
plt.show()

In [None]:
# Pairwise correlation between the recorded amount and the prediction,
# rendered as an annotated heatmap.
compared_columns = ['total_amount', 'y_pred']
correlation_matrix = predictions_df[compared_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Shell escape: convert prediction.ipynb (presumably this notebook — confirm)
# into a slide deck; `--no-input` leaves the code cells out of the export.
!jupyter nbconvert --to slides --no-input prediction.ipynb