# Load libraries and the processed dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import plotly.express as px
from hiplot import Experiment

# Switch Altair to its 'vegafusion' data transformer for the rest of the session.
# NOTE(review): presumably enabled so charts can be built from large DataFrames;
# no Altair chart is created in the visible cells — confirm it is still needed.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [4]:
# Read the cleaned recipes DataFrame back from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# were produced by a trusted step of this project.
BALANCED_DATASET_PATH = './../../NLP_NLG_Recipe_prediction_generation_app_data/Data_Diagnostics_data/balanced_recipes_dataset.pkl'
balanced_df = pd.read_pickle(BALANCED_DATASET_PATH)

In [5]:
# Preview the first five rows together with the (rows, columns) shape.
(balanced_df.head(), balanced_df.shape)

(                                         recipe_title  \
 10                                 Braised Anise Beef   
 11                                       Braised Beef   
 12                      Crock Pot Shredded French Dip   
 13                         Creamy Summertime Coleslaw   
 14  Beef Tenderloin Steaks with Seared Mushrooms a...   
 
                                      ingredients_list  \
 10  beef chuck garlic oil dsh pepper salt sherry s...   
 11  scallion sherry sugar garlic salt pepper peanu...   
 12  chuck roast garlic ground mustard seasoning be...   
 13  apple cider vinegar cabbage chives yogurt garl...   
 14  bacon beef tenderloin steaks pepper cremini mu...   
 
                                          instructions  cuisine link source  \
 10  <ol><li>1. Leave beef whole. Crush garlic.</li...  italian  NaN    NaN   
 11  Cut green part of scallion into 2" pieces. Com...  italian  NaN    NaN   
 12  <ol><li>Place beef in crock pot. Add beef brot...  italian

# Visualization & Findings | EDA

In [6]:
# Hard-coded number of recipes per cuisine label, listed from largest to smallest.
# NOTE(review): these figures are typed in by hand rather than computed from
# `balanced_df`; presumably they describe the dataset before balancing — confirm.
cuisine_counts = {
    'italian': 1_793_939,
    'mexican': 808_277,
    'chinese': 778_904,
    'indian': 114_397,
    'mediterranean': 43_804,
    'southern_us': 35_726,
    'spanish': 5_096,
    'japanese': 5_027,
    'middle eastern': 1_007,
    'vietnamese': 818,
    'greek': 774,
    'french': 38,
    'jamaican': 26,
    'moroccan': 19,
    'brazilian': 19,
    'cajun_creole': 7,
}

In [7]:
# Two-column frame (one row per cuisine) used by the count bar charts.
cuisine_counts_df = pd.DataFrame({
    'cuisine': list(cuisine_counts.keys()),
    'count': list(cuisine_counts.values()),
})

# Cuisine labels split into the four largest classes and all remaining ones;
# the plotting helpers draw one panel per group.
majority_classes = [
    'italian',
    'mexican',
    'chinese',
    'indian',
]
minority_classes = [
    'mediterranean',
    'southern_us',
    'spanish',
    'japanese',
    'middle eastern',
    'vietnamese',
    'greek',
    'french',
    'jamaican',
    'moroccan',
    'brazilian',
    'cajun_creole',
]

In [8]:
# 1. Count Plot using Seaborn for Majority and Minority Classes
def plot_cuisine_distribution():
    """Draw side-by-side bar charts of recipe counts per cuisine.

    Reads the module-level ``cuisine_counts_df`` (columns ``cuisine`` and
    ``count``) and draws one panel for ``majority_classes`` and one for
    ``minority_classes``.

    The majority panel keeps the fixed linear range 0-2,000,000. The minority
    panel uses a logarithmic y-axis: with the previous fixed 0-200,000 linear
    range, every minority bar was squeezed into the bottom of the panel and
    most of them were not visible at all.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, (majority_ax, minority_ax) = plt.subplots(1, 2, figsize=(16, 8))

    panels = (
        (majority_ax, majority_classes, 'Majority Cuisine Distribution'),
        (minority_ax, minority_classes, 'Minority Cuisine Distribution'),
    )
    for ax, class_names, title in panels:
        subset = cuisine_counts_df[cuisine_counts_df['cuisine'].isin(class_names)]
        # Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
        # x variable is also mapped to hue; dodge=False keeps full-width bars
        # on older seaborn releases.
        sns.barplot(data=subset, x='cuisine', y='count',
                    hue='cuisine', dodge=False, palette='viridis', ax=ax)
        # Older seaborn releases add a legend that only repeats the x labels.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_xlabel('Cuisine')
        ax.set_ylabel('Count')

    majority_ax.set_ylim(0, 2000000)
    # Minority counts span several orders of magnitude; a log axis keeps the
    # smallest classes readable next to the largest ones.
    minority_ax.set_yscale('log')

    fig.tight_layout()
    plt.show()
    
# plot_cuisine_distribution()

In [9]:
def _count_unique_tokens(ingredients_text):
    """Return how many distinct whitespace-separated tokens the string contains."""
    return len(set(ingredients_text.split()))


# Adds the column in place on `balanced_df`. Tokens are single words, so an
# ingredient written as several words contributes one count per distinct word.
balanced_df['num_unique_ingredients'] = balanced_df['ingredients_list'].apply(_count_unique_tokens)

def plot_unique_ingredients_boxplot():
    """Draw per-cuisine box plots of ``num_unique_ingredients`` in two panels.

    Reads the module-level ``balanced_df`` (which must already carry the
    ``num_unique_ingredients`` column) and draws one panel for
    ``majority_classes`` and one for ``minority_classes``. Both panels share
    the same y-range, 0 to 110% of the column maximum over the whole frame, so
    the two groups can be compared directly.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Computed once; both panels use the same upper limit.
    y_upper = balanced_df['num_unique_ingredients'].max() * 1.1

    fig, (majority_ax, minority_ax) = plt.subplots(1, 2, figsize=(16, 8))

    panels = (
        (majority_ax, majority_classes, 'Majority Classes: Unique Ingredients'),
        (minority_ax, minority_classes, 'Minority Classes: Unique Ingredients'),
    )
    for ax, class_names, title in panels:
        subset = balanced_df[balanced_df['cuisine'].isin(class_names)]
        # Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
        # x variable is also mapped to hue; dodge=False keeps full-width boxes
        # on older seaborn releases.
        sns.boxplot(data=subset, x='cuisine', y='num_unique_ingredients',
                    hue='cuisine', dodge=False, palette='viridis', ax=ax)
        # Older seaborn releases add a legend that only repeats the x labels.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_ylim(0, y_upper)
        ax.set_xlabel('Cuisine')
        ax.set_ylabel('Number of Unique Ingredients')

    fig.tight_layout()
    plt.show()

# plot_unique_ingredients_boxplot()

In [10]:
def plot_interactive_scatter():
    """Show two Plotly scatter charts of unique-ingredient counts by cuisine.

    Rows of the module-level ``balanced_df`` are filtered first to
    ``majority_classes`` and then to ``minority_classes``; each subset gets its
    own 400-pixel-high figure, displayed immediately with ``.show()``.

    Returns:
        None.
    """
    def _show_group(class_names, chart_title):
        # One figure per cuisine group: x = ingredient count, y and colour = cuisine.
        in_group = balanced_df['cuisine'].isin(class_names)
        figure = px.scatter(
            balanced_df[in_group],
            x='num_unique_ingredients',
            y='cuisine',
            color='cuisine',
            title=chart_title,
            labels={'num_unique_ingredients': 'Number of Unique Ingredients'},
            height=400,
        )
        figure.show()

    _show_group(majority_classes, 'Majority Classes: Unique Ingredients')
    _show_group(minority_classes, 'Minority Classes: Unique Ingredients')
    
# plot_interactive_scatter()

In [11]:
def plot_ingredients_histogram():
    """Draw stacked histograms of ``num_unique_ingredients`` in two panels.

    The left panel covers rows of the module-level ``balanced_df`` whose
    cuisine is in ``majority_classes``; the right panel covers
    ``minority_classes``. Within each panel the bars are stacked and coloured
    by cuisine.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 8))

    # Majority classes on the left-hand axes.
    majority_rows = balanced_df[balanced_df['cuisine'].isin(majority_classes)]
    sns.histplot(data=majority_rows, x='num_unique_ingredients', hue='cuisine',
                 multiple='stack', palette='viridis', ax=left_ax)
    left_ax.set_title('Majority Classes: Distribution of Unique Ingredients')
    left_ax.set_xlabel('Number of Unique Ingredients')
    left_ax.set_ylabel('Frequency')

    # Minority classes on the right-hand axes.
    minority_rows = balanced_df[balanced_df['cuisine'].isin(minority_classes)]
    sns.histplot(data=minority_rows, x='num_unique_ingredients', hue='cuisine',
                 multiple='stack', palette='viridis', ax=right_ax)
    right_ax.set_title('Minority Classes: Distribution of Unique Ingredients')
    right_ax.set_xlabel('Number of Unique Ingredients')
    right_ax.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

# plot_ingredients_histogram()

In [12]:
def plot_correlation_heatmap():
    """Show the mean ``num_unique_ingredients`` per cuisine as an annotated heatmap.

    Despite the function name, no correlation is computed: the module-level
    ``balanced_df`` is aggregated to one mean value per cuisine and that
    single-column table is drawn with the 'coolwarm' colour map.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # One row per cuisine, one column holding the mean ingredient count.
    mean_by_cuisine = balanced_df.pivot_table(
        values='num_unique_ingredients',
        index='cuisine',
        aggfunc='mean',
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(mean_by_cuisine, cmap='coolwarm', annot=True, ax=ax)
    ax.set_title('Mean Number of Unique Ingredients per Cuisine')
    ax.set_ylabel('Cuisine')
    ax.set_xlabel('Mean Number of Unique Ingredients')
    plt.show()

# plot_correlation_heatmap()