# Further Data Diagnostics of the processes data

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sp
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned DataFrame
combined_recipes_df = pd.read_pickle('./../Data_model/processed_recipes_data.pkl')

In [3]:
# Convert lists or tuples in 'ingredients_list' and 'recipe_title' to strings (joining by space)
combined_recipes_df['ingredients_list'] = combined_recipes_df['ingredients_list'].apply(lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else str(x))
combined_recipes_df['recipe_title'] = combined_recipes_df['recipe_title'].apply(lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else str(x))

# Features (ingredients_list and recipe_title) and target (cuisine)
X = combined_recipes_df[['ingredients_list', 'recipe_title']]
y = combined_recipes_df['cuisine']

# Initialize TF-IDF vectorizers for both ingredients_list and recipe_title
tfidf_ingredients = TfidfVectorizer(max_features=100)  # Limit to 100 most relevant words
tfidf_title = TfidfVectorizer(max_features=100)         # Limit to 100 most relevant words

# Fit and transform both ingredients_list and recipe_title
X_ingredients = tfidf_ingredients.fit_transform(X['ingredients_list'])
X_title = tfidf_title.fit_transform(X['recipe_title'])

# Combine both vectorized features into one sparse matrix
X_combined = sp.hstack([X_ingredients, X_title])

In [6]:
# Combine original data into a DataFrame
original_data = combined_recipes_df.copy()
original_data['cuisine'] = y

# Separate majority and minority classes
majority_class = original_data[original_data['cuisine'] == original_data['cuisine'].value_counts().idxmax()]
minority_classes = original_data[original_data['cuisine'] != original_data['cuisine'].value_counts().idxmax()]

# Randomly oversample minority classes
minority_upsampled = resample(minority_classes,
                              replace=True,     # Sample with replacement
                              n_samples=majority_class.shape[0],  # Match the number of majority class samples
                              random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
balanced_df = pd.concat([majority_class, minority_upsampled])

# Check the class distribution of the balanced dataset
print(balanced_df['cuisine'].value_counts())

# Prepare the final features and target for the balanced dataset
X_balanced = balanced_df[['ingredients_list', 'recipe_title']]
y_balanced = balanced_df['cuisine']

# Transform the balanced dataset using the same TF-IDF vectorizers
X_ingredients_balanced = tfidf_ingredients.transform(X_balanced['ingredients_list'])
X_title_balanced = tfidf_title.transform(X_balanced['recipe_title'])

# Combine both vectorized features into one sparse matrix
X_combined_balanced = sp.hstack([X_ingredients_balanced, X_title_balanced])

italian           1793939
mexican            808277
chinese            778904
indian             114397
mediterranean       43804
southern_us         35726
spanish              5096
japanese             5027
middle eastern       1007
vietnamese            818
greek                 774
french                 38
jamaican               26
moroccan               19
brazilian              19
cajun_creole            7
Name: cuisine, dtype: int64


In [7]:
balanced_df.head(n=10)

Unnamed: 0,recipe_title,ingredients_list,instructions,cuisine,link,source,NER,ingredients_str,cluster
10,Braised Anise Beef,beef chuck garlic oil dsh pepper salt sherry s...,<ol><li>1. Leave beef whole. Crush garlic.</li...,italian,,,,beef chuck garlic oil dsh pepper salt sherry s...,14
11,Braised Beef,scallion sherry sugar garlic salt pepper peanu...,"Cut green part of scallion into 2"" pieces. Com...",italian,,,,scallion sherry sugar garlic salt pepper peanu...,14
12,Crock Pot Shredded French Dip,chuck roast garlic ground mustard seasoning be...,<ol><li>Place beef in crock pot. Add beef brot...,italian,,,,chuck roast garlic ground mustard seasoning be...,8
13,Creamy Summertime Coleslaw,apple cider vinegar cabbage chives yogurt garl...,<ol><li>Pour the contents of the pound bag of ...,italian,,,,apple cider vinegar cabbage chives yogurt garl...,14
14,Beef Tenderloin Steaks with Seared Mushrooms a...,bacon beef tenderloin steaks pepper cremini mu...,<ol><li>1. Brush the steaks on both sides with...,italian,,,,bacon beef tenderloin steaks pepper cremini mu...,14
80,Asparagus Lemon Risotto,vegetable broth water olive oil onion garlic a...,"In a large saucepan, combine broth and water. ...",italian,,,,vegetable broth water olive oil onion garlic a...,14
81,"Spring Risotto with Shrimp, Asparagus and Arti...",vegetable broth butter shallot garlic glove ar...,"In a small saucepan, bring chicken/vegetable b...",italian,,,,vegetable broth butter shallot garlic glove ar...,14
82,Risotto With Fresh Peas,chicken stock butter extra virgin olive oil on...,Melt butter in heavy large saucepan over mediu...,italian,,,,chicken stock butter extra virgin olive oil on...,12
83,Moules Marinière (Mussels with Garlic and Pars...,mussels butter leek shallot garlic bay leaves ...,"In a large saucepan with a lid, over medium he...",italian,,,,mussels butter leek shallot garlic bay leaves ...,14
84,Lemony Brown Rice With Baby Spinach,baby spinach brown rice green onions squeezes ...,<ol><li>Cook rice as directed.</li><li>Add spi...,italian,,,,baby spinach brown rice green onions squeezes ...,14


In [10]:
# Save the balanced dataset as a pickle file
balanced_df.to_pickle('balanced_recipes_dataset.pkl')

In [13]:
joblib.dump(balanced_df, 'balanced_data.pkl')

['balanced_data.pkl']