In [None]:
# Install missing libraries
!pip install wordcloud missingno plotly --quiet

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from wordcloud import WordCloud
import plotly.express as px

# Read the recipe/nutrient CSV from its hard-coded /content location
# (upload the file there first if it is not already present).
csv_path = '/content/70000_recipes_nutrients.csv'
df = pd.read_csv(csv_path)

# Report the frame's dimensions, then show the first rows as the cell output
print("Shape:", df.shape)
df.head()


In [None]:
# Report the dtype of every column (column names are not yet stripped here)
column_types = df.dtypes
print("\n🔹 Data types:\n", column_types)

# Remove leading/trailing whitespace from the column names
df.columns = df.columns.str.strip()

# Count the null entries in each column
null_counts = df.isna().sum()
print("\n🔹 Missing values:\n", null_counts)

# Draw the missingno matrix view of the frame
msno.matrix(df)
plt.show()

# Discard rows that are exact duplicates of another row
df = df.drop_duplicates()


In [None]:
# Summary statistics: with its default arguments describe() summarises the
# numeric columns (count, mean, std, min, quartiles, max). Being the last
# expression in the cell, the resulting table is displayed as the cell output.
df.describe()


In [None]:
# Nutrient columns that the later cells iterate over
cols = ['calories', 'protein', 'fat', 'carbohydrates']

# One histogram with a KDE overlay per nutrient, each drawn on its own figure
for nutrient in cols:
    hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[nutrient], kde=True, bins=30, color='green', ax=hist_ax)
    hist_ax.set_title(f'Distribution of {nutrient}')
    hist_ax.grid(True)
    plt.show()


In [None]:
# One wide, short horizontal boxplot per nutrient so outliers are easy to spot
for nutrient in cols:
    box_fig, box_ax = plt.subplots(figsize=(6, 1.5))
    sns.boxplot(data=df, x=nutrient, color='orange', ax=box_ax)
    box_ax.set_title(f'Boxplot for {nutrient}')
    box_ax.grid(True)
    plt.show()


In [None]:
# Correlation matrix of the nutrient columns, drawn as an annotated heatmap
nutrient_corr = df[cols].corr()
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(nutrient_corr, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title("Correlation Heatmap")
plt.show()

# Pairplot: scatter plots off the diagonal, distributions on it.
# The title is placed slightly above the grid (y=1.02).
nutrient_frame = df[cols]
sns.pairplot(nutrient_frame)
plt.suptitle("Pairwise Relationships", y=1.02)
plt.show()


In [None]:
# Compute nutrient density: amount of each nutrient per 100 calories.
# A recipe with 0 calories would divide by zero and yield inf, which
# corrupts the box plot's statistics and axis limits. Zero calories are
# therefore treated as missing, so those rows get NaN densities instead.
calories_nonzero = df['calories'].replace(0, np.nan)
df['protein_density'] = df['protein'] / calories_nonzero * 100
df['fat_density'] = df['fat'] / calories_nonzero * 100
df['carb_density'] = df['carbohydrates'] / calories_nonzero * 100

# Density boxplots: one box per density column on a shared axis
density_cols = ['protein_density', 'fat_density', 'carb_density']
df[density_cols].plot(kind='box', figsize=(8, 5), title="Nutrient Density")
plt.grid(True)
plt.show()


In [None]:
# For every nutrient, print the five recipes with the highest value
for nutrient in cols:
    ranked = df[['recipe_name', nutrient]].sort_values(by=nutrient, ascending=False)
    top_five = ranked.head(5)
    print(f"\n🔝 Top 5 recipes by {nutrient}:\n", top_five)


In [None]:
# Join all ingredient text into one space-separated corpus.
# Missing entries are dropped first: astype(str) would otherwise turn each
# NaN into the literal word "nan", which would then appear in the cloud.
ingredient_text = df['ingredients'].dropna().astype(str)
text = ' '.join(ingredient_text)

# Generate the word cloud from the corpus and display it without axes
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Common Ingredients")
plt.show()


In [None]:
# Order the recipes from highest to lowest calorie count
df_sorted = df.sort_values(by='calories', ascending=False)

# Grouped bar chart of the four nutrient columns for the ten top-calorie recipes
top_ten = df_sorted.head(10)
fig = px.bar(
    top_ten,
    x='recipe_name',
    y=['calories', 'protein', 'fat', 'carbohydrates'],
    title="Top 10 Recipes by Nutrients",
    barmode='group',
)
fig.show()


In [None]:
# Work on the nutrient columns only
nutrient_frame = df[cols]

# Correlation of each nutrient with calories, largest coefficient first
calorie_corr = nutrient_frame.corr()['calories'].sort_values(ascending=False)
print("Correlation with Calories:\n", calorie_corr)

# Skewness of each nutrient column
nutrient_skew = nutrient_frame.skew()
print("\nSkewness:")
print(nutrient_skew)

# Kurtosis of each nutrient column
nutrient_kurtosis = nutrient_frame.kurtosis()
print("\nKurtosis:")
print(nutrient_kurtosis)
