In [1]:
# ============================================================
# 0. Imports
# ============================================================

import builtins

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, round, max
from pyspark.sql.types import NumericType, StringType

# Close every open matplotlib figure before any plotting starts.
plt.close('all')
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set(style="whitegrid")

In [None]:
# ============================================================
# 1. Initialize Spark
# ============================================================

# Import SparkSession here only when the name is not already in the notebook
# namespace (i.e. the import cell was not run on this kernel).
if "SparkSession" not in globals():
    from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Indian Panorama Dataset")
    .getOrCreate()
)

In [None]:
# ============================================================
# 2. Load Dataset
# ============================================================

# Raw string literal: in a normal string the "\U" of "C:\Users" opens a
# \UXXXXXXXX unicode escape, which is a SyntaxError and prevents the whole
# cell from compiling.
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable data directory.
data_path = r"C:\Users\HP\OneDrive\Documents\Desktop\BDA\Indian_Panorama.csv"

# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Clean column names: strip dots and spaces from every column name.
for c in df.columns:
    if "." in c or " " in c:
        df = df.withColumnRenamed(c, c.replace(".", "").replace(" ", ""))

print("\n✅ DATA LOADED SUCCESSFULLY")
df.printSchema()
df.show(5)
print("Total Rows:", df.count())

In [None]:
# ============================================================
# 3. Clean Data
# ============================================================

# Remove rows containing any null value, then remove exact duplicate rows.
df = (
    df
    .dropna(how="any")
    .dropDuplicates()
)
print("\n✅ Cleaned Data — nulls & duplicates removed.")
print("Remaining Rows:", df.count())

In [None]:
# ============================================================
# 4. Identify Column Types
# ============================================================

# Classify columns from the Spark schema with isinstance checks.
# Comparing str(dataType) to "StringType" is fragile: newer PySpark releases
# (3.3+) render the type as "StringType()", so the string comparison matches
# nothing and every categorical plot is silently skipped.
categorical_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, StringType)]
numerical_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)]

print("\nCategorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

In [None]:
# ============================================================
# 5. Summary Statistics
# ============================================================

print("\n📊 Summary Statistics:")
# Print the table produced by Spark's DataFrame.describe() for the current df.
df.describe().show()

In [None]:
# ============================================================
# 6. Convert Spark → Pandas
# ============================================================

# Collect the Spark DataFrame into a pandas DataFrame on the driver.
pdf = df.toPandas()

# Cap the number of rows handed to the plotting code.
max_rows = 2000
# `builtins.min` is spelled out on purpose: the import cell pulls `max` and
# `round` from pyspark.sql.functions, shadowing the built-ins, so going through
# the `builtins` module keeps this line safe if `min` is ever added to that
# import. (The previous `_builtins_` was an undefined name and raised NameError.)
pdf_sample = pdf.sample(n=builtins.min(len(pdf), max_rows), random_state=42)

# Update column types after Pandas conversion
numerical_cols = [c for c in pdf_sample.columns if pd.api.types.is_numeric_dtype(pdf_sample[c])]
categorical_cols = [c for c in pdf_sample.columns if pdf_sample[c].dtype == 'object']

In [None]:
# ============================================================
# 7. Helper Function
# ============================================================

def safe_plot(plot_func):
    """Call ``plot_func`` and report, instead of raising, any ``Exception``.

    Args:
        plot_func: Zero-argument callable that draws a plot.

    Returns:
        None. The callable's own return value is discarded.
    """
    try:
        plot_func()
    except Exception as failure:
        # Print the reason and carry on so the remaining plots still run.
        print(f"⚠ Skipping plot due to error: {failure}")

In [None]:
# ============================================================
# 8. Visualization Gallery (12 Plot Types)
# ============================================================

# 1️ Line Plot
def plot_line():
    """Line plot of the second numeric column against the first.

    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    # plt.plot joins points in row order and the sample rows are in random
    # order, so sort by the x column first; otherwise the line is a scribble.
    ordered = pdf_sample.sort_values(x_col)
    plt.figure(figsize=(8,5))
    plt.plot(ordered[x_col], ordered[y_col], marker='o', color='orange')
    plt.title(f'Line Plot: {y_col} vs {x_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()
    plt.close()


safe_plot(plot_line)

In [None]:
# 2️ Bar Chart
def plot_bar():
    """seaborn bar plot: first categorical column on x, first numeric column on y.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(9,5))
    sns.barplot(x=group_col, y=value_col, data=pdf_sample, palette='coolwarm')
    plt.title(f'Bar Chart: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    plt.close()


safe_plot(plot_bar)

In [None]:
# 3️ Histogram
def plot_histogram():
    """Histogram (20 bins, with KDE curve) of the first numeric column.

    Does nothing when there is no numeric column.
    """
    if len(numerical_cols) < 1:
        return
    value_col = numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.histplot(pdf_sample[value_col], bins=20, kde=True, color='teal')
    plt.title(f'Distribution of {value_col}')
    plt.xlabel(value_col)
    plt.ylabel('Frequency')
    plt.show()
    plt.close()


safe_plot(plot_histogram)

In [None]:
# 4️ Scatter Plot
def plot_scatter():
    """Scatter plot of the first two numeric columns.

    Points are coloured by the first categorical column when one exists.
    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    hue_col = categorical_cols[0] if categorical_cols else None
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=pdf_sample, palette='Set2')
    plt.title(f'Scatter Plot: {x_col} vs {y_col}')
    plt.show()
    plt.close()


safe_plot(plot_scatter)

In [None]:
# 5️ Pie Chart
def plot_pie():
    """Pie chart of the value counts of the first categorical column.

    Does nothing when there is no categorical column.
    """
    if len(categorical_cols) < 1:
        return
    group_col = categorical_cols[0]
    plt.figure(figsize=(6,6))
    shares = pdf_sample[group_col].value_counts()
    shares.plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel'))
    plt.title(f'Pie Chart: Distribution of {group_col}')
    plt.ylabel('')
    plt.show()
    plt.close()


safe_plot(plot_pie)

In [None]:
# 6️ Box Plot
def plot_box():
    """Box plot of the first numeric column per value of the first categorical column.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.boxplot(x=group_col, y=value_col, data=pdf_sample, palette='Set3')
    plt.title(f'Box Plot: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.show()
    plt.close()


safe_plot(plot_box)

In [None]:
# 7️ Heatmap
def plot_heatmap():
    """Annotated heatmap of the correlation matrix of all numeric columns.

    Does nothing unless there is more than one numeric column.
    """
    if len(numerical_cols) <= 1:
        return
    plt.figure(figsize=(8,6))
    correlations = pdf_sample[numerical_cols].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
    plt.close()


safe_plot(plot_heatmap)

In [None]:
# 8️ Area Chart
def plot_area():
    """Area chart of the second numeric column over the first.

    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    # stackplot fills between consecutive x positions in row order, so the
    # randomly ordered sample must be sorted by x to give a meaningful area.
    ordered = pdf_sample.sort_values(x_col)
    plt.figure(figsize=(9,5))
    plt.stackplot(ordered[x_col], ordered[y_col], colors=['lightcoral'])
    plt.title(f'Area Chart: {y_col} vs {x_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    plt.close()


safe_plot(plot_area)

In [None]:
# 9️ Bubble Chart
def plot_bubble():
    """Scatter the first two numeric columns with bubbles sized by the third.

    Does nothing when fewer than three numeric columns are available.
    """
    if len(numerical_cols) < 3:
        return
    x_col, y_col, size_col = numerical_cols[0], numerical_cols[1], numerical_cols[2]
    # Marker areas have to be non-negative and of a drawable magnitude, but the
    # third numeric column is arbitrary (it may hold negatives or huge values).
    # Rescale it linearly onto 20-520 pt^2 instead of multiplying raw values by 10.
    raw = pdf_sample[size_col].astype(float)
    spread = raw.max() - raw.min()
    # A constant (or empty) column has no spread; use one uniform size then.
    sizes = 20 + 500 * (raw - raw.min()) / spread if spread > 0 else 120
    plt.figure(figsize=(8,6))
    plt.scatter(pdf_sample[x_col], pdf_sample[y_col], s=sizes, alpha=0.5, color='purple')
    plt.title(f'Bubble Chart: {x_col} vs {y_col} (size={size_col})')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    plt.close()


safe_plot(plot_bubble)

In [None]:
# 10 Violin Plot
def plot_violin():
    """Violin plot of the first numeric column per value of the first categorical column.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.violinplot(x=group_col, y=value_col, data=pdf_sample, palette='muted')
    plt.title(f'Violin Plot: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.show()
    plt.close()


safe_plot(plot_violin)

In [None]:
# 1️1️ Pair Plot
def plot_pair():
    """Pair plot of the first three numeric columns.

    Does nothing when fewer than three numeric columns are available.
    """
    if len(numerical_cols) < 3:
        return
    sns.pairplot(pdf_sample[numerical_cols[:3]], height=2.3)
    plt.suptitle("Pairwise Relationships", y=1.02)
    plt.show()
    # Release the figure after showing it, as every other plot in this
    # gallery does; the pair plot was the only one that left its figure open.
    plt.close()


safe_plot(plot_pair)

In [None]:
# 1️2️ 3D Scatter Plot
def plot_3d_scatter():
    """3D scatter plot of the first three numeric columns of the sample."""
    figure = plt.figure(figsize=(7,5))
    axes = figure.add_subplot(111, projection='3d')
    x_col = numerical_cols[0]
    y_col = numerical_cols[1]
    z_col = numerical_cols[2]
    axes.scatter(pdf_sample[x_col], pdf_sample[y_col], pdf_sample[z_col],
                 c='blue', s=50, alpha=0.6)
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    axes.set_zlabel(z_col)
    axes.set_title("3D Scatter Plot")
    plt.show()
    plt.close()


# Three numeric columns are required; otherwise a no-op is run instead.
if len(numerical_cols) >= 3:
    safe_plot(plot_3d_scatter)
else:
    safe_plot(lambda: None)

# ============================================================
#  Spark Session Stop (optional)
# ============================================================

# spark.stop()
# print("\n Spark session stopped successfully.")

In [None]:
# ============================================================
# 8. Visualization Gallery (12 Plot Types) - Robust Version
# ============================================================

# All imports this cell needs, unconditionally. Previously pandas, numpy and
# seaborn were imported only inside the fallback branch below, although
# `pd` and `sns` are used regardless of which branch runs.
import builtins

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.types import NumericType, StringType

plt.close('all')  # ensure clean start

# ---------------------------
# 3. Load Dataset
# ---------------------------
# The data has to exist before it is cleaned, summarised and converted, so
# loading comes first. An existing `df` from earlier cells is reused as is.
# Without one, the CSV is read; only if that read fails is a synthetic frame
# built so the gallery below can still be exercised.
try:
    df
except NameError:
    spark = SparkSession.builder.getOrCreate()
    data_path = "Indian_Panorama.csv"
    try:
        df = spark.read.csv(data_path, header=True, inferSchema=True)

        # Clean column names
        for c in df.columns:
            if "." in c or " " in c:
                df = df.withColumnRenamed(c, c.replace(".", "").replace(" ", ""))

        print("\n DATA LOADED SUCCESSFULLY")
        df.printSchema()
        df.show(5)
        print("Total Rows:", df.count())
    except Exception as e:
        print(f"⚠ Could not load {data_path} ({e}); using synthetic sample data instead.")
        # Dummy fallback: create a sample Spark DataFrame
        pd_df = pd.DataFrame({
            'Category': np.random.choice(['A', 'B', 'C'], size=100),
            'Value1': np.random.randn(100) * 10 + 50,
            'Value2': np.random.randn(100) * 5 + 20,
            'Value3': np.random.randint(1, 100, size=100)
        })
        df = spark.createDataFrame(pd_df)

# ---------------------------
# 4. Clean Data
# ---------------------------
df = df.dropna().dropDuplicates()
print("\n Cleaned Data — nulls & duplicates removed.")
print("Remaining Rows:", df.count())

# ---------------------------
# 5. Identify Column Types
# ---------------------------
# isinstance checks instead of comparing str(dataType): newer PySpark releases
# (3.3+) render the type as "StringType()", so the string comparison found
# no categorical columns at all.
categorical_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, StringType)]
numerical_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)]

print("\nCategorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

# ---------------------------
# 6. Summary Statistics
# ---------------------------
print("\n Summary Statistics:")
df.describe().show()

# ---------------------------
# 7. Convert Spark → Pandas
# ---------------------------
# Done last so the pandas sample reflects the cleaned data.
# Sample data if large
max_rows = 2000
pdf = df.toPandas()
pdf_sample = pdf.sample(n=builtins.min(len(pdf), max_rows), random_state=42)

# Recompute column types after Spark → Pandas conversion; these pandas-based
# lists are the ones the plots below read.
numerical_cols = [c for c in pdf_sample.columns if pd.api.types.is_numeric_dtype(pdf_sample[c])]
categorical_cols = [c for c in pdf_sample.columns if pdf_sample[c].dtype == 'object']

# Helper function to safely plot
def safe_plot(plot_func):
    """Call ``plot_func`` and report, instead of raising, any ``Exception``.

    Args:
        plot_func: Zero-argument callable that draws a plot.

    Returns:
        None. The callable's own return value is discarded.
    """
    try:
        plot_func()
    except Exception as failure:
        # Print the reason and carry on so the remaining plots still run.
        print(f"⚠ Skipping plot due to error: {failure}")

# 1️⃣ Line Plot
def plot_line():
    """Line plot of the second numeric column against the first.

    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    # plt.plot joins points in row order and the sample rows are in random
    # order, so sort by the x column first; otherwise the line is a scribble.
    ordered = pdf_sample.sort_values(x_col)
    plt.figure(figsize=(8,5))
    plt.plot(ordered[x_col], ordered[y_col], marker='o', color='orange')
    plt.title(f'Line Plot: {y_col} vs {x_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()
    plt.close()


safe_plot(plot_line)

# 2️⃣ Bar Chart
def plot_bar():
    """seaborn bar plot: first categorical column on x, first numeric column on y.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(9,5))
    sns.barplot(x=group_col, y=value_col, data=pdf_sample, palette='coolwarm')
    plt.title(f'Bar Chart: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    plt.close()


safe_plot(plot_bar)

# 3️⃣ Histogram
def plot_histogram():
    """Histogram (20 bins, with KDE curve) of the first numeric column.

    Does nothing when there is no numeric column.
    """
    if len(numerical_cols) < 1:
        return
    value_col = numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.histplot(pdf_sample[value_col], bins=20, kde=True, color='teal')
    plt.title(f'Distribution of {value_col}')
    plt.xlabel(value_col)
    plt.ylabel('Frequency')
    plt.show()
    plt.close()


safe_plot(plot_histogram)

# 4️⃣ Scatter Plot
def plot_scatter():
    """Scatter plot of the first two numeric columns.

    Points are coloured by the first categorical column when one exists.
    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    hue_col = categorical_cols[0] if categorical_cols else None
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=pdf_sample, palette='Set2')
    plt.title(f'Scatter Plot: {x_col} vs {y_col}')
    plt.show()
    plt.close()


safe_plot(plot_scatter)

# 5️⃣ Pie Chart
def plot_pie():
    """Pie chart of the value counts of the first categorical column.

    Does nothing when there is no categorical column.
    """
    if len(categorical_cols) < 1:
        return
    group_col = categorical_cols[0]
    plt.figure(figsize=(6,6))
    shares = pdf_sample[group_col].value_counts()
    shares.plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel'))
    plt.title(f'Pie Chart: Distribution of {group_col}')
    plt.ylabel('')
    plt.show()
    plt.close()


safe_plot(plot_pie)

# 6️⃣ Box Plot
def plot_box():
    """Box plot of the first numeric column per value of the first categorical column.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.boxplot(x=group_col, y=value_col, data=pdf_sample, palette='Set3')
    plt.title(f'Box Plot: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.show()
    plt.close()


safe_plot(plot_box)

# 7️⃣ Heatmap
def plot_heatmap():
    """Annotated heatmap of the correlation matrix of all numeric columns.

    Does nothing unless there is more than one numeric column.
    """
    if len(numerical_cols) <= 1:
        return
    plt.figure(figsize=(8,6))
    correlations = pdf_sample[numerical_cols].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
    plt.close()


safe_plot(plot_heatmap)

# 8️⃣ Area Chart
def plot_area():
    """Area chart of the second numeric column over the first.

    Does nothing when fewer than two numeric columns are available.
    """
    if len(numerical_cols) < 2:
        return
    x_col, y_col = numerical_cols[0], numerical_cols[1]
    # stackplot fills between consecutive x positions in row order, so the
    # randomly ordered sample must be sorted by x to give a meaningful area.
    ordered = pdf_sample.sort_values(x_col)
    plt.figure(figsize=(9,5))
    plt.stackplot(ordered[x_col], ordered[y_col], colors=['lightcoral'])
    plt.title(f'Area Chart: {y_col} vs {x_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    plt.close()


safe_plot(plot_area)

# 9️⃣ Bubble Chart
def plot_bubble():
    """Scatter the first two numeric columns with bubbles sized by the third.

    Does nothing when fewer than three numeric columns are available.
    """
    if len(numerical_cols) < 3:
        return
    x_col, y_col, size_col = numerical_cols[0], numerical_cols[1], numerical_cols[2]
    # Marker areas have to be non-negative and of a drawable magnitude, but the
    # third numeric column is arbitrary (it may hold negatives or huge values).
    # Rescale it linearly onto 20-520 pt^2 instead of multiplying raw values by 10.
    raw = pdf_sample[size_col].astype(float)
    spread = raw.max() - raw.min()
    # A constant (or empty) column has no spread; use one uniform size then.
    sizes = 20 + 500 * (raw - raw.min()) / spread if spread > 0 else 120
    plt.figure(figsize=(8,6))
    plt.scatter(pdf_sample[x_col], pdf_sample[y_col], s=sizes, alpha=0.5, color='purple')
    plt.title(f'Bubble Chart: {x_col} vs {y_col} (size={size_col})')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    plt.close()


safe_plot(plot_bubble)

# 🔟 Violin Plot
def plot_violin():
    """Violin plot of the first numeric column per value of the first categorical column.

    Does nothing unless at least one column of each kind is available.
    """
    if len(categorical_cols) < 1 or len(numerical_cols) < 1:
        return
    group_col, value_col = categorical_cols[0], numerical_cols[0]
    plt.figure(figsize=(8,5))
    sns.violinplot(x=group_col, y=value_col, data=pdf_sample, palette='muted')
    plt.title(f'Violin Plot: {value_col} by {group_col}')
    plt.xticks(rotation=45)
    plt.show()
    plt.close()


safe_plot(plot_violin)

# 1️⃣1️⃣ Pair Plot
def plot_pair():
    """Pair plot of the first three numeric columns.

    Does nothing when fewer than three numeric columns are available.
    """
    if len(numerical_cols) < 3:
        return
    sns.pairplot(pdf_sample[numerical_cols[:3]], height=2.3)
    plt.suptitle("Pairwise Relationships", y=1.02)
    plt.show()
    # Release the figure after showing it, as every other plot in this
    # gallery does; the pair plot was the only one that left its figure open.
    plt.close()


safe_plot(plot_pair)

# 1️⃣2️⃣ 3D Scatter Plot
def plot_3d_scatter():
    """3D scatter plot of the first three numeric columns of the sample."""
    figure = plt.figure(figsize=(7,5))
    axes = figure.add_subplot(111, projection='3d')
    x_col = numerical_cols[0]
    y_col = numerical_cols[1]
    z_col = numerical_cols[2]
    axes.scatter(pdf_sample[x_col], pdf_sample[y_col], pdf_sample[z_col],
                 c='blue', s=50, alpha=0.6)
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    axes.set_zlabel(z_col)
    axes.set_title("3D Scatter Plot")
    plt.show()
    plt.close()


# Three numeric columns are required; otherwise a no-op is run instead.
if len(numerical_cols) >= 3:
    safe_plot(plot_3d_scatter)
else:
    safe_plot(lambda: None)