In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Display basic info and summary statistics.
# NOTE(review): `df` is not created in this cell, and the recorded output ends in
# "NameError: name 'df' is not defined" -- the cell that loads the dataset must
# be run before this one (verify with Restart Kernel -> Run All).
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
print(df.describe())

# Plot histograms for numerical features.
# Only numeric columns are kept, so the list matches its name. The regression
# target is excluded, and so is the derived `is_insulator` label that this cell
# adds to `df` further down -- otherwise re-running the cell would silently
# add that label to the feature plots.
NON_FEATURE_COLS = {"PBE band gap", "is_insulator"}
num_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col not in NON_FEATURE_COLS
]

# Size the grid from the number of features instead of assuming at most 16
# (a fixed 4x4 grid makes plt.subplot raise on the 17th column).
HIST_GRID_COLS = 4
hist_grid_rows = max(1, (len(num_cols) + HIST_GRID_COLS - 1) // HIST_GRID_COLS)
# 3 inches of height per row reproduces the original 16x12 figure for 4 rows.
hist_fig, hist_axes = plt.subplots(
    hist_grid_rows,
    HIST_GRID_COLS,
    figsize=(16, 3 * hist_grid_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
hist_axes = hist_axes.ravel()
for ax, col in zip(hist_axes, num_cols):
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Histogram of {col}")
# Hide the unused trailing panels so no empty axes are drawn.
for ax in hist_axes[len(num_cols):]:
    ax.set_visible(False)
hist_fig.tight_layout()
plt.show()

# Plot boxplots for numerical features to check for outliers.
# Boxplots need numeric data, so restrict `num_cols` to numeric dtypes here
# rather than relying on how the list was built.
box_cols = list(df[num_cols].select_dtypes(include="number").columns)

# Size the grid from the number of features instead of assuming at most 16
# (a fixed 4x4 grid makes plt.subplot raise on the 17th column).
BOX_GRID_COLS = 4
box_grid_rows = max(1, (len(box_cols) + BOX_GRID_COLS - 1) // BOX_GRID_COLS)
# 3 inches of height per row reproduces the original 16x12 figure for 4 rows.
box_fig, box_axes = plt.subplots(
    box_grid_rows,
    BOX_GRID_COLS,
    figsize=(16, 3 * box_grid_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
box_axes = box_axes.ravel()
for ax, col in zip(box_axes, box_cols):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
# Hide the unused trailing panels so no empty axes are drawn.
for ax in box_axes[len(box_cols):]:
    ax.set_visible(False)
box_fig.tight_layout()
plt.show()

# Distribution of the regression target (PBE band gap): 30-bin histogram
# with a KDE curve overlaid.
target_fig, target_ax = plt.subplots(figsize=(8, 6))
band_gap = df["PBE band gap"]
sns.histplot(band_gap, kde=True, bins=30, ax=target_ax)
target_ax.set_title("Distribution of PBE Band Gap")
target_ax.set_xlabel("PBE Band Gap (eV)")
target_ax.set_ylabel("Frequency")
plt.show()

# Create a correlation matrix heatmap for numerical features.
# Restrict to numeric columns first: with pandas >= 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises if any are present.
numeric_df = df.select_dtypes(include="number")
correlation_matrix = numeric_df.corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

# Pairwise relationship example: Ionization Energy against PBE band gap.
# The plot is skipped entirely when the dataset has no ionization-energy column.
ie_col = "Ionization Energy (IE)"
if ie_col in df.columns:
    scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=df, x=ie_col, y="PBE band gap", ax=scatter_ax)
    scatter_ax.set_title("Ionization Energy vs. PBE Band Gap")
    scatter_ax.set_xlabel("Ionization Energy (IE)")
    scatter_ax.set_ylabel("PBE Band Gap (eV)")
    plt.show()

# Check class distribution for the classification task (insulator vs non-insulator).
# Single source of truth for the cut-off, used by both the label and the tick text.
INSULATOR_GAP_THRESHOLD_EV = 0.5

# Adds a binary label column to `df` (1 = band gap at or above the threshold).
# Rows with a missing band gap compare as False and therefore land in class 0.
df['is_insulator'] = (df["PBE band gap"] >= INSULATOR_GAP_THRESHOLD_EV).astype(int)

class_fig, class_ax = plt.subplots(figsize=(6, 4))
# Pin the category order so position 0 is always class 0 and position 1 is
# always class 1. Without it, a dataset containing only one class draws a
# single bar at position 0 and the fixed tick labels below would mislabel it.
sns.countplot(x='is_insulator', data=df, order=[0, 1], ax=class_ax)
class_ax.set_title("Class Distribution: Insulator (1) vs Non-Insulator (0)")
class_ax.set_xlabel("Is Insulator")
class_ax.set_ylabel("Count")
class_ax.set_xticks([0, 1])
class_ax.set_xticklabels([
    f"Non-Insulator (<{INSULATOR_GAP_THRESHOLD_EV} eV)",
    f"Insulator (>={INSULATOR_GAP_THRESHOLD_EV} eV)",
])
plt.show()


Dataset Info:


NameError: name 'df' is not defined