In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import parallel_coordinates
import matplotlib.pyplot as plt
import seaborn as sns
import os
import builtins
import re

from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [3]:
def additional_data_preprocessing(df):
    """Drop bookkeeping columns, impute, add anisotropy, scale and label-encode.

    Steps, in order:

    1. Drop whichever of ``svnrevision``, ``date``, ``time``, ``onhold``,
       ``duplicateof`` and ``year`` are present.
    2. Fill missing numeric values with the column median and missing
       object values with the string ``"Unknown"``.
    3. If both ``a`` and ``c`` exist, add
       ``cell_anisotropy = |a - c| / (a + c)``.
    4. Standardize whichever of ``a``, ``b``, ``c``, ``alpha``, ``beta``,
       ``gamma``, ``vol`` and ``cell_anisotropy`` are present with
       ``StandardScaler``. After this step those columns hold z-scores
       rather than their original values.
    5. For every object column except ``file``, add ``<col>_encoded`` with
       the ``LabelEncoder`` codes of the stringified values. The codes are
       arbitrary label indices, not meaningful numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.
    """
    columns_to_drop = ["svnrevision", "date", "time", "onhold", "duplicateof", "year"]
    # Work on an independent copy so the caller's frame is never mutated.
    df = df.drop(columns=columns_to_drop, errors="ignore").copy()

    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median())

    for col in df.select_dtypes(include=[object]).columns:
        df[col] = df[col].fillna("Unknown")

    if all(col in df.columns for col in ["a", "c"]):
        # A zero denominator would produce +/-inf, which StandardScaler
        # rejects; treat those rows as missing instead.
        denominator = (df["a"] + df["c"]).replace(0, np.nan)
        df["cell_anisotropy"] = np.abs(df["a"] - df["c"]) / denominator

    numerical_cols = ["a", "b", "c", "alpha", "beta", "gamma", "vol", "cell_anisotropy"]
    numerical_cols = [col for col in numerical_cols if col in df.columns]
    # StandardScaler raises on zero features or zero samples, so only scale
    # when there is something to scale.
    if numerical_cols and len(df) > 0:
        df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])

    encode_cols = [col for col in df.select_dtypes(include='object').columns if col not in ['file']]
    for col in encode_cols:
        encoded_col_name = f"{col}_encoded"
        try:
            df[encoded_col_name] = LabelEncoder().fit_transform(df[col].astype(str))
        except Exception:
            # Best-effort: keep processing and flag the whole column with -1.
            df[encoded_col_name] = -1

    return df

In [4]:
def preprocess_dataframe(df, missing_threshold=0.5):
    """Clean a raw frame and run ``additional_data_preprocessing`` on it.

    Steps, in order:

    1. Drop columns, then rows, that are entirely missing.
    2. Drop every column whose missing fraction exceeds
       ``missing_threshold``, except ``file``, which is always kept.
    3. Fill missing numeric values with the column median and missing
       object values with the column mode.
    4. Pass the result through ``additional_data_preprocessing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. It is not modified.
    missing_threshold : float, default 0.5
        Columns with a larger fraction of missing values are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned and further-processed frame.
    """
    df_clean = df.copy()
    df_clean = df_clean.dropna(axis=1, how='all')
    df_clean = df_clean.dropna(axis=0, how='all')

    missing_fraction = df_clean.isnull().mean()
    cols_to_keep = ['file']
    cols_to_drop = [
        col
        for col in missing_fraction[missing_fraction > missing_threshold].index
        if col not in cols_to_keep
    ]
    df_clean = df_clean.drop(columns=cols_to_drop)

    # Assign the filled column back: `df[col].fillna(..., inplace=True)` acts
    # on a temporary object and does not update the frame under pandas
    # Copy-on-Write.
    for col in df_clean.select_dtypes(include=[np.number]).columns:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    for col in df_clean.select_dtypes(include=['object']).columns:
        modes = df_clean[col].mode()
        # mode() is empty when the column has no non-missing value; leave
        # such a column for the downstream "Unknown" fill.
        if not modes.empty:
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return additional_data_preprocessing(df_clean)

In [5]:
def plot_limited_categorical_distributions_log(df, max_columns=5, top_n=10):
    """Draw a log-scale bar chart of the most frequent values per object column.

    Only the first ``max_columns`` object-dtype columns are plotted, one
    figure each, showing the ``top_n`` most frequent values.
    """
    object_columns = df.select_dtypes(include='object').columns
    for name in object_columns[:max_columns]:
        frequencies = df[name].value_counts()
        top_counts = frequencies.sort_values(ascending=False).head(top_n)

        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(x=top_counts.index, y=top_counts.values, palette='viridis', ax=ax)
        ax.set_yscale('log')
        ax.set_title(f"Top {top_n} Values in '{name}' (Log Scale)")
        ax.set_xlabel(name)
        ax.set_ylabel("Log Count")
        # Slant the category labels so long values stay readable.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        plt.show()

In [None]:
def plot_correlation_heatmap(df, title="Correlation of Physical Properties", figsize=(32, 20)):
    """Plot an annotated heatmap of the pairwise correlations of numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; non-numeric columns are ignored.
    title : str
        Figure title.
    figsize : tuple, default (32, 20)
        Figure size passed to ``plt.figure``.

    Returns
    -------
    None
        Nothing is drawn when ``df`` has no numeric columns or no rows.
    """
    # np.number selects every numeric width, matching the selection used by
    # the preprocessing functions in this notebook.
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.empty:
        # sns.heatmap cannot render an empty matrix.
        return

    corr = numeric_df.corr()

    plt.figure(figsize=figsize)
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.show()

def plot_scatter_matrix(df, columns, title="Scatter Plot Matrix of Selected Properties"):
    """Show a lower-triangle seaborn pair plot of ``df[columns]`` under ``title``."""
    point_style = {'alpha': 0.6}
    sns.pairplot(df[columns], plot_kws=point_style, corner=True)
    # y > 1 lifts the title above the top row of panels.
    plt.suptitle(title, fontsize=16, y=1.02)
    plt.show()

def plot_anisotropy_vs_volume(df):
    """Scatter ``vol`` against ``cell_anisotropy``.

    Does nothing when either column is missing from ``df``.
    """
    if "cell_anisotropy" not in df.columns or "vol" not in df.columns:
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x="cell_anisotropy", y="vol", alpha=0.6, ax=ax)
    ax.set_title("Cell Anisotropy vs Volume", fontsize=14)
    ax.set_xlabel("Cell Anisotropy")
    ax.set_ylabel("Volume (Å³)")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

def plot_angle_distributions(df):
    """Box-plot the ``alpha``, ``beta`` and ``gamma`` columns side by side.

    Raises ``KeyError`` if any of the three columns is missing.
    """
    # Long format: one row per (angle name, value) pair.
    long_form = df[["alpha", "beta", "gamma"]].melt(var_name="Angle", value_name="Degrees")

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=long_form, x="Angle", y="Degrees", palette='Set2', ax=ax)
    ax.set_title("Distribution of Unit Cell Angles", fontsize=14)
    ax.grid(True, axis='y')
    fig.tight_layout()
    plt.show()

def plot_normalized_axis_ratios(df, xlim=(-400, 400)):
    """Plot kernel density estimates of the ``b/a`` and ``c/a`` ratios.

    Ratios that are not finite (for example where ``a`` is zero) are
    discarded before plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``a``, ``b`` and ``c``; otherwise nothing is drawn.
        It is not modified.
    xlim : tuple or None, default (-400, 400)
        X-axis limits. ``None`` leaves the automatic limits in place.

    Returns
    -------
    None
        Nothing is drawn when the columns are missing or no finite ratio
        remains.
    """
    if not all(col in df.columns for col in ["a", "b", "c"]):
        return

    # Build only the two ratio columns instead of copying the whole frame.
    ratios = pd.DataFrame({
        "b/a": df["b"] / df["a"],
        "c/a": df["c"] / df["a"],
    }).replace([np.inf, -np.inf], np.nan)
    finite = {name: ratios[name].dropna() for name in ratios.columns}
    if all(series.empty for series in finite.values()):
        return

    plt.figure(figsize=(10, 6))
    for name, series in finite.items():
        if not series.empty:
            sns.kdeplot(series, fill=True, label=name, linewidth=2)

    if xlim is not None:
        plt.xlim(*xlim)
    plt.title("Normalized Axis Ratios", fontsize=14)
    plt.xlabel("Ratio")
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_volume_vs_angles(df):
    """Scatter ``vol`` against ``alpha``, ``beta`` and ``gamma`` in three panels."""
    _, panels = plt.subplots(1, 3, figsize=(18, 6))
    for panel, angle in zip(panels, ('alpha', 'beta', 'gamma')):
        sns.scatterplot(data=df, x=angle, y='vol', alpha=0.6, ax=panel)
        panel.set_title(f'{angle} vs Volume')
        panel.grid(True)
    plt.tight_layout()
    plt.show()

def plot_symmetry_vs_anisotropy(df, min_samples=10):
    """Box-plot ``cell_anisotropy`` per ``sg_encoded`` value on a log y-axis.

    Only groups with at least ``min_samples`` rows are shown. Does nothing
    when either column is missing. Non-positive anisotropy values cannot be
    drawn on the log axis.
    """
    if 'sg_encoded' not in df.columns or 'cell_anisotropy' not in df.columns:
        return

    group_sizes = df['sg_encoded'].value_counts()
    frequent_groups = group_sizes.loc[group_sizes >= min_samples].index
    subset = df.loc[df['sg_encoded'].isin(frequent_groups)]

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x='sg_encoded', y='cell_anisotropy', data=subset, ax=ax)
    ax.set_yscale('log')
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_title("Anisotropy by Space Group (Log Scale, Filtered)")
    ax.grid(True, axis='y')
    fig.tight_layout()
    plt.show()

In [None]:
def plot_axis_ratio_vs_symmetry(df, top_n=15):
    """Box-plot ``b/a`` and ``c/a`` for the ``top_n`` most frequent ``sg_encoded`` values.

    ``a``, ``b`` and ``c`` are coerced to numbers; rows where they are
    missing, where ``a`` is zero, or where a ratio is not finite are
    discarded. The input frame is not modified.
    """
    cell = df.copy()
    edge_names = ['a', 'b', 'c']

    for name in edge_names:
        cell[name] = pd.to_numeric(cell[name], errors='coerce')
    cell = cell.dropna(subset=edge_names)
    cell = cell[cell['a'] != 0]
    cell = cell.assign(**{
        'b/a': cell['b'] / cell['a'],
        'c/a': cell['c'] / cell['a'],
    })
    cell = cell.replace([np.inf, -np.inf], np.nan)
    cell = cell.dropna(subset=['b/a', 'c/a'])

    most_common = cell['sg_encoded'].value_counts().nlargest(top_n).index
    cell = cell[cell['sg_encoded'].isin(most_common)]

    # One panel per ratio, left to right.
    _, panels = plt.subplots(1, 2, figsize=(16, 6))
    panel_specs = (
        ('b/a', "b/a Ratio by Space Group"),
        ('c/a', "c/a Ratio by Space Group"),
    )
    for panel, (ratio, heading) in zip(panels, panel_specs):
        sns.boxplot(data=cell, x='sg_encoded', y=ratio, ax=panel)
        panel.set_title(heading)
        panel.tick_params(axis='x', rotation=90)
        panel.grid(True)

    plt.tight_layout()
    plt.show()

def plot_volume_distribution_by_symmetry(df, top_n=20, y_max=10):
    """Box-plot ``vol`` for the ``top_n`` most frequent ``sg_encoded`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sg_encoded`` and ``vol``.
    top_n : int, default 20
        Number of most frequent ``sg_encoded`` groups to show.
    y_max : float, default 10
        Upper y-axis limit; the axis runs from 0 to ``y_max`` and the title
        reports the same value.
    """
    top_groups = df['sg_encoded'].value_counts().nlargest(top_n).index
    df_top = df[df['sg_encoded'].isin(top_groups)]

    plt.figure(figsize=(14, 8))
    sns.boxplot(data=df_top, x='sg_encoded', y='vol')
    plt.xticks(rotation=90)
    plt.ylim(0, y_max)
    # The title is built from y_max so it always matches the actual axis
    # limit (it previously said 20 while the axis stopped at 10).
    plt.title(f"Volume Distribution for Top Space Groups (Clipped at {y_max})")
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_crystal_density_vs_volume(df, top_n=15):
    """Box-plot ``1 / vol`` for the ``top_n`` most frequent ``sg_encoded`` values.

    Only rows with ``vol > 0`` are used. Does nothing when ``vol`` or
    ``sg_encoded`` is missing. The y-axis is logarithmic.
    """
    if 'vol' not in df.columns or 'sg_encoded' not in df.columns:
        return

    positive = df.loc[df['vol'] > 0].copy()
    positive['density'] = 1 / positive['vol']

    leading_groups = positive['sg_encoded'].value_counts().nlargest(top_n).index
    shown = positive[positive['sg_encoded'].isin(leading_groups)]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=shown, x='sg_encoded', y='density', ax=ax)
    ax.set_yscale('log')
    ax.set_title(f"Inverse Volume (Density Proxy) by Top {top_n} Space Groups")
    ax.set_xlabel("Space Group (Encoded)")
    ax.set_ylabel("1 / Volume")
    ax.grid(True, axis='y')
    fig.tight_layout()
    plt.show()

In [10]:
def plot_volume_vs_cell_shape_factor(df):
    """Scatter the shape factor ``(a * b * c) / vol`` against ``vol``.

    Infinite values are turned into missing values and rows without a shape
    factor are dropped. Does nothing unless ``a``, ``b``, ``c`` and ``vol``
    are all present. The y-axis is logarithmic.
    """
    needed = ['a', 'b', 'c', 'vol']
    if any(name not in df.columns for name in needed):
        return

    shaped = df.copy()
    edge_product = shaped['a'] * shaped['b'] * shaped['c']
    shaped['shape_factor'] = edge_product / shaped['vol']
    shaped = shaped.replace([np.inf, -np.inf], np.nan)
    shaped = shaped.dropna(subset=['shape_factor'])

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=shaped, x='vol', y='shape_factor', alpha=0.6, ax=ax)
    ax.set_title("Volume vs Cell Shape Factor")
    ax.set_xlabel("Volume (Å³)")
    ax.set_ylabel("(a·b·c) / Volume")
    ax.set_yscale('log')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [11]:
def plot_angle_correlation_heatmap(df):
    """Plot the correlation matrix of ``alpha``, ``beta`` and ``gamma``.

    Does nothing when any of the three columns is missing. The colour scale
    is fixed to the range -1 to 1.
    """
    angle_names = ['alpha', 'beta', 'gamma']
    if any(name not in df.columns for name in angle_names):
        return

    angle_corr = df[angle_names].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(angle_corr, annot=True, cmap='RdBu', fmt=".2f", vmin=-1, vmax=1, ax=ax)
    ax.set_title("Correlation Between Unit Cell Angles")
    fig.tight_layout()
    plt.show()

In [None]:
def plot_anisotropy_distribution(df):
    """Histogram (50 bins, with KDE) of ``cell_anisotropy`` on a log count axis.

    Does nothing when the column is missing.
    """
    if 'cell_anisotropy' not in df.columns:
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        df['cell_anisotropy'],
        bins=50,
        kde=True,
        log_scale=(False, True),  # linear x, logarithmic y
        color='orange',
        ax=ax,
    )
    ax.set_title("Distribution of Cell Anisotropy (Log-Scaled Y-Axis)")
    ax.set_xlabel("Cell Anisotropy")
    ax.set_ylabel("Count (Log Scale)")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(r"./data/COD-selection.csv")
# preprocess_dataframe works on a copy, so `df` keeps the raw values.
df_processed = preprocess_dataframe(df)
# Summary statistics for every column, numeric or not.
df_processed.describe(include='all')

Unnamed: 0,file,a,siga,b,sigb,c,sigc,alpha,beta,sigbeta,...,cellformula_encoded,authors_encoded,title_encoded,journal_encoded,issue_encoded,firstpage_encoded,lastpage_encoded,doi_encoded,radType_encoded,flags_encoded
count,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,...,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0,11328.0
unique,,,,,,,,,,,...,,,,,,,,,,
top,,,,,,,,,,,...,,,,,,,,,,
freq,,,,,,,,,,,...,,,,,,,,,,
mean,5626422.0,1.631032e-16,0.001605,1.37641e-16,0.001538,1.173536e-16,0.00192,-6.257813e-16,-1.912234e-15,0.005113,...,5212.272334,1902.979167,2046.044227,28.000706,61.429643,1651.125,1629.84684,1904.02101,11.991879,0.695798
std,2640153.0,1.000044,0.010827,1.000044,0.008077,1.000044,0.008462,1.000044,1.000044,0.021677,...,2990.308789,1110.585971,1116.450069,15.754506,41.037778,913.955079,900.51796,1098.559669,5.937993,0.955241
min,1569774.0,-1.35674,1.2e-05,-1.72011,1.4e-05,-1.764257,1.5e-05,-3.995312,-3.317381,0.0003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2023027.0,-0.6088665,0.0003,-0.6496436,0.0003,-0.6336162,0.0004,0.01874866,-0.5285878,0.002,...,2624.75,933.75,1120.0,18.0,28.0,946.0,938.0,967.75,4.0,0.0
50%,7133884.0,-0.263098,0.0006,-0.2071389,0.0006,-0.1752117,0.0008,0.01874866,-0.1570563,0.003,...,5238.5,1890.0,2081.5,26.0,56.0,1519.0,1505.0,1945.5,15.0,0.0
75%,7714795.0,0.2820521,0.0012,0.4048113,0.0013,0.4046598,0.0016,0.01874866,0.621295,0.004,...,7789.25,2845.0,3027.0,27.0,89.0,2401.0,2373.0,2836.25,15.0,2.0


In [16]:
# Dtypes and non-null counts of the raw frame as loaded from the CSV.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11328 entries, 0 to 11327
Data columns (total 73 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   file              11328 non-null  int64  
 1   a                 11328 non-null  float64
 2   siga              11282 non-null  float64
 3   b                 11328 non-null  float64
 4   sigb              11251 non-null  float64
 5   c                 11328 non-null  float64
 6   sigc              11282 non-null  float64
 7   alpha             11328 non-null  float64
 8   sigalpha          3250 non-null   float64
 9   beta              11328 non-null  float64
 10  sigbeta           8843 non-null   float64
 11  gamma             11328 non-null  float64
 12  siggamma          3253 non-null   float64
 13  vol               11328 non-null  float64
 14  sigvol            11283 non-null  float64
 15  celltemp          11288 non-null  float64
 16  sigcelltemp       8025 non-null   float6

In [15]:
# Dtypes and non-null counts after preprocessing, to compare with the raw frame.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11328 entries, 0 to 11327
Data columns (total 57 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   file                 11328 non-null  int64  
 1   a                    11328 non-null  float64
 2   siga                 11328 non-null  float64
 3   b                    11328 non-null  float64
 4   sigb                 11328 non-null  float64
 5   c                    11328 non-null  float64
 6   sigc                 11328 non-null  float64
 7   alpha                11328 non-null  float64
 8   beta                 11328 non-null  float64
 9   sigbeta              11328 non-null  float64
 10  gamma                11328 non-null  float64
 11  vol                  11328 non-null  float64
 12  sigvol               11328 non-null  float64
 13  celltemp             11328 non-null  float64
 14  sigcelltemp          11328 non-null  float64
 15  diffrtemp            11328 non-null 

In [None]:
def plot_lattice_system_vs_anisotropy(df, sg_col='sg_encoded', ylim=(-2, 4)):
    """Box-plot ``cell_anisotropy`` grouped by lattice system.

    The lattice system of each row is looked up from ``df[sg_col]`` with the
    space-group number ranges below; values outside 1-230 map to
    ``"Unknown"``.

    NOTE(review): the lookup is only meaningful when ``sg_col`` holds
    space-group numbers (1-230). The default ``sg_encoded`` column is
    presumably written by the LabelEncoder step of
    ``additional_data_preprocessing``, in which case its values are
    arbitrary label indices and the grouping is not physically meaningful.
    Pass the name of a real space-group-number column via ``sg_col`` if the
    data has one -- confirm against the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sg_col`` and ``cell_anisotropy``; otherwise nothing
        is drawn. It is not modified.
    sg_col : str, default 'sg_encoded'
        Column holding the space-group number.
    ylim : tuple, default (-2, 4)
        Y-axis limits (linear scale).
    """
    lattice_map = {
        "Triclinic": range(1, 3),
        "Monoclinic": range(3, 16),
        "Orthorhombic": range(16, 75),
        "Tetragonal": range(75, 143),
        "Trigonal": range(143, 168),
        "Hexagonal": range(168, 195),
        "Cubic": range(195, 231)
    }

    def get_lattice_system(sg_number):
        for system, sg_range in lattice_map.items():
            if sg_number in sg_range:
                return system
        return "Unknown"

    if sg_col not in df.columns or 'cell_anisotropy' not in df.columns:
        return

    # Only the two plotted columns are needed; avoid copying the whole frame.
    plot_df = pd.DataFrame({
        'lattice_system': df[sg_col].apply(get_lattice_system),
        'cell_anisotropy': df['cell_anisotropy'],
    })
    # Keep a stable, conventional category order and skip absent systems.
    present = set(plot_df['lattice_system'])
    order = [name for name in [*lattice_map, "Unknown"] if name in present]

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=plot_df, x='lattice_system', y='cell_anisotropy', order=order)
    plt.ylim(ylim)
    # The axis is linear, so the title no longer claims a log scale.
    plt.title("Anisotropy by Lattice System")
    plt.xlabel("Lattice System")
    plt.ylabel("Cell Anisotropy")
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_kde_volume_vs_anisotropy(df):
    """Filled 2-D KDE of ``vol`` against ``cell_anisotropy``.

    Only rows with ``0 < vol <= 10`` and ``cell_anisotropy >= 0`` are used.
    Raises ``KeyError`` if either column is missing.
    """
    volume_in_range = (df['vol'] > 0) & (df['vol'] <= 10)
    anisotropy_ok = df['cell_anisotropy'] >= 0
    selected = df[volume_in_range & anisotropy_ok].copy()

    fig, ax = plt.subplots(figsize=(10, 6))
    kde_options = dict(
        fill=True,
        cmap='magma',
        levels=100,
        thresh=0.01,
        cbar=True,
    )
    sns.kdeplot(data=selected, x='cell_anisotropy', y='vol', ax=ax, **kde_options)

    ax.set_title("KDE Density of Volume vs Anisotropy (Volume ≤ 10 Å³)")
    ax.set_xlabel("Cell Anisotropy")
    ax.set_ylabel("Volume (Å³)")
    ax.set_ylim(0, 10)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
def plot_angle_distribution_by_lattice(df, sg_col='sg_encoded'):
    """Violin-plot ``alpha``, ``beta`` and ``gamma`` split by lattice system.

    The lattice system of each row is looked up from ``df[sg_col]`` with the
    space-group number ranges below; values outside 1-230 map to
    ``"Unknown"``.

    NOTE(review): the lookup is only meaningful when ``sg_col`` holds
    space-group numbers (1-230). The default ``sg_encoded`` column is
    presumably written by the LabelEncoder step of
    ``additional_data_preprocessing``, in which case its values are
    arbitrary label indices. Pass the name of a real space-group-number
    column via ``sg_col`` if the data has one -- confirm against the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sg_col``, ``alpha``, ``beta`` and ``gamma``
        (``KeyError`` otherwise). It is not modified.
    sg_col : str, default 'sg_encoded'
        Column holding the space-group number.
    """
    lattice_map = {
        "Triclinic": range(1, 3),
        "Monoclinic": range(3, 16),
        "Orthorhombic": range(16, 75),
        "Tetragonal": range(75, 143),
        "Trigonal": range(143, 168),
        "Hexagonal": range(168, 195),
        "Cubic": range(195, 231)
    }

    def get_lattice_system(sg_number):
        for system, sg_range in lattice_map.items():
            if sg_number in sg_range:
                return system
        return "Unknown"

    angle_cols = ['alpha', 'beta', 'gamma']
    # Copy only the columns that are melted rather than the whole frame.
    df_angles = df[angle_cols].copy()
    df_angles['lattice_system'] = df[sg_col].apply(get_lattice_system)

    df_melted = df_angles.melt(id_vars='lattice_system', value_vars=angle_cols,
                               var_name='Angle', value_name='Degrees')

    # Keep a stable, conventional hue order and skip absent systems.
    present = set(df_angles['lattice_system'])
    hue_order = [name for name in [*lattice_map, "Unknown"] if name in present]

    plt.figure(figsize=(14, 6))
    sns.violinplot(data=df_melted, x='Angle', y='Degrees', hue='lattice_system',
                   hue_order=hue_order, inner='quartile')
    plt.title("Distribution of Unit Cell Angles by Lattice System")
    plt.ylabel("Angle (Degrees)")
    plt.xlabel("Angle Type")
    plt.legend(title="Lattice System", bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [113]:
# Box plot of cell_anisotropy per lattice system, looked up from sg_encoded.
# NOTE(review): sg_encoded appears to hold LabelEncoder codes, not
# space-group numbers -- confirm the grouping is meaningful.
plot_lattice_system_vs_anisotropy(df_processed)

In [114]:
# Log-scale bar charts of the 10 most frequent values in the first 4 object columns.
plot_limited_categorical_distributions_log(df_processed, max_columns=4, top_n=10)

In [115]:
# Leave the `file` column out of the correlation matrix.
df_numeric = df_processed.drop(columns=["file"], errors='ignore')
# Keep float and int columns only (this includes the *_encoded label codes).
df_numeric = df_numeric.select_dtypes(include=[float, int])
plot_correlation_heatmap(df_numeric)

In [116]:
# Pairwise scatter matrix of the cell lengths, volume and anisotropy.
# NOTE(review): these columns were standardized during preprocessing, so the
# axes show z-scores.
plot_scatter_matrix(df_processed, columns=["a", "b", "c", "vol", "cell_anisotropy"])

In [117]:
# Scatter of vol against cell_anisotropy (both standardized in df_processed).
plot_anisotropy_vs_volume(df_processed)

In [118]:

# KDE of the b/a and c/a ratios.
# NOTE(review): a, b and c are standardized in df_processed, so these are
# ratios of z-scores rather than of the original cell lengths.
plot_normalized_axis_ratios(df_processed)

In [119]:
# Box plots of alpha, beta and gamma.
# NOTE(review): the angles are standardized in df_processed, so the axis
# labelled "Degrees" actually shows z-scores.
plot_angle_distributions(df_processed)


In [120]:
# Three scatter panels: vol against alpha, beta and gamma.
plot_volume_vs_angles(df_processed)

In [121]:
# Box plot of cell_anisotropy per sg_encoded group with at least 10 rows (default min_samples).
# NOTE(review): cell_anisotropy is standardized in df_processed, so
# non-positive values cannot appear on the function's log y-axis.
plot_symmetry_vs_anisotropy(df_processed)

In [122]:
# Box plots of b/a and c/a for the 15 most frequent sg_encoded groups (default top_n).
plot_axis_ratio_vs_symmetry(df_processed)

In [123]:
# Box plot of vol for the 20 most frequent sg_encoded groups.
plot_volume_distribution_by_symmetry(df_processed, top_n=20)

In [124]:
# Box plot of 1/vol for the 15 most frequent sg_encoded groups (default top_n).
# NOTE(review): vol is standardized in df_processed, so the function's
# `vol > 0` filter keeps only above-average volumes.
plot_crystal_density_vs_volume(df_processed)

In [125]:
# Scatter of (a*b*c)/vol against vol, with a log y-axis.
plot_volume_vs_cell_shape_factor(df_processed)

In [126]:
# Correlation heatmap restricted to alpha, beta and gamma.
plot_angle_correlation_heatmap(df_processed)

In [127]:
# Histogram of cell_anisotropy with a log-scaled count axis.
plot_anisotropy_distribution(df_processed)

In [129]:
# 2-D KDE of vol against cell_anisotropy for rows with 0 < vol <= 10 and anisotropy >= 0.
# NOTE(review): both columns are standardized in df_processed, so these
# thresholds apply to z-scores.
plot_kde_volume_vs_anisotropy(df_processed)

In [131]:
# Violin plots of alpha, beta and gamma split by lattice system, looked up from sg_encoded.
plot_angle_distribution_by_lattice(df_processed)

In [None]:
# df_processed.to_csv("./processed_csv.csv")
# print("Processed data has been saved successfully!")