# AI with clustered data

In [None]:
def material_clustered_split_y6(MF, n_clusters, name, y6_labels, cluster_label="Predicted Cluster"):
    """Cluster samples in 2-D PCA space and plot each acceptor-label group separately.

    The features are min-max scaled, projected onto two principal components
    and clustered with KMeans. For every distinct value in ``y6_labels`` a
    scatter plot of that subset (coloured by cluster, axes auto-scaled per
    subset) and a standalone legend image are written to ``./cluster/``.

    Args:
        MF: 2-D feature matrix, one row per sample.
        n_clusters: Number of KMeans clusters.
        name: Prefix of the output file names.
        y6_labels: One label per row of ``MF``; stored as ``"Acceptor_Label"``.
        cluster_label: Column name used for the predicted cluster ids.

    Returns:
        DataFrame with columns ``"component 1"``, ``"component 2"``,
        ``cluster_label`` and ``"Acceptor_Label"``.
    """
    import os

    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    from matplotlib.legend import Legend
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.metrics import silhouette_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler

    # PCA + KMeans pipeline
    preprocessor = Pipeline([
        ("scaler", MinMaxScaler()),
        ("pca", PCA(n_components=2, random_state=22)),
    ])
    clusterer = Pipeline([
        ("kmeans", KMeans(n_clusters=n_clusters, init="k-means++", n_init=50, max_iter=500, random_state=22)),
    ])
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("clusterer", clusterer),
    ])

    pipe.fit(MF)
    preprocessed_data = pipe["preprocessor"].transform(MF)
    predicted_labels = pipe["clusterer"]["kmeans"].labels_
    print(f'silhouette score: {silhouette_score(preprocessed_data, predicted_labels):.3f}')

    # One row per sample: PCA coordinates, cluster id and acceptor label.
    pcadf = pd.DataFrame(preprocessed_data, columns=["component 1", "component 2"])
    pcadf[cluster_label] = predicted_labels
    pcadf["Acceptor_Label"] = y6_labels

    sns.set_theme(style="white")
    font = {'family': 'Arial', 'weight': 'normal', 'size': 8}
    plt.rc('font', **font)

    # savefig does not create missing folders.
    os.makedirs('./cluster', exist_ok=True)

    for label in pcadf["Acceptor_Label"].unique():
        subset = pcadf[pcadf["Acceptor_Label"] == label]
        fig = plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(
            x="component 1", y="component 2", s=40,
            data=subset, hue=cluster_label, palette='tab10'
        )
        ax.grid(True)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        for spine in ax.spines.values():
            spine.set_visible(True)
            spine.set_linewidth(1.5)
        ax.tick_params(axis='both', which='major', length=7, width=1.25, direction='in')

        # Keep the legend content for the separate legend image, then take
        # the legend off the scatter plot itself.
        legend = ax.get_legend()
        # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles``;
        # 3.9 removed the old name.
        handles = getattr(legend, "legend_handles", None)
        if handles is None:
            handles = legend.legendHandles
        texts = [t.get_text() for t in legend.get_texts()]
        legend.remove()

        plt.xlabel("Component 1 (Acceptor-dominated)")
        plt.ylabel("Component 2 (Donor-dominated)")
        # str() so that numeric labels do not break the file name.
        clean_label = str(label).replace(" ", "_")
        plt.savefig(f'./cluster/{name}_{clean_label}.png', bbox_inches='tight', dpi=600, facecolor='white')
        plt.show()
        plt.close(fig)

        # Dedicated figure that only holds the legend.
        fig_leg = plt.figure(figsize=(1, 2))
        plt.rc('font', **font)
        ax_leg = fig_leg.add_subplot(111)
        ax_leg.axis("off")

        new_leg = Legend(ax_leg, handles, texts,
                         loc='center', frameon=False, handlelength=2, ncol=1)
        ax_leg.add_artist(new_leg)

        fig_leg.savefig(f'./cluster/{name}_{clean_label}_legend.png', dpi=600, bbox_inches='tight', facecolor='white')
        plt.close(fig_leg)

    return pcadf


In [None]:
def material_clustered_split_y6_fixed_full(MF, n_clusters, name, y6_labels, cluster_label="Predicted Cluster"):
    """Cluster in 2-D PCA space and plot each acceptor-label group with fixed colours, markers and axes.

    Every cluster id keeps the same colour and marker in all sub-plots, and all
    sub-plots share the axis limits of the full data set, so the images can be
    compared directly. Images are written to ``./cluster/``.

    Args:
        MF: 2-D feature matrix, one row per sample.
        n_clusters: Number of KMeans clusters (at most 9, the size of the
            fixed colour/marker tables).
        name: Prefix of the output file names.
        y6_labels: One label per row of ``MF``; stored as ``"Acceptor_Label"``.
        cluster_label: Column name used for the predicted cluster ids.

    Returns:
        DataFrame with columns ``"component 1"``, ``"component 2"``,
        ``cluster_label`` and ``"Acceptor_Label"``.

    Raises:
        ValueError: If ``n_clusters`` exceeds the number of fixed colours.
    """
    import os

    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    from matplotlib.legend import Legend
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.metrics import silhouette_score
    from sklearn.preprocessing import MinMaxScaler

    # Fixed colour and marker per cluster id (ids 0-8 are supported).
    palette_fixed = {
        0: "#1f77b4", 1: "#ff7f0e", 2: "#2ca02c", 3: "#d62728",
        4: "#9467bd", 5: "#8c564b", 6: "#e377c2", 7: "#7f7f7f", 8: "#bcbd22"
    }
    marker_dict = {
        0: 'o', 1: 'X', 2: 's', 3: 'P', 4: 'D',
        5: 'd', 6: '^', 7: 'H', 8: '*'
    }
    if n_clusters > len(palette_fixed):
        raise ValueError(
            f"n_clusters={n_clusters} exceeds the {len(palette_fixed)} fixed colours/markers"
        )

    # Scale -> 2-component PCA -> KMeans on the PCA coordinates.
    scaler = MinMaxScaler()
    pca = PCA(n_components=2, random_state=22)
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=50, max_iter=500, random_state=22)
    X_pca = pca.fit_transform(scaler.fit_transform(MF))
    labels = kmeans.fit_predict(X_pca)
    sil_score = silhouette_score(X_pca, labels)
    print(f'Silhouette score: {sil_score:.3f}')

    df = pd.DataFrame(X_pca, columns=["component 1", "component 2"])
    df[cluster_label] = labels
    df["Acceptor_Label"] = y6_labels

    # Shared limits (full data range plus a 0.5 margin) for every sub-plot.
    xlim = (df["component 1"].min() - 0.5, df["component 1"].max() + 0.5)
    ylim = (df["component 2"].min() - 0.5, df["component 2"].max() + 0.5)

    sns.set_theme(style="white")
    font = {'family': 'Arial', 'weight': 'normal', 'size': 8}
    plt.rc('font', **font)

    # savefig does not create missing folders.
    os.makedirs("./cluster", exist_ok=True)

    for label in df["Acceptor_Label"].unique():
        subset = df[df["Acceptor_Label"] == label]
        fig = plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(
            x="component 1", y="component 2",
            data=subset, hue=cluster_label, style=cluster_label, palette=palette_fixed, markers=marker_dict, s=40
        )
        ax.grid(True)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        for spine in ax.spines.values():
            spine.set_visible(True)
            spine.set_linewidth(1.5)
        ax.tick_params(axis='both', which='major', length=7, width=1.25, direction='in')

        legend = ax.get_legend()
        # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles``;
        # 3.9 removed the old name.
        handles = getattr(legend, "legend_handles", None)
        if handles is None:
            handles = legend.legendHandles
        texts = [t.get_text() for t in legend.get_texts()]
        legend.remove()

        plt.xlabel("Component 1 (Acceptor-dominated)")
        plt.ylabel("Component 2 (Donor-dominated)")
        # str() so that numeric labels do not break the file name.
        label_tag = str(label).replace(" ", "_")
        plt.savefig(f"./cluster/{name}_{label_tag}.png", bbox_inches='tight', dpi=600, facecolor='white')
        plt.show()
        plt.close(fig)

        # Dedicated figure that only holds the legend.
        fig_leg = plt.figure(figsize=(1.2, 2))
        ax_leg = fig_leg.add_subplot(111)
        ax_leg.axis("off")
        new_leg = Legend(ax_leg, handles, texts,
                         loc='center', frameon=False, handlelength=2, ncol=1)
        ax_leg.add_artist(new_leg)
        fig_leg.savefig(f"./cluster/{name}_{label_tag}_legend.png", dpi=600, bbox_inches='tight', facecolor='white')
        plt.close(fig_leg)

    return df

In [None]:
import numpy as np
import pandas as pd

In [None]:
def material_clustered_split_y6_fixed_full(MF, n_clusters, name, y6_labels, cluster_label="Predicted Cluster"):
    """Cluster in 2-D PCA space and plot each acceptor-label group with fixed colours and axes.

    Every cluster id keeps the same colour in all sub-plots, and all sub-plots
    share the axis limits of the full data set. Images are written to
    ``./cluster2/``. This definition replaces any earlier function of the same
    name once its cell has run.

    Args:
        MF: 2-D feature matrix, one row per sample.
        n_clusters: Number of KMeans clusters (at most 9, the size of the
            fixed colour table).
        name: Prefix of the output file names.
        y6_labels: One label per row of ``MF``; stored as ``"Acceptor_Label"``.
        cluster_label: Column name used for the predicted cluster ids.

    Returns:
        DataFrame with columns ``"component 1"``, ``"component 2"``,
        ``cluster_label`` and ``"Acceptor_Label"``.

    Raises:
        ValueError: If ``n_clusters`` exceeds the number of fixed colours.
    """
    import os

    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    from matplotlib.legend import Legend
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.metrics import silhouette_score
    from sklearn.preprocessing import MinMaxScaler

    # Fixed colour per cluster id (ids 0-8 are supported).
    palette_fixed = {
        0: "#1f77b4", 1: "#ff7f0e", 2: "#2ca02c", 3: "#d62728",
        4: "#9467bd", 5: "#8c564b", 6: "#e377c2", 7: "#7f7f7f", 8: "#bcbd22"
    }
    if n_clusters > len(palette_fixed):
        raise ValueError(
            f"n_clusters={n_clusters} exceeds the {len(palette_fixed)} fixed colours"
        )

    # Scale -> 2-component PCA -> KMeans on the PCA coordinates.
    scaler = MinMaxScaler()
    pca = PCA(n_components=2, random_state=22)
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=50, max_iter=500, random_state=22)
    X_pca = pca.fit_transform(scaler.fit_transform(MF))
    labels = kmeans.fit_predict(X_pca)
    sil_score = silhouette_score(X_pca, labels)
    print(f'Silhouette score: {sil_score:.3f}')

    df = pd.DataFrame(X_pca, columns=["component 1", "component 2"])
    df[cluster_label] = labels
    df["Acceptor_Label"] = y6_labels

    # Shared limits (full data range plus a 0.5 margin) for every sub-plot.
    xlim = (df["component 1"].min() - 0.5, df["component 1"].max() + 0.5)
    ylim = (df["component 2"].min() - 0.5, df["component 2"].max() + 0.5)

    sns.set_theme(style="white")
    font = {'family': 'Arial', 'weight': 'normal', 'size': 8}
    plt.rc('font', **font)

    # savefig does not create missing folders.
    os.makedirs("./cluster2", exist_ok=True)

    for label in df["Acceptor_Label"].unique():
        subset = df[df["Acceptor_Label"] == label]
        fig = plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(
            x="component 1", y="component 2",
            data=subset, hue=cluster_label, palette=palette_fixed, s=40
        )
        ax.grid(True)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        for spine in ax.spines.values():
            spine.set_visible(True)
            spine.set_linewidth(1.5)
        ax.tick_params(axis='both', which='major', length=7, width=1.25, direction='in')

        legend = ax.get_legend()
        # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles``;
        # 3.9 removed the old name.
        handles = getattr(legend, "legend_handles", None)
        if handles is None:
            handles = legend.legendHandles
        texts = [t.get_text() for t in legend.get_texts()]
        legend.remove()

        plt.xlabel("Component 1 (Acceptor-dominated)")
        plt.ylabel("Component 2 (Donor-dominated)")
        # str() so that numeric labels do not break the file name.
        label_tag = str(label).replace(" ", "_")
        plt.savefig(f"./cluster2/{name}_{label_tag}.png", bbox_inches='tight', dpi=600, facecolor='white')
        plt.show()
        plt.close(fig)

        # Dedicated figure that only holds the legend.
        fig_leg = plt.figure(figsize=(1.2, 2))
        ax_leg = fig_leg.add_subplot(111)
        ax_leg.axis("off")
        new_leg = Legend(ax_leg, handles, texts,
                         loc='center', frameon=False, handlelength=2, ncol=1)
        ax_leg.add_artist(new_leg)
        fig_leg.savefig(f"./cluster2/{name}_{label_tag}_legend.png", dpi=600, bbox_inches='tight', facecolor='white')
        plt.close(fig_leg)

    return df

In [None]:
# Load the saved feature matrix and the per-row acceptor labels.
MF = np.load("MF.npy")
df = pd.read_excel("y6_cat_or_database.xlsx")
y6_labels = df["Acceptor_Label"].tolist()

# Cluster the first 8192 feature columns into 9 groups
# (presumably the donor fingerprint bits -- confirm against how MF.npy was built).
material_clustered_split_y6_fixed_full(
    MF=MF[:, :8192],
    n_clusters=9,
    name="Donor9_Y6fixed",
    y6_labels=y6_labels
)

In [None]:
# Cluster the feature columns from 8192 onwards into 6 groups
# (presumably the acceptor fingerprint bits -- confirm against how MF.npy was built).
material_clustered_split_y6_fixed_full(
    MF=MF[:, 8192:],
    n_clusters=6,
    name="Acceptor6_Y6fixed",
    y6_labels=y6_labels
)

In [None]:
# Cluster on all feature columns of MF into 7 groups.
material_clustered_split_y6_fixed_full(
    MF=MF,
    n_clusters=7,
    name="Device7_Y6fixed",
    y6_labels=y6_labels
)

In [None]:
# Reload the feature matrix and read the acceptor labels from the second workbook.
MF = np.load("MF.npy")
df_y6 = pd.read_excel("y6_cat_database.xlsx")
y6_labels = df_y6["Acceptor_Label"].tolist()

# Per-label plots for the first 8192 feature columns, 9 clusters.
material_clustered_split_y6(MF[:, :8192], n_clusters=9, name="Donor9_Y6split", y6_labels=y6_labels)

In [None]:
# Per-label plots for the feature columns from 8192 onwards, 6 clusters.
material_clustered_split_y6(MF[:, 8192:], n_clusters=6, name="Acceptor6_Y6split", y6_labels=y6_labels)

In [None]:
# Per-label plots for all feature columns, 7 clusters.
material_clustered_split_y6(MF, n_clusters=7, name="Device7_Y6split", y6_labels=y6_labels)

In [None]:
# Data, Plot and Statistics
import os
import shutil
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator, MaxNLocator
import seaborn as sns
import statsmodels.api as sm
import six
from sklearn.model_selection import train_test_split
# Descriptor Transformation
from rdkit import Chem
from rdkit.Chem import AllChem
import cv2
import codecs
# Machine Learning
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Load the source spreadsheet into a DataFrame.
read_PATH = './database/opv_ai_database_used.xlsx'
file = pd.read_excel(read_PATH)
file.head()  # first 5 rows of the data

In [None]:
def draw_save_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=('#f1f1f2', 'w'), edge_color='k',
                     bbox=(0, 0, 1, 1), header_columns=0,
                     ax=None, **kwargs):
    """Render a DataFrame as a styled Matplotlib table and return its axes.

    Despite the name, nothing is written to disk here; save the figure of the
    returned axes if a file is needed.

    Args:
        data: DataFrame whose values and column names fill the table.
        col_width: Figure width per column, used when a new figure is created.
        row_height: Figure height per row (header included), same condition.
        font_size: Font size of every cell.
        header_color: Face colour of header cells.
        row_colors: Face colours cycled over the body rows.
        edge_color: Edge colour of every cell.
        bbox: Table bounding box ``(xmin, ymin, width, height)`` in axes units.
        header_columns: Number of leading columns styled like the header row.
        ax: Axes to draw into; a new figure and axes are created when ``None``.
        **kwargs: Forwarded to ``Axes.table``.

    Returns:
        The axes containing the table.
    """
    if ax is None:
        # One extra row for the header: (n_cols, n_rows + 1) * (col_width, row_height).
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')
    mpl_table = ax.table(cellText=data.values, bbox=list(bbox), colLabels=data.columns, **kwargs)
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)
    # get_celld() is the public accessor for the {(row, col): cell} mapping.
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax

In [None]:
### size of the raw table
raw_row, raw_col = file.shape
print(f'Raw data, rows:{raw_row}, columns:{raw_col}')
# The drops of the "silence" and "ratio A/D" columns are currently disabled,
# so this line reports the same shape as the one above.
print(f'Data without unused columns, rows:{file.shape[0]}, columns:{file.shape[1]}')
### drop the rows with blank data (N/A)
file = pd.DataFrame(file).dropna()
final_row, final_col = file.shape
print(f'Final data (real input), rows:{final_row}, columns:{final_col}')
### table layout: raw vs. final shape
dictionary = {
    'Data': ['Raw', 'Final'],
    'Rows': [raw_row, final_row],
    'Columns': [raw_col, final_col],
}
df_stat = pd.DataFrame(dictionary)
draw_save_table(df_stat, header_columns=0, col_width=2.0)

In [None]:
file.info()  # check that the data in each column has the right dtype

In [None]:
# Number of rows in the cleaned table; every column below is flattened to this length.
i = file.shape[0]


def _column(col_name):
    """Return one column of ``file`` as a flat NumPy array of length ``i``."""
    return np.array(file[col_name]).reshape(i,)


index = _column('Index')

### Donor material
donor_name = _column('Donor')
donor_smiles = _column('Donor SMILES')
donor_homo = _column('HOMO of Donor (eV)')
donor_lumo = _column('LUMO of Donor (eV)')
donor_bandgap = _column('Bandgap of Donor (eV)')

### Acceptor material
acceptor_name = _column('Acceptor')
acceptor_smiles = _column('Acceptor SMILES')
acceptor_homo = _column('HOMO of Acceptor (eV)')
acceptor_lumo = _column('LUMO of Acceptor (eV)')
acceptor_bandgap = _column('Bandgap of Acceptor (eV)')

# Donor-minus-acceptor level offsets.
homo_offset = donor_homo - acceptor_homo
lumo_offset = donor_lumo - acceptor_lumo

### Device performance
pce = _column('PCE (%)')
voc = _column('Voc (V)')
jsc = _column('Jsc (mAcm-2)')
ff = _column('FF')

In [None]:
# Morgan fingerprint settings read by MF_transfer.
radius = 5
number_of_bits = 8192

In [None]:
def MF_transfer(smiles):
    """Convert SMILES strings into a 2-D array of Morgan fingerprint bit vectors.

    Uses the module-level ``radius`` and ``number_of_bits`` settings. A SMILES
    that cannot be parsed or fingerprinted is reported (using the module-level
    ``index`` array) and skipped, so the result can have fewer rows than
    ``smiles`` -- callers that stack the result against other per-row arrays
    should check the row count.

    Args:
        smiles: Iterable of SMILES strings.

    Returns:
        NumPy array with one fingerprint row per successfully converted SMILES.
    """
    MF = []
    for i, each_smiles in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(each_smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError('SMILES could not be parsed')
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=number_of_bits)
        except (TypeError, ValueError, RuntimeError):
            print(f'error smiles index: {index[i]}')
            continue
        MF.append(np.array(fp))
    MF = np.array(MF)
    return MF

In [None]:
# Fingerprint donors and acceptors, then join them column-wise:
# donor bits first, acceptor bits second.
donor_MF = MF_transfer(donor_smiles)
acceptor_MF = MF_transfer(acceptor_smiles)
MF = np.hstack((donor_MF,acceptor_MF))
print(MF.shape)

In [None]:
# Load previously saved fingerprint arrays; this replaces any arrays
# of the same names already in memory.
donor_MF = np.load('./donor_MF.npy')
acceptor_MF = np.load('./acceptor_MF.npy')
MF = np.load('./MF.npy')

In [None]:
from kneed import KneeLocator

In [None]:
def cluster_test_sil(MF, test_range, name):
    """Plot the KMeans SSE curve over ``test_range`` and print the elbow.

    Args:
        MF: Feature matrix to cluster.
        test_range: Candidate cluster counts.
        name: Prefix of the saved image in ``./cluster/``.
    """
    # Inertia (sum of squared distances) for each candidate cluster count.
    sse = [
        KMeans(n_clusters=n_groups, random_state=0, n_init=10).fit(MF).inertia_
        for n_groups in test_range
    ]

    plt.style.use('fivethirtyeight')
    plt.plot(test_range, sse)
    plt.xticks(test_range)
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.savefig(f'./cluster/{name}_cluster_number_test.jpg', bbox_inches='tight')
    plt.show()

    knee = KneeLocator(test_range, sse, curve='convex', direction='decreasing')
    print(f'suggested cluster number:{knee.elbow}')

In [None]:
def cluster_test(x, test_range, name):
    """Plot SSE and silhouette curves over ``test_range`` and print the elbows.

    Args:
        x: Feature matrix to cluster.
        test_range: Candidate cluster counts.
        name: Prefix of the two images saved in ``./cluster/``.
    """
    sse, silhouette_avg = [], []
    for n_groups in test_range:
        fitted = KMeans(n_clusters=n_groups, random_state=0, n_init=10).fit(x)
        sse.append(fitted.inertia_)
        silhouette_avg.append(silhouette_score(x, fitted.labels_))

    # SSE curve
    plt.style.use('fivethirtyeight')
    plt.plot(test_range, sse)
    plt.xticks(test_range)
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.savefig(f'./cluster/{name}_cluster_number_test.jpg', bbox_inches='tight')
    plt.show()

    # Silhouette curve
    plt.style.use('fivethirtyeight')
    plt.plot(test_range, silhouette_avg)
    plt.xticks(test_range)
    plt.xlabel('Number of Clusters')
    plt.ylabel('silhouette score')
    plt.savefig(f'./cluster/{name}_cluster_number_test_sil.jpg', bbox_inches='tight')
    plt.show()

    # Elbow of the SSE curve, and of the silhouette curve read both as a
    # rising-concave and as a falling-convex curve.
    knee_sse = KneeLocator(test_range, sse, curve='convex', direction='decreasing')
    knee_sil_up = KneeLocator(test_range, silhouette_avg, curve='concave', direction='increasing')
    knee_sil_down = KneeLocator(test_range, silhouette_avg, curve='convex', direction='decreasing')
    print(f'suggested cluster number (sse):{knee_sse.elbow}')
    print(f'suggested cluster number (sil_increasing):{knee_sil_up.elbow}')
    print(f'suggested cluster number (sil_decreasing):{knee_sil_down.elbow}')

In [None]:
# Scan 2-30 clusters on the combined fingerprint matrix.
cluster_test(MF, range(2,31), 'Device')

In [None]:
# Scan 2-30 clusters on the donor fingerprints.
cluster_test(donor_MF, range(2,31), 'Donor')

In [None]:
# Scan 2-30 clusters on the acceptor fingerprints.
cluster_test(acceptor_MF, range(2,31), 'Acceptor')

In [None]:
from matplotlib.legend import Legend

In [None]:
def material_clustered(MF, n_clusters, name):
    """Cluster samples in 2-D PCA space and save the scatter plot plus a separate legend image.

    The features are min-max scaled, projected onto two principal components
    and clustered with KMeans. The plot is written to ``./cluster/{name}.png``
    and its legend to ``./cluster/{name}_legend.png``.

    Args:
        MF: 2-D feature matrix, one row per sample.
        n_clusters: Number of KMeans clusters.
        name: Base name of the output files.

    Returns:
        DataFrame with columns ``"component 1"``, ``"component 2"`` and
        ``"Predicted Cluster"``.
    """
    import os

    preprocessor = Pipeline([("scaler", MinMaxScaler()),("pca", PCA(n_components=2, random_state=22))])
    clusterer = Pipeline([("kmeans",KMeans(n_clusters=n_clusters,init="k-means++",n_init=50,max_iter=500,random_state=22))])
    pipe = Pipeline([("preprocessor", preprocessor),("clusterer", clusterer)])

    pipe.fit(MF)
    # Transform once and reuse the result for the score and the output table.
    preprocessed_data = pipe["preprocessor"].transform(MF)
    predicted_labels = pipe["clusterer"]["kmeans"].labels_
    print(f'silhouette score:{silhouette_score(preprocessed_data, predicted_labels)}')

    pcadf = pd.DataFrame(preprocessed_data, columns=["component 1", "component 2"])
    pcadf["Predicted Cluster"] = predicted_labels

    sns.set_theme(style="white")
    fig = plt.figure(figsize=(6, 6))
    font = {'family': 'Arial', 'weight': 'normal', 'size': 8}
    plt.rc('font', **font)
    ax = sns.scatterplot(x="component 1",y="component 2",s=40,data=pcadf,hue="Predicted Cluster", style="Predicted Cluster", palette='tab10')

    ax.grid(True)  # draw grid lines
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # Keep the legend content for the separate legend image, then take the
    # legend off the scatter plot itself.
    legend = ax.get_legend()
    # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles``;
    # 3.9 removed the old name.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    texts = [t.get_text() for t in legend.get_texts()]
    legend.remove()

    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(1.5)
    ax.tick_params(axis='both', which='major', length=7, width=1.25, direction='in')

    plt.xlabel("Component 1 (Acceptor-dominated)")
    plt.ylabel("Component 2 (Donor-dominated)")

    # savefig does not create missing folders.
    os.makedirs('./cluster', exist_ok=True)
    plt.savefig(f'./cluster/{name}.png', bbox_inches='tight', dpi=600, facecolor='white')
    plt.show()
    plt.close(fig)

    # New figure that only holds the legend (size can be adjusted).
    fig_leg = plt.figure(figsize=(1, 2))
    plt.rc('font', **font)
    ax_leg = fig_leg.add_subplot(111)
    ax_leg.axis("off")

    # Copy the original legend entries into the new figure.
    new_leg = Legend(ax_leg, handles, texts,
                     loc='center', frameon=False, handlelength=2, ncol=1)
    ax_leg.add_artist(new_leg)

    # Save the legend image.
    fig_leg.savefig(f'./cluster/{name}_legend.png', dpi=600, bbox_inches='tight', facecolor='white')
    plt.close(fig_leg)
    return pcadf

In [None]:
# Cluster the combined fingerprints into 7 groups.
device_cluster = material_clustered(MF, 7, 'Device7')

In [None]:
# Show the PCA coordinates and cluster id of every sample.
device_cluster

In [None]:
# Number of samples in each device cluster.
display(device_cluster.value_counts('Predicted Cluster'))

In [None]:
# Cluster the donor fingerprints into 9 groups.
donor_cluster = material_clustered(donor_MF, 9, 'Donor9')

In [None]:
# Number of samples in each donor cluster.
display(donor_cluster.value_counts('Predicted Cluster'))

In [None]:
# Cluster the acceptor fingerprints into 6 groups.
acceptor_cluster = material_clustered(acceptor_MF, 6, 'Acceptor6')

In [None]:
# Number of samples in each acceptor cluster.
display(acceptor_cluster.value_counts('Predicted Cluster'))

In [None]:
# material_clustered stores the cluster ids under "Predicted Cluster".
device_c = device_cluster['Predicted Cluster']
donor_c = donor_cluster['Predicted Cluster']
acceptor_c = acceptor_cluster['Predicted Cluster']

# Assign as plain lists so the values are matched by position rather than by
# index label (the row index of `file` may have gaps after dropna).
file['Device cluster'] = device_c.tolist()
file['Donor cluster'] = donor_c.tolist()
file['Acceptor cluster'] = acceptor_c.tolist()

In [None]:
# Write the table in its current state to a new workbook.
file.to_excel('./clustered_opv_ai_database_used.xlsx')

In [None]:
# Acceptor LUMO minus donor HOMO, one value per row.
E_Voc = acceptor_lumo - donor_homo

In [None]:
# Table of material properties, device performance and cluster ids.
# The Jsc label is a raw string: "\ " is not a valid escape sequence, so a
# plain literal triggers a warning; the resulting text is unchanged.
# Note: the donor bandgap key is spelled 'Bandgape', the acceptor key 'Bandgap'.
df = pd.DataFrame({'HOMO of Donor (eV)':donor_homo, 'LUMO of Donor (eV)':donor_lumo,
                   'Bandgape of Donor (eV)':donor_bandgap,
                   'HOMO of Acceptor (eV)':acceptor_homo, 'LUMO of Acceptor (eV)':acceptor_lumo,
                   'Bandgap of Acceptor (eV)':acceptor_bandgap,
                   'HOMO offset (eV)':homo_offset, 'LUMO offset (eV)':lumo_offset,'E_Voc (eV)':E_Voc,
                   'PCE (%)':pce, 'Voc (V)':voc, r'$Jsc\ (mAcm^{-2})$':jsc, 'FF':ff,
                  'Device cluster':device_c, 'Donor cluster':donor_c, 'Acceptor cluster':acceptor_c})
# Numeric columns used for the pairwise gradient plots (cluster ids excluded).
df_col_list = ['HOMO of Donor (eV)', 'LUMO of Donor (eV)', 'Bandgape of Donor (eV)', 'HOMO of Acceptor (eV)', 'LUMO of Acceptor (eV)',
                   'Bandgap of Acceptor (eV)', 'HOMO offset (eV)', 'LUMO offset (eV)','E_Voc (eV)',
                   'PCE (%)', 'Voc (V)', r'$Jsc\ (mAcm^{-2})$', 'FF']

In [None]:
# Show a colour-coded scatter plot for every ordered triple of distinct
# columns: x and y on the axes, z as the point colour.
for x_col in df_col_list:
    for y_col in df_col_list:
        for z_col in df_col_list:
            # Skip triples in which any column repeats.
            if len({x_col, y_col, z_col}) < 3:
                continue
            x, y, z = df[x_col], df[y_col], df[z_col]
            f, ax = plt.subplots()
            points = ax.scatter(x, y, c=z, s=50, cmap='plasma')
            f.colorbar(points)
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(z_col)
            plt.show()

In [None]:
# Switch Matplotlib back to its default style.
plt.style.use("default")

In [None]:
from scipy.interpolate import griddata
import matplotlib.colors as colors

In [None]:
# Start from empty output folders for the gradient plots. makedirs also
# creates the parent './statistics' folder, which plain mkdir would not.
for direction in ('./statistics/gradient',
                  './statistics/gradient/scatter',
                  './statistics/gradient/contour'):
    if os.path.isdir(direction):
        shutil.rmtree(direction)
    os.makedirs(direction)

In [None]:
def grad_scatter(x,y,z,name):
    """Save a scatter plot of ``y`` against ``x`` with points coloured by ``z``.

    Axis labels and title come from the ``name`` attribute of the inputs
    (the column name for a pandas Series); inputs without one get an empty
    label. The image is written to
    ``./statistics/gradient/scatter/scatter_{name}.jpg`` and the figure is
    closed afterwards so repeated calls do not accumulate open figures.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        z: Values mapped to the point colour.
        name: Tag used in the output file name.
    """
    f, ax = plt.subplots()
    points = ax.scatter(x, y, c=z, s=50, cmap='plasma')
    f.colorbar(points)
    ax.set_xlabel(str(getattr(x, 'name', None) or ''))
    ax.set_ylabel(str(getattr(y, 'name', None) or ''))
    ax.set_title(str(getattr(z, 'name', None) or ''))
    f.savefig(f'./statistics/gradient/scatter/scatter_{name}.jpg', bbox_inches='tight')
    plt.close(f)

In [None]:
def grad_contour(x,y,z, name):
    """Save a filled contour plot of ``z`` interpolated over the ``(x, y)`` plane.

    The scattered points are interpolated (cubic) onto a 35 x 35 grid spanning
    the data range. Axis labels and title come from the ``name`` attribute of
    the inputs (the column name for a pandas Series). The image is written to
    ``./statistics/gradient/contour/contour_{name}.jpg`` and the figure is
    closed afterwards so repeated calls do not accumulate open figures.

    Args:
        x: Horizontal coordinates of the samples.
        y: Vertical coordinates of the samples.
        z: Values to interpolate and contour.
        name: Tag used in the output file name.
    """
    xi = np.linspace(min(x), max(x), 35)
    yi = np.linspace(min(y), max(y), 35)
    X, Y = np.meshgrid(xi, yi)

    Z = griddata((x, y), z, (X, Y), method='cubic')

    # 'bwr' colormap with its near-white entries replaced by light grey.
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in 3.9.
    bwr_grey = plt.get_cmap('bwr')
    newcolors = bwr_grey(np.linspace(0, 1, 256))
    newcolors[(newcolors[:,0] > 0.9) & (newcolors[:,1] > 0.9) & (newcolors[:,2] > 0.9)] = (0.9, 0.9, 0.9, 1.0)
    newcmp = colors.ListedColormap(newcolors)

    fig, ax = plt.subplots(figsize=(7,6))
    cs = ax.contour(X, Y, Z)
    cf = ax.contourf(X, Y, Z, cmap=newcmp)
    ax.clabel(cs, cs.levels, inline=True, fmt = '%1.1f', fontsize=10, colors='r')
    fig.colorbar(cf, ax=ax, ticks=MaxNLocator(5))
    ax.set_xlabel(str(getattr(x, 'name', None) or ''))
    ax.set_ylabel(str(getattr(y, 'name', None) or ''))
    ax.set_title(str(getattr(z, 'name', None) or ''))
    fig.savefig(f'./statistics/gradient/contour/contour_{name}.jpg', bbox_inches='tight')
    plt.close(fig)

In [None]:
# Save one gradient scatter plot per ordered triple of distinct columns;
# `i` numbers the output files starting at 1.
i = 0
for x_col in df_col_list:
    for y_col in df_col_list:
        for z_col in df_col_list:
            # Skip triples in which any column repeats.
            if len({x_col, y_col, z_col}) < 3:
                continue
            i += 1
            x, y, z = df[x_col], df[y_col], df[z_col]
            grad_scatter(x, y, z, name=i)

In [None]:
# Save one contour plot per ordered triple of distinct columns;
# `i` numbers the output files starting at 1.
i = 0
for x_col in df_col_list:
    for y_col in df_col_list:
        for z_col in df_col_list:
            # Skip triples in which any column repeats.
            if len({x_col, y_col, z_col}) < 3:
                continue
            i += 1
            x, y, z = df[x_col], df[y_col], df[z_col]
            grad_contour(x, y, z, name=i)

In [None]:
# Single example: PCE interpolated over donor HOMO (x) and donor LUMO (y).
x = df["HOMO of Donor (eV)"]
y = df["LUMO of Donor (eV)"]
z = df["PCE (%)"]


# 35 x 35 grid spanning the data range.
xi = np.linspace(min(x), max(x), 35)
yi = np.linspace(min(y), max(y), 35)
X, Y = np.meshgrid(xi, yi)

Z = griddata((x, y), z, (X, Y), method='cubic')

# 'bwr' colormap with its middle entry replaced by light grey.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in 3.9.
bwr_grey = plt.get_cmap('bwr')
newcolors = bwr_grey(np.linspace(0, 1, 256))
newcolors[128] = (0.9, 0.9, 0.9, 1.0)
newcmp = colors.ListedColormap(newcolors)

fig, ax = plt.subplots(figsize=(7,6))
cs = ax.contour(X, Y, Z)
cf = ax.contourf(X, Y, Z, cmap=newcmp)
ax.clabel(cs, cs.levels, inline=True, fmt = '%1.1f', fontsize=10, colors='r')
fig.colorbar(cf, ax=ax, ticks=MaxNLocator(5))
plt.show()

In [None]:
# Donor HOMO vs donor LUMO, coloured by PCE.
scat = sns.scatterplot(x="HOMO of Donor (eV)",y="LUMO of Donor (eV)",s=60,data=df,hue="PCE (%)")

In [None]:
# Donor bandgap vs acceptor bandgap, coloured by PCE. In the `df` built above
# the acceptor column is spelled 'Bandgap of Acceptor (eV)'; only the donor
# key carries the 'Bandgape' spelling.
scat = sns.scatterplot(x="Bandgape of Donor (eV)",y="Bandgap of Acceptor (eV)",s=60,data=df,hue="PCE (%)")

In [None]:
# E_Voc vs Jsc, coloured by Voc. Raw string: "\ " is not a valid escape
# sequence in a plain literal; the column key itself is unchanged.
scat = sns.scatterplot(x="E_Voc (eV)",y=r'$Jsc\ (mAcm^{-2})$',s=60,data=df,hue="Voc (V)")

In [None]:
# E_Voc vs Jsc, coloured by PCE. Raw string: "\ " is not a valid escape
# sequence in a plain literal; the column key itself is unchanged.
scat = sns.scatterplot(x="E_Voc (eV)",y=r'$Jsc\ (mAcm^{-2})$',s=60,data=df,hue="PCE (%)")

In [None]:
# Voc vs Jsc, coloured by PCE. Raw string: "\ " is not a valid escape
# sequence in a plain literal; the column key itself is unchanged.
scat = sns.scatterplot(x="Voc (V)",y=r'$Jsc\ (mAcm^{-2})$',s=60,data=df,hue="PCE (%)")

In [None]:
# Voc vs FF, coloured by PCE.
scat = sns.scatterplot(x="Voc (V)",y='FF',s=60,data=df,hue="PCE (%)")

In [None]:
# Donor HOMO vs acceptor LUMO, coloured by PCE.
scat = sns.scatterplot(x="HOMO of Donor (eV)",y="LUMO of Acceptor (eV)",s=60,data=df,hue="PCE (%)")

In [None]:
# Donor HOMO vs acceptor LUMO, coloured by Voc.
scat = sns.scatterplot(x="HOMO of Donor (eV)",y="LUMO of Acceptor (eV)",s=60,data=df,hue="Voc (V)")

In [None]:
# Donor HOMO vs acceptor bandgap, coloured by PCE.
scat = sns.scatterplot(x="HOMO of Donor (eV)",y="Bandgap of Acceptor (eV)",s=60,data=df,hue="PCE (%)")

In [None]:
### properties and performance of every device, tagged with its device cluster
# Raw string for the Jsc label: "\ " is not a valid escape sequence in a
# plain literal; the resulting key is unchanged.
df = pd.DataFrame({'HOMO of Donor (eV)':donor_homo, 'LUMO of Donor (eV)':donor_lumo,
                   'Bandgape of Donor (eV)':donor_bandgap,
                   'HOMO of Acceptor (eV)':acceptor_homo, 'LUMO of Acceptor (eV)':acceptor_lumo,
                   'Bandgape of Acceptor (eV)':acceptor_bandgap,
                   'HOMO offset (eV)':homo_offset, 'LUMO offset (eV)':lumo_offset,
                   'PCE (%)':pce, 'Voc (V)':voc, r'$Jsc\ (mAcm^{-2})$':jsc, 'FF':ff,
                  'Device cluster' : device_c})
### pair plot (scatter plot of each two parameters), coloured by device cluster
# pairplot builds its own figure, so no separate plt.figure() is opened here
# (one would only show up as an extra blank figure).
plt.rc('font', family='Arial', size=8, weight='normal')
fig = sns.pairplot(df, hue='Device cluster', palette='Set2')
fig.savefig('./cluster/device_pairplot.png', bbox_inches='tight')
plt.show()

In [None]:
### properties and performance of every device, tagged with its donor cluster
# Raw string for the Jsc label: "\ " is not a valid escape sequence in a
# plain literal; the resulting key is unchanged.
df = pd.DataFrame({'HOMO of Donor (eV)':donor_homo, 'LUMO of Donor (eV)':donor_lumo,
                   'Bandgape of Donor (eV)':donor_bandgap,
                   'HOMO of Acceptor (eV)':acceptor_homo, 'LUMO of Acceptor (eV)':acceptor_lumo,
                   'Bandgape of Acceptor (eV)':acceptor_bandgap,
                   'HOMO offset (eV)':homo_offset, 'LUMO offset (eV)':lumo_offset,
                   'PCE (%)':pce, 'Voc (V)':voc, r'$Jsc\ (mAcm^{-2})$':jsc, 'FF':ff,
                  'Donor cluster' : donor_c})
### pair plot (scatter plot of each two parameters), coloured by donor cluster
# pairplot builds its own figure, so no separate plt.figure() is opened here
# (one would only show up as an extra blank figure).
plt.rc('font', family='Arial', size=8, weight='normal')
fig = sns.pairplot(df, hue='Donor cluster', palette='Set2')
fig.savefig('./cluster/donor_pairplot.png', bbox_inches='tight')
plt.show()

In [None]:
### properties and performance of every device, tagged with its acceptor cluster
# Raw string for the Jsc label: "\ " is not a valid escape sequence in a
# plain literal; the resulting key is unchanged.
df = pd.DataFrame({'HOMO of Donor (eV)':donor_homo, 'LUMO of Donor (eV)':donor_lumo,
                   'Bandgape of Donor (eV)':donor_bandgap,
                   'HOMO of Acceptor (eV)':acceptor_homo, 'LUMO of Acceptor (eV)':acceptor_lumo,
                   'Bandgape of Acceptor (eV)':acceptor_bandgap,
                   'HOMO offset (eV)':homo_offset, 'LUMO offset (eV)':lumo_offset,
                   'PCE (%)':pce, 'Voc (V)':voc, r'$Jsc\ (mAcm^{-2})$':jsc, 'FF':ff,
                  'Acceptor cluster' : acceptor_c})
### pair plot (scatter plot of each two parameters), coloured by acceptor cluster
# pairplot builds its own figure, so no separate plt.figure() is opened here
# (one would only show up as an extra blank figure).
plt.rc('font', family='Arial', size=8, weight='normal')
fig = sns.pairplot(df, hue='Acceptor cluster', palette='Set2')
fig.savefig('./cluster/acceptor_pairplot.png', bbox_inches='tight')
plt.show()

In [None]:
# Collect the positions of all samples whose donor cluster id is 2.
pick_out_cluster = 2
pick_out_list = [
    position
    for position, cluster_id in enumerate(donor_c)
    if cluster_id == pick_out_cluster
]
print(len(pick_out_list))

In [None]:
# Switch Matplotlib back to its default style.
plt.style.use("default")

In [None]:
### scatter plot of PCE against Jsc for the picked-out samples
jsc_sel = jsc[pick_out_list]
pce_sel = pce[pick_out_list]
plt.figure(figsize=(2.5, 2.5), dpi=300)
plt.rc('font', family='Arial', size=8, weight='normal')
plt.scatter(jsc_sel, pce_sel, 1.5)
### linear regression
# Fit the line on the same subset that is plotted and correlated, so that the
# printed equation, the dashed line and r all describe the same points.
r = np.corrcoef(jsc_sel, pce_sel)[0,1].round(3)
linear_model = np.polyfit(jsc_sel, pce_sel, 1)
print(f'y = {round(linear_model[0], 4)} x + {round(linear_model[1], 4)}')
linear_model_fn = np.poly1d(linear_model)
x_s = np.arange(0,36,0.1)
plt.plot(x_s, linear_model_fn(x_s), "r--", linewidth=1)
### coordinate axis
plt.xlim(0,35)
plt.ylim(0,20)
# Raw string: "\ " is not a valid escape sequence in a plain literal.
plt.xlabel(r'$Jsc\ (mAcm^{-2})$', fontdict={'family':'Arial','size':8,'weight':'normal'})
plt.ylabel('PCE (%)', fontdict={'family':'Arial','size':8,'weight':'normal'})
### locator
ax = plt.gca()
ax.set_aspect('auto', adjustable='box', anchor='C')
ax.yaxis.set_major_locator(MaxNLocator(4))
ax.xaxis.set_minor_locator(AutoMinorLocator(5))
ax.yaxis.set_minor_locator(AutoMinorLocator(5))
for axis in ['top', 'bottom', 'left', 'right']:
    ax.spines[axis].set_linewidth(1.5)
ax.tick_params(axis='both', which='major', width=1.5, labelsize=8, direction='in')
ax.tick_params(axis='both', which='minor', length=3, width=1, direction='in')
print(r)
#plt.savefig('./statistics/jsc_pce.png', bbox_inches='tight')

In [None]:
### scatter plot of Voc against (acceptor LUMO - donor HOMO) for the picked-out samples
# Note: this rebinds E_Voc to the picked-out subset only.
E_Voc = acceptor_lumo[pick_out_list] - donor_homo[pick_out_list]
voc_sel = voc[pick_out_list]
label_font = {'family':'Arial','size':8,'weight':'normal'}

plt.figure(figsize=(2.5, 2.5), dpi=300)
plt.rc('font', family='Arial', size=8, weight='normal')
plt.scatter(E_Voc, voc_sel, 1.5)

### linear regression on the same subset
r = np.corrcoef(E_Voc, voc_sel)[0,1].round(3)
linear_model = np.polyfit(E_Voc, voc_sel, 1)
slope, intercept = linear_model[0], linear_model[1]
print(f'y = {round(slope, 4)} x + {round(intercept, 4)}')
linear_model_fn = np.poly1d(linear_model)
x_s = np.arange(0,4)
plt.plot(x_s, linear_model_fn(x_s), "r--", linewidth=1)

### coordinate axis
plt.xlim(0,3)
plt.ylim(0,1.4)
plt.xlabel('Energy Difference between donor HOMO and acceptor LUMO (eV)', fontdict=label_font)
plt.ylabel('Voc (V)', fontdict=label_font)

### locator
ax = plt.gca()
ax.set_aspect('auto', adjustable='box', anchor='C')
ax.xaxis.set_minor_locator(AutoMinorLocator(5))
ax.yaxis.set_minor_locator(AutoMinorLocator(2))
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_linewidth(1.5)
ax.tick_params(axis='both', which='major', width=1.5, labelsize=8, direction='in')
ax.tick_params(axis='both', which='minor', length=3, width=1, direction='in')
print(r)
#plt.savefig('./statistics/energy_gap_voc.png', bbox_inches='tight')

In [None]:
def _plot_cluster_fit(x_vals, y_vals, x_line, xlim, ylim, xlabel, ylabel,
                      x_minor, y_minor, y_major_bins=None):
    """Scatter one subset, overlay its least-squares line and return ``(r, coefficients)``.

    Prints the fitted equation and the Pearson correlation (rounded to three
    decimals). The line is fitted on exactly the points that are plotted.
    """
    label_font = {'family':'Arial','size':8,'weight':'normal'}
    plt.figure(figsize=(2.5, 2.5), dpi=300)
    plt.rc('font', family='Arial', size=8, weight='normal')
    plt.scatter(x_vals, y_vals, 1.5)
    ### linear regression
    r = np.corrcoef(x_vals, y_vals)[0,1].round(3)
    linear_model = np.polyfit(x_vals, y_vals, 1)
    print(f'y = {round(linear_model[0], 4)} x + {round(linear_model[1], 4)}')
    plt.plot(x_line, np.poly1d(linear_model)(x_line), "r--", linewidth=1)
    ### coordinate axis
    plt.xlim(*xlim)
    plt.ylim(*ylim)
    plt.xlabel(xlabel, fontdict=label_font)
    plt.ylabel(ylabel, fontdict=label_font)
    ### locator
    ax = plt.gca()
    ax.set_aspect('auto', adjustable='box', anchor='C')
    if y_major_bins is not None:
        ax.yaxis.set_major_locator(MaxNLocator(y_major_bins))
    ax.xaxis.set_minor_locator(AutoMinorLocator(x_minor))
    ax.yaxis.set_minor_locator(AutoMinorLocator(y_minor))
    for axis in ['top', 'bottom', 'left', 'right']:
        ax.spines[axis].set_linewidth(1.5)
    ax.tick_params(axis='both', which='major', width=1.5, labelsize=8, direction='in')
    ax.tick_params(axis='both', which='minor', length=3, width=1, direction='in')
    print(r)
    return r, linear_model


# For each of the 6 acceptor clusters: PCE vs Jsc, PCE vs FF and
# Voc vs (acceptor LUMO - donor HOMO), each with its own regression line.
for pick_out_cluster in range(6):
    pick_out_list = [i for i, c in enumerate(acceptor_c) if c == pick_out_cluster]
    print(f'cluster number: {pick_out_cluster}')
    print(len(pick_out_list))
    if len(pick_out_list) < 2:
        # A line fit and a correlation need at least two points.
        print('too few samples for a regression, skipped')
        continue

    jsc_sel = jsc[pick_out_list]
    ff_sel = ff[pick_out_list]
    pce_sel = pce[pick_out_list]
    voc_sel = voc[pick_out_list]

    ### PCE vs Jsc (raw string: "\ " is not a valid escape in a plain literal)
    r, linear_model = _plot_cluster_fit(
        jsc_sel, pce_sel, x_line=np.arange(0,36,0.1), xlim=(0,35), ylim=(0,20),
        xlabel=r'$Jsc\ (mAcm^{-2})$', ylabel='PCE (%)',
        x_minor=5, y_minor=5, y_major_bins=4)
    #plt.savefig('./statistics/jsc_pce.png', bbox_inches='tight')

    ### PCE vs FF
    r, linear_model = _plot_cluster_fit(
        ff_sel, pce_sel, x_line=np.arange(0,1.1,0.1), xlim=(0,1), ylim=(0,20),
        xlabel='FF', ylabel='PCE (%)',
        x_minor=2, y_minor=5, y_major_bins=4)
    #plt.savefig('./statistics/ff_pce.png', bbox_inches='tight')

    ### Voc vs energy difference (note: rebinds E_Voc to this cluster's subset)
    E_Voc = acceptor_lumo[pick_out_list] - donor_homo[pick_out_list]
    r, linear_model = _plot_cluster_fit(
        E_Voc, voc_sel, x_line=np.arange(0,4), xlim=(0,3), ylim=(0,1.4),
        xlabel='Energy Difference between donor HOMO and acceptor LUMO (eV)', ylabel='Voc (V)',
        x_minor=5, y_minor=2)
    #plt.savefig('./statistics/energy_gap_voc.png', bbox_inches='tight')