In [30]:
#################################################### Dataset Analysis ############################
import pandas as pd
import re

# Load the dataset
file_path = 'Final_Formated_and_cleaned_file_With_Features.csv'
df = pd.read_csv(file_path)

# Function to get element count in a composition
def count_elements(composition):
    return len(re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition))

# Get unique phases and their counts
phase_counts = df['Phase'].value_counts()

# Get the percentage of each phase
total_instances = len(df)
phase_percentage = (phase_counts / total_instances) * 100

# Get the phase with the maximum instances
max_phase = phase_counts.idxmax()
max_phase_count = phase_counts.max()

# Get the total number of features (excluding 'composition' and 'Phase' columns)
total_features = df.shape[1] - 2

# Get the number of compositions containing N elements
composition_counts = df['composition'].apply(count_elements).value_counts().sort_index()

phase_counts, phase_percentage, total_instances, max_phase, max_phase_count, total_features, composition_counts


# Write the statistics to a text file
statistics_text = """
### Dataset Statistics

#### General Information
- **Total Instances**: 1184
- **Total Features**: 14

#### Phase Statistics
- **Unique Phases**: 8
  - **BCC**: 386 instances (32.60%)
  - **FCC**: 352 instances (29.73%)
  - **BCC+Sec**: 123 instances (10.39%)
  - **FCC+Sec**: 99 instances (8.36%)
  - **FCC+BCC**: 71 instances (6.00%)
  - **FCC+BCC+Sec**: 58 instances (4.90%)
  - **HCP**: 54 instances (4.56%)
  - **Sec**: 41 instances (3.46%)

- **Phase with Maximum Instances**: BCC (386 instances)

#### Composition Statistics
- **Compositions Containing N Elements**:
  - **2 Elements**: 425 compositions
  - **3 Elements**: 70 compositions
  - **4 Elements**: 86 compositions
  - **5 Elements**: 315 compositions
  - **6 Elements**: 221 compositions
  - **7 Elements**: 59 compositions
  - **8 Elements**: 4 compositions
  - **9 Elements**: 4 compositions
"""

# Save to a text file
statistics_file_path = 'dataset_statistics.txt'
with open(statistics_file_path, 'w') as f:
    f.write(statistics_text)

statistics_file_path


'dataset_statistics.txt'

In [31]:
# Tensorboard  (optional)
#sto comand prompt
#taskkill /im tensorboard.exe /f
#del /q %TMP%\.tensorboard-info\*
%load_ext tensorboard
#%tensorboard --logdir runs/train
%tensorboard --logdir ./logs
#%tensorboard --logdir {logs_base_dir}  --host localhost

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 4976), started 0:07:52 ago. (Use '!kill 4976' to kill it.)

In [18]:
################################################  Esembled Model All Layers Fine Tune  #############################
import joblib

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
import torch
from torch.utils.data import Dataset
import re

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainerCallback, IntervalStrategy


import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import os

import scipy.special

import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    return ' '.join([f"{element}{fraction}" for element, fraction in sorted_matches])

# Load the data
data = pd.read_csv('Final_Formated_and_cleaned_file_No_Features.csv')

# Tokenize and normalize
data['tokenized_elements'] = data['composition'].apply(custom_tokenize)
label_encoder = LabelEncoder()
data['encoded_phase'] = label_encoder.fit_transform(data['Phase'])

feature_columns = [col for col in data.columns if col not in ['composition', 'Phase', 'tokenized_elements', 'encoded_phase']]
for feature in feature_columns:
    scaler = StandardScaler()
    data[f'normalized_{feature}'] = scaler.fit_transform(data[[feature]])

data['combined_features'] = data['tokenized_elements'] + ' ' + data[[f'normalized_{feature}' for feature in feature_columns]].astype(str).agg(' '.join, axis=1)

# Split data
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# # Initialize tokenizer and model

tokenizer = AutoTokenizer.from_pretrained('./results/pretrained_BERT_200k')
model = AutoModelForSequenceClassification.from_pretrained('./results/pretrained_BERT_200k', num_labels=len(label_encoder.classes_))

# Tokenize
train_encodings = tokenizer(data_train['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(data_test['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, data_train['encoded_phase'].values)
test_dataset = CustomDataset(test_encodings, data_test['encoded_phase'].values)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'accuracy': accuracy_score(p.label_ids, preds)}

# optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataset) * 30
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

current_time = datetime.now().strftime('%b%d_%H-%M-%S')
log_dir = './logs/' + current_time

training_args = TrainingArguments(
    output_dir='d:',
    num_train_epochs=9,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir=log_dir,
    logging_steps=1,
    save_steps=1,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    report_to='tensorboard'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)
)

trainer.train()

results = trainer.evaluate()
print(results)

predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

class_report_str = classification_report(predictions.label_ids, pred_labels, target_names=label_encoder.classes_)
print("Classification Report:\n", class_report_str)

with open('./results/classification_report.txt', 'w') as f:
    f.write(class_report_str)

conf_mat = confusion_matrix(predictions.label_ids, pred_labels)
print("Confusion Matrix:", conf_mat)

with open('./results/confusion_matrix.txt', 'w') as f:
    np.savetxt(f, conf_mat, fmt='%d')

plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('./results/confusion_matrix.png', dpi=300)
plt.show()

tokenizer.save_pretrained('./results')
model.save_pretrained('./results')


#################################### Random Forest #######################################
# Load the data
df = pd.read_csv('Final_Formated_and_cleaned_file_With_Features.csv')
# df = pd.read_csv('Fine Tuning Medium and HEA_Rounded_Cleaned_Acoustics.csv')

def get_element_fraction(composition, element):
    if element in composition:
        # Extract the portion of the string after the element's name
        remainder = composition[composition.index(element) + len(element):]
        
        # Extract the coefficient using regex
        import re
        match = re.search(r"(\d+(\.\d+)?)", remainder)
        if match:
            return float(match.group(1))
    return 0.0

# Create columns for each metallic element and fill with its coefficient
METALLIC_ELEMENTS = ["Li", "Be", "B", "C", "N", "O", "Na", "Mg", "Al", "Si", "P", "S",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "Se",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Te",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm",
    "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Tl", "Pb", "Po", "Th", "Pa", "U"]

for element in METALLIC_ELEMENTS:
    df[element] = df['composition'].apply(lambda x: get_element_fraction(x, element))


# Drop the original 'composition' column and 'hardness' column
X = df.drop(columns=['composition', 'Phase'])
y = df['Phase']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=60, random_state=42)
clf.fit(X_train, y_train)

# Save the Random Forest model
joblib.dump(clf, './results/random_forest_model.pkl')

# Predict on the test set
y_pred = clf.predict(X_test)

# BERT Predictions
bert_predictions = trainer.predict(test_dataset)
bert_pred_labels = np.argmax(bert_predictions.predictions, axis=1)
bert_pred_probs = scipy.special.softmax(bert_predictions.predictions, axis=1)  # Converting logits to probabilities

# Transform BERT labels back to original (string) labels
bert_pred_labels_str = label_encoder.inverse_transform(bert_pred_labels)

# Random Forest Predictions
rf_pred_labels = clf.predict(X_test)
rf_pred_probs = clf.predict_proba(X_test)

# Make sure that both y_test and final_predictions are of the same type (either both numbers or both strings)
# Here, converting everything to string type
y_test_str = y_test.astype(str)
rf_pred_labels_str = rf_pred_labels.astype(str)

# Weighted Voting
bert_weight = 0.3
rf_weight = 0.7
final_predictions_weighted = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    avg_prob = b_prob * bert_weight + r_prob * rf_weight
    final_prediction = np.argmax(avg_prob)
    final_predictions_weighted.append(label_encoder.classes_[final_prediction])

# Soft Voting with Probabilities
final_predictions_soft = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    avg_prob = (b_prob + r_prob) / 2
    final_prediction = np.argmax(avg_prob)
    final_predictions_soft.append(label_encoder.classes_[final_prediction])

# Confidence-based Voting
final_predictions_confidence = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    b_confidence = np.max(b_prob)
    r_confidence = np.max(r_prob)
    final_prediction = np.argmax(b_prob) if b_confidence > r_confidence else np.argmax(r_prob)
    final_predictions_confidence.append(label_encoder.classes_[final_prediction])

# Assume final_predictions holds the ensemble predictions you want to evaluate
final_predictions = final_predictions_confidence  # or final_predictions_soft or final_predictions_confidence

# Generate the confusion matrix
final_cm = confusion_matrix(y_test_str, final_predictions)

# Calculate accuracy from the confusion matrix
final_accuracy = np.trace(final_cm) / np.sum(final_cm)

# Print the calculated accuracy
print("Final Model Accuracy:", final_accuracy)

# Debug Step 4: Check Confusion Matrix Labels
sorted_labels = sorted(y_test.unique())
plt.figure(figsize=(10, 7))
sns.heatmap(final_cm, annot=True, fmt="d", cmap=plt.cm.Blues, xticklabels=sorted_labels, yticklabels=sorted_labels)
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.title('Ensemble Model Confusion Matrix')
plt.show()

# Evaluate the different ensemble methods
print("Weighted Voting Accuracy:", accuracy_score(y_test_str, final_predictions_weighted))
print("Soft Voting Accuracy:", accuracy_score(y_test_str, final_predictions_soft))
print("Confidence-based Voting Accuracy:", accuracy_score(y_test_str, final_predictions_confidence))


# Classification Report
print("\nEnsemble Model Classification Report:\n", classification_report(y_test, final_predictions))




HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './results/pretrained_BERT_200k'. Use `repo_type` argument if needed.

In [None]:
######################################### Frozen Layers Training ############################
import joblib

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
import torch
from torch.utils.data import Dataset
import re

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainerCallback, IntervalStrategy


import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import os

import scipy.special

import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    return ' '.join([f"{element}{fraction}" for element, fraction in sorted_matches])

# Load the data
data = pd.read_csv('Final_Formated_and_cleaned_file_With_Features_Rounded_Cleaned_Acoustics.csv')

# Tokenize and normalize
data['tokenized_elements'] = data['composition'].apply(custom_tokenize)
label_encoder = LabelEncoder()
data['encoded_phase'] = label_encoder.fit_transform(data['Phase'])

feature_columns = [col for col in data.columns if col not in ['composition', 'Phase', 'tokenized_elements', 'encoded_phase']]
for feature in feature_columns:
    scaler = StandardScaler()
    data[f'normalized_{feature}'] = scaler.fit_transform(data[[feature]])

data['combined_features'] = data['tokenized_elements'] + ' ' + data[[f'normalized_{feature}' for feature in feature_columns]].astype(str).agg(' '.join, axis=1)

# Split data
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

tokenizer = AutoTokenizer.from_pretrained('./results/pretrained_BERT_200K')
model = AutoModelForSequenceClassification.from_pretrained('./results/pretrained_BERT_200K', num_labels=len(label_encoder.classes_))


# Tokenize
train_encodings = tokenizer(data_train['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(data_test['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, data_train['encoded_phase'].values)
test_dataset = CustomDataset(test_encodings, data_test['encoded_phase'].values)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'accuracy': accuracy_score(p.label_ids, preds)}

# Initialize custom optimizer with weight decay for specific layers
decay_layers = ["1", "5", "9", "12"]
decay_param_names = [n for n, p in model.named_parameters() if any(f".{layer}." in n for layer in decay_layers)]
no_decay_param_names = [n for n, p in model.named_parameters() if n not in decay_param_names]
decay_params = [p for n, p in model.named_parameters() if n in decay_param_names]
no_decay_params = [p for n, p in model.named_parameters() if n in no_decay_param_names]
optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": 0.04},
    {"params": no_decay_params, "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)

# Scheduler
num_training_steps = len(train_dataset) * 30
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

current_time = datetime.now().strftime('%b%d_%H-%M-%S')
log_dir = './logs/' + current_time

training_args = TrainingArguments(
    output_dir='d:',
    num_train_epochs=13,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir=log_dir,
    logging_steps=1,
    save_steps=1000,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    report_to='tensorboard'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)
)

trainer.train()

results = trainer.evaluate()
print(results)

predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

class_report_str = classification_report(predictions.label_ids, pred_labels, target_names=label_encoder.classes_)
print("Classification Report:\n", class_report_str)

with open('./results/classification_report.txt', 'w') as f:
    f.write(class_report_str)

conf_mat = confusion_matrix(predictions.label_ids, pred_labels)
print("Confusion Matrix:", conf_mat)

with open('./results/confusion_matrix.txt', 'w') as f:
    np.savetxt(f, conf_mat, fmt='%d')

plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('./results/confusion_matrix.png', dpi=300)
plt.show()

tokenizer.save_pretrained('d:')
model.save_pretrained('d:')


# Load the data
df = pd.read_csv('Final_Formated_and_cleaned_file_With_Features_Rounded_Cleaned_Acoustics.csv')
# df = pd.read_csv('Fine Tuning Medium and HEA_Rounded_Cleaned.csv')

def get_element_fraction(composition, element):
    if element in composition:
        # Extract the portion of the string after the element's name
        remainder = composition[composition.index(element) + len(element):]
        
        # Extract the coefficient using regex
        import re
        match = re.search(r"(\d+(\.\d+)?)", remainder)
        if match:
            return float(match.group(1))
    return 0.0

# Create columns for each metallic element and fill with its coefficient
METALLIC_ELEMENTS = ["Li", "Be", "B", "C", "N", "O", "Na", "Mg", "Al", "Si", "P", "S",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "Se",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Te",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm",
    "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Tl", "Pb", "Po", "Th", "Pa", "U"]

for element in METALLIC_ELEMENTS:
    df[element] = df['composition'].apply(lambda x: get_element_fraction(x, element))


# Drop the original 'composition' column and 'hardness' column
X = df.drop(columns=['composition', 'Phase'])
y = df['Phase']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the Random Forest model
joblib.dump(clf, './results/random_forest_model.pkl')

# Predict on the test set
y_pred = clf.predict(X_test)

# BERT Predictions
bert_predictions = trainer.predict(test_dataset)
bert_pred_labels = np.argmax(bert_predictions.predictions, axis=1)
bert_pred_probs = scipy.special.softmax(bert_predictions.predictions, axis=1)  # Converting logits to probabilities

# Transform BERT labels back to original (string) labels
bert_pred_labels_str = label_encoder.inverse_transform(bert_pred_labels)

# Random Forest Predictions
rf_pred_labels = clf.predict(X_test)
rf_pred_probs = clf.predict_proba(X_test)

# Make sure that both y_test and final_predictions are of the same type (either both numbers or both strings)
# Here, converting everything to string type
y_test_str = y_test.astype(str)
rf_pred_labels_str = rf_pred_labels.astype(str)

# Weighted Voting
bert_weight = 0.3
rf_weight = 0.7
final_predictions_weighted = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    avg_prob = b_prob * bert_weight + r_prob * rf_weight
    final_prediction = np.argmax(avg_prob)
    final_predictions_weighted.append(label_encoder.classes_[final_prediction])

# Soft Voting with Probabilities
final_predictions_soft = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    avg_prob = (b_prob + r_prob) / 2
    final_prediction = np.argmax(avg_prob)
    final_predictions_soft.append(label_encoder.classes_[final_prediction])

# Confidence-based Voting
final_predictions_confidence = []

for b_prob, r_prob in zip(bert_pred_probs, rf_pred_probs):
    b_confidence = np.max(b_prob)
    r_confidence = np.max(r_prob)
    final_prediction = np.argmax(b_prob) if b_confidence > r_confidence else np.argmax(r_prob)
    final_predictions_confidence.append(label_encoder.classes_[final_prediction])

# Assume final_predictions holds the ensemble predictions you want to evaluate
final_predictions = final_predictions_weighted  # or final_predictions_soft or final_predictions_confidence

# Generate the confusion matrix
final_cm = confusion_matrix(y_test_str, final_predictions)

# Calculate accuracy from the confusion matrix
final_accuracy = np.trace(final_cm) / np.sum(final_cm)

# Print the calculated accuracy
print("Final Model Accuracy:", final_accuracy)


# Debug Step 4: Check Confusion Matrix Labels
sorted_labels = sorted(y_test.unique())
plt.figure(figsize=(10, 7))
sns.heatmap(final_cm, annot=True, fmt="d", cmap=plt.cm.Blues, xticklabels=sorted_labels, yticklabels=sorted_labels)
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.title('Ensemble Model Confusion Matrix')
plt.show()

# Evaluate the different ensemble methods
print("Weighted Voting Accuracy:", accuracy_score(y_test_str, final_predictions_weighted))
print("Soft Voting Accuracy:", accuracy_score(y_test_str, final_predictions_soft))
print("Confidence-based Voting Accuracy:", accuracy_score(y_test_str, final_predictions_confidence))


# Classification Report
print("\nEnsemble Model Classification Report:\n", classification_report(y_test, final_predictions))


In [None]:
####################################  Heat Maps ############################
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import re

# Function to tokenize and sort elements
def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    return ' '.join([''.join(pair) for pair in sorted_matches])

# Load your trained model and tokenizer
model_path = './results'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, output_attentions=True)

# Ensure model is in evaluation mode
model.eval()

# Original composition
composition = "Co1 Cr1 Fe1 Mn1 Ni1 V1"
tokenized_composition = custom_tokenize(composition)

# Tokenize
inputs = tokenizer(tokenized_composition, return_tensors="pt", add_special_tokens=True)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Get Attention Weights
outputs = model(**inputs)
attentions = outputs.attentions  # List of attention weights for each layer

# Aggregate across all heads for simplicity (could be refined)
avg_attention = attentions[-1].squeeze(0).mean(0).detach().numpy()

# Exclude the [CLS] and [SEP] tokens
avg_attention = avg_attention[1:-1, 1:-1]

# Average the attention values for element pairs
element_count = len(re.findall(r'([A-Z][a-z]*[0-9.]+)', tokenized_composition))
avg_attention_relevant = np.zeros((element_count, element_count))

for i in range(0, avg_attention.shape[0], 2):
    for j in range(0, avg_attention.shape[1], 2):
        avg_value = np.mean(avg_attention[i:i+2, j:j+2])
        avg_attention_relevant[i//2, j//2] = avg_value

# Symmetrize the matrix
avg_attention_symmetric = (avg_attention_relevant + avg_attention_relevant.T) / 2

# Visualize
labels = custom_tokenize(composition).split()
sns.heatmap(avg_attention_symmetric, annot=True, xticklabels=labels, yticklabels=labels)
plt.show()

# Visualize
labels = custom_tokenize(composition).split()
sns.heatmap(avg_attention_symmetric, annot=True, xticklabels=labels, yticklabels=labels)

# Save the plot
plt.savefig("heatmap_300dpi.png", dpi=300)  # Saving with 300dpi resolution
plt.show()

In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import re
import joblib
import torch
from torch.utils.data import Dataset

# Load the data
data = pd.read_csv('Final_Formated_and_cleaned_file_No_Features.csv')


In [22]:
data

Unnamed: 0,composition,Phase
0,Al1 Nb1 Ta1 Ti1,BCC
1,Hf1 Mo0.5 Nb1 Ti1 V0.5,BCC
2,Hf1 Mo1 Nb1 Ta1 Ti1 Zr1,BCC
3,Hf1 Mo1 Nb1 Ti1 Zr1,BCC
4,Hf1 Mo1 Ta1 Ti1 Zr1,BCC
...,...,...
1179,Co1 Fe1 Mn1 Ti1 V1 Zr2.3,Sec
1180,Co1 Fe1 Mn1 Ti1 V1 Zr2.6,Sec
1181,Co1 Fe1 Mn1 Ti1 V1 Zr3,Sec
1182,Al1 Cu0.2 Li0.5 Mg1 Zn0.5,Sec


In [23]:
# Function to tokenize and sort elements
def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    return ' '.join([''.join(pair) for pair in sorted_matches])

# Apply the function to the data
data['tokenized_elements'] = data['composition'].apply(custom_tokenize)


In [26]:
data['composition'][0]

'Al1 Nb1 Ta1 Ti1'

In [25]:
data['tokenized_elements'][0]

'Al1 Nb1 Ta1 Ti1'

In [27]:
data[data['composition']!=data['tokenized_elements']]

Unnamed: 0,composition,Phase,tokenized_elements
87,Mn0.253 Cr0.747,BCC,Cr0.747 Mn0.253
88,Mn0.43 Cr0.57,BCC,Cr0.57 Mn0.43
89,Mn0.5 Cr0.5,BCC,Cr0.5 Mn0.5
90,Mn0.6 Cr0.4,BCC,Cr0.4 Mn0.6
91,Mn0.67 Cr0.33,BCC,Cr0.33 Mn0.67
...,...,...,...
1123,Ti0.96 Ta0.04,HCP,Ta0.04 Ti0.96
1124,Ti0.987 Ta0.013,HCP,Ta0.013 Ti0.987
1125,Ti0.97 Nb0.03,HCP,Nb0.03 Ti0.97
1130,Zr0.7 Y0.3,HCP,Y0.3 Zr0.7


In [None]:
data[]

In [24]:
data

Unnamed: 0,composition,Phase,tokenized_elements
0,Al1 Nb1 Ta1 Ti1,BCC,Al1 Nb1 Ta1 Ti1
1,Hf1 Mo0.5 Nb1 Ti1 V0.5,BCC,Hf1 Mo0.5 Nb1 Ti1 V0.5
2,Hf1 Mo1 Nb1 Ta1 Ti1 Zr1,BCC,Hf1 Mo1 Nb1 Ta1 Ti1 Zr1
3,Hf1 Mo1 Nb1 Ti1 Zr1,BCC,Hf1 Mo1 Nb1 Ti1 Zr1
4,Hf1 Mo1 Ta1 Ti1 Zr1,BCC,Hf1 Mo1 Ta1 Ti1 Zr1
...,...,...,...
1179,Co1 Fe1 Mn1 Ti1 V1 Zr2.3,Sec,Co1 Fe1 Mn1 Ti1 V1 Zr2.3
1180,Co1 Fe1 Mn1 Ti1 V1 Zr2.6,Sec,Co1 Fe1 Mn1 Ti1 V1 Zr2.6
1181,Co1 Fe1 Mn1 Ti1 V1 Zr3,Sec,Co1 Fe1 Mn1 Ti1 V1 Zr3
1182,Al1 Cu0.2 Li0.5 Mg1 Zn0.5,Sec,Al1 Cu0.2 Li0.5 Mg1 Zn0.5


In [None]:


# Label encode the 'Phase' column
label_encoder = LabelEncoder()
data['encoded_phase'] = label_encoder.fit_transform(data['Phase'])

# Split the data into training and test sets
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# Load pretrained BERT model and tokenizer
model_path = './results'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Load trained Random Forest model
clf = joblib.load('./results/random_forest_model.pkl')

# Function to predict probabilities using the ensemble model
def predictor(texts):
    # Prepare data for BERT
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encodings)
    bert_probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()

    # Prepare data for Random Forest
    METALLIC_ELEMENTS = ["Li", "Be", "B", "C", "N", "O", "Na", "Mg", "Al", "Si", "P", "S",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "Se",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Te",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm",
    "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Tl", "Pb", "Po", "Th", "Pa", "U"]
    
    def get_element_fraction(composition, element):
        if element in composition:
            remainder = composition[composition.index(element):]
            match = re.search(r"(\d+(\.\d+)?)", remainder)
            if match:
                return float(match.group(1))
        return 0.0

    X_rf = np.array([[get_element_fraction(text, el) for el in METALLIC_ELEMENTS] for text in texts])
    
    # Get Random Forest probabilities
    rf_probs = clf.predict_proba(X_rf)

    # Ensemble probabilities
    bert_weight = 0.3
    rf_weight = 0.7
    ensemble_probs = bert_probs * bert_weight + rf_probs * rf_weight

    return ensemble_probs

# Function to split text for LIME
def custom_split(text):
    return re.findall(r'([A-Z][a-z]*[0-9.]+)', text)

# Example to be explained
your_composition = "Co1 Cr1 Fe1 Mn1 Ni0.8 V1"
sorted_composition = custom_tokenize(your_composition)
sample_text = sorted_composition

# Initialize LIME explainer
explainer = LimeTextExplainer(class_names=label_encoder.classes_, split_expression=custom_split)

# Explain the instance
explanation = explainer.explain_instance(sample_text, predictor, num_features=len(label_encoder.classes_), top_labels=3)

# Show and save the explanation
explanation.show_in_notebook(text=sample_text)
explanation.save_to_file('A_explanation_output.html')

import matplotlib.pyplot as plt

# Extract LIME figures for each class label of interest
for label in explanation.top_labels:
    fig = explanation.as_pyplot_figure(label=label)
    plt.savefig(f"lime_explanation_{label_encoder.classes_[label]}.png", dpi=300)
    plt.close(fig)


In [20]:
##################################### LIME   ###########################
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import re
from lime.lime_text import LimeTextExplainer
import joblib
import torch
from torch.utils.data import Dataset

# Load the data
data = pd.read_csv('Final_Formated_and_cleaned_file_No_Features.csv')

# Function to tokenize and sort elements
def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    return ' '.join([''.join(pair) for pair in sorted_matches])

# Apply the function to the data
data['tokenized_elements'] = data['composition'].apply(custom_tokenize)

# Label encode the 'Phase' column
label_encoder = LabelEncoder()
data['encoded_phase'] = label_encoder.fit_transform(data['Phase'])

# Split the data into training and test sets
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# Load pretrained BERT model and tokenizer
model_path = './results'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Load trained Random Forest model
clf = joblib.load('./results/random_forest_model.pkl')

# Function to predict probabilities using the ensemble model
def predictor(texts):
    # Prepare data for BERT
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encodings)
    bert_probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()

    # Prepare data for Random Forest
    METALLIC_ELEMENTS = ["Li", "Be", "B", "C", "N", "O", "Na", "Mg", "Al", "Si", "P", "S",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "Se",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Te",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm",
    "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Tl", "Pb", "Po", "Th", "Pa", "U"]
    
    def get_element_fraction(composition, element):
        if element in composition:
            remainder = composition[composition.index(element):]
            match = re.search(r"(\d+(\.\d+)?)", remainder)
            if match:
                return float(match.group(1))
        return 0.0

    X_rf = np.array([[get_element_fraction(text, el) for el in METALLIC_ELEMENTS] for text in texts])
    
    # Get Random Forest probabilities
    rf_probs = clf.predict_proba(X_rf)

    # Ensemble probabilities
    bert_weight = 0.3
    rf_weight = 0.7
    ensemble_probs = bert_probs * bert_weight + rf_probs * rf_weight

    return ensemble_probs

# Function to split text for LIME
def custom_split(text):
    return re.findall(r'([A-Z][a-z]*[0-9.]+)', text)

# Example to be explained
your_composition = "Co1 Cr1 Fe1 Mn1 Ni0.8 V1"
sorted_composition = custom_tokenize(your_composition)
sample_text = sorted_composition

# Initialize LIME explainer
explainer = LimeTextExplainer(class_names=label_encoder.classes_, split_expression=custom_split)

# Explain the instance
explanation = explainer.explain_instance(sample_text, predictor, num_features=len(label_encoder.classes_), top_labels=3)

# Show and save the explanation
explanation.show_in_notebook(text=sample_text)
explanation.save_to_file('A_explanation_output.html')

import matplotlib.pyplot as plt

# Extract LIME figures for each class label of interest
for label in explanation.top_labels:
    fig = explanation.as_pyplot_figure(label=label)
    plt.savefig(f"lime_explanation_{label_encoder.classes_[label]}.png", dpi=300)
    plt.close(fig)


ModuleNotFoundError: No module named 'lime'