In [None]:
!pip install transformers[torch] accelerate -U



In [None]:

import torch

# Probe for a GPU through PyTorch, the framework installed by the cell above
# (`transformers[torch]`) and used for the datasets and training below.
# Importing TensorFlow just for this check pulls in a package the install cell
# does not provide and initialises a second framework on the same device.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print('Found GPU at: {}'.format(device_name))
else:
    # Keep `device_name` defined (empty string) for any later cell that reads it.
    device_name = ''
    print('GPU device not found. Running on CPU...')


Found GPU at: /device:GPU:0


In [None]:

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import f1_score, precision_score, recall_score



In [None]:
# Labelled training table, read from the notebook's working directory.
# Later cells read its `second_phase` and `processing` columns and the
# per-element composition columns.
TRAINING_CSV = 'second_phase_prediction - 副本 -revision.csv'
df = pd.read_csv(TRAINING_CSV)

In [None]:

def ensure_string(x):
    """Return ``x`` as a string, mapping missing values to ``''``.

    A value is treated as missing when ``pd.isna(x)`` is truthy; any other
    value is converted with ``str``.
    """
    return '' if pd.isna(x) else str(x)

def _split_phases(raw):
    """Turn one raw ``second_phase`` cell into a list of phase labels.

    Strings are split on ``'+'`` and each token is stripped of surrounding
    whitespace, so ``'A + B'`` and ``'A+B'`` yield the same labels. Missing
    cells go through ``ensure_string`` and therefore become ``['']``.
    A cell that is already a list/tuple (this cell was run before) is reused
    instead of being stringified, which keeps the cell safe to re-run.
    """
    if isinstance(raw, (list, tuple)):
        tokens = raw
    else:
        tokens = ensure_string(raw).split('+')
    return [str(token).strip() for token in tokens]


# Replace the raw '+'-separated text with a list of labels per row.
df['second_phase'] = df['second_phase'].apply(_split_phases)

# One indicator column per distinct phase label; `mlb` is kept to map
# predictions back to label names later.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['second_phase'])

In [None]:

# Element columns, in the order their amounts are written into the text features.
alloy_elements = [
    'Sn', 'Ga', 'In', 'Al', 'La', 'Mn', 'Zn', 'Y', 'Zr',
    'Ca', 'Gd', 'Nd', 'Fe', 'Ni', 'Cu', 'Si', 'Sr',
]

# Composition values rendered as strings, one row per alloy.
composition_as_text = df[alloy_elements].astype(str)
X_alloy = composition_as_text.values

# Processing description rendered as a string, one entry per alloy.
processing_as_text = df['processing'].astype(str)
X_processing = processing_as_text.values

In [None]:

def alloy_to_text(alloy, elements=None):
    """Describe an alloy composition as a sentence fragment.

    Parameters
    ----------
    alloy : sequence
        Amounts, one per entry of ``elements``. Each entry must be accepted by
        ``float``; entries that are not strictly positive (including NaN) are
        left out. The entry is written into the text exactly as given.
    elements : sequence of str, optional
        Element symbols matching ``alloy`` position by position. Defaults to
        the notebook-level ``alloy_elements``.

    Returns
    -------
    str
        ``"Magnesium alloy containing "`` followed by the
        ``"<amount> wt.% <element>"`` parts joined with ``" and "``.

    Raises
    ------
    ValueError
        If ``alloy`` and ``elements`` differ in length (``zip`` would otherwise
        silently drop the surplus), or if an entry cannot be parsed by ``float``.
    """
    if elements is None:
        elements = alloy_elements
    if len(alloy) != len(elements):
        raise ValueError(
            f"expected {len(elements)} composition values, got {len(alloy)}"
        )
    components = [
        f"{percentage} wt.% {element}"
        for element, percentage in zip(elements, alloy)
        if float(percentage) > 0
    ]
    return "Magnesium alloy containing " + " and ".join(components)

# One composition sentence per row of X_alloy.
X_alloy_text = list(map(alloy_to_text, X_alloy))

# Append each row's processing description and coerce the result to str.
combined_features = [
    str(alloy_sentence + ". " + processing_note)
    for alloy_sentence, processing_note in zip(X_alloy_text, X_processing)
]

In [None]:

# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Pretrained GPT-2 tokenizer plus a GPT-2 sequence classifier with one output
# per column of the binarized label matrix `y`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=y.shape[1],
    # State the task explicitly instead of relying on the library inferring it
    # from the label dtype at the first training step.
    problem_type="multi_label_classification",
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Reuse the end-of-sequence token as the padding token so the tokenizer calls
# below can run with padding=True.
# NOTE(review): presumably needed because the GPT-2 tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
# Mirror the choice on the model config so the model sees the same padding id.
model.config.pad_token_id = model.config.eos_token_id


In [None]:

class CustomDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with label rows for use as a torch dataset.

    ``encodings`` is a mapping from field name to an indexable tensor and
    ``labels`` is an indexable collection with one label row per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every encoding field at
        ``idx`` plus a float tensor under ``'labels'``."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [None]:

# Tokenizer settings shared by both splits: truncate to at most 512 tokens,
# pad, and return PyTorch tensors.
tokenizer_options = dict(truncation=True, padding=True, max_length=512, return_tensors='pt')

train_encodings = tokenizer(X_train, **tokenizer_options)
test_encodings = tokenizer(X_test, **tokenizer_options)

# Wrap encodings and label rows so the Trainer can index them.
train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

In [None]:
import inspect

from transformers import TrainingArguments

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so choose whichever name the installed
# TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',          # checkpoints and final outputs
    num_train_epochs=12,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)



In [None]:

def compute_metrics(p):
    """Compute support-weighted F1, precision and recall for multi-label output.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds the model's raw per-label scores (logits) and
        ``label_ids`` the reference indicator matrix.

    Returns
    -------
    dict
        Keys ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    # The scores are logits, not probabilities: sigmoid(z) > 0.5 is equivalent
    # to z > 0, so the decision boundary for "probability above one half" is 0.
    # Thresholding the logits at 0.5 would demand a probability of about 0.62.
    preds = (p.predictions > 0.0).astype(int)
    # zero_division=0 keeps the score at 0 for labels with no true or predicted
    # samples, without emitting a warning for every such label.
    return {
        'f1': f1_score(p.label_ids, preds, average='weighted', zero_division=0),
        'precision': precision_score(p.label_ids, preds, average='weighted', zero_division=0),
        'recall': recall_score(p.label_ids, preds, average='weighted', zero_division=0),
    }


In [None]:

# Bundle the model, its training configuration, both datasets and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:

# Fine-tune the model with the Trainer configured above; as the last expression
# of the cell, the returned training summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0543,0.050218,0.0,0.0,0.0
2,0.0352,0.03645,0.142718,0.177406,0.147351
3,0.0193,0.028474,0.284705,0.434785,0.281457
4,0.0227,0.022588,0.468486,0.61035,0.415563
5,0.0207,0.020612,0.560596,0.675525,0.519868
6,0.0166,0.01811,0.637741,0.723314,0.600993
7,0.0121,0.016512,0.671094,0.754482,0.639073
8,0.0132,0.015884,0.726694,0.780926,0.721854
9,0.0094,0.014457,0.733054,0.802206,0.698675
10,0.0108,0.014104,0.766641,0.810694,0.75


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(averag

TrainOutput(global_step=7968, training_loss=0.037860904175537564, metrics={'train_runtime': 904.2831, 'train_samples_per_second': 17.623, 'train_steps_per_second': 8.811, 'total_flos': 1132151917215744.0, 'train_loss': 0.037860904175537564, 'epoch': 12.0})

In [None]:

# Final evaluation; the returned metrics are printed instead of being discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)

predictions = trainer.predict(test_dataset)
# The model returns raw logits: sigmoid(z) > 0.5 is equivalent to z > 0, so a
# label is predicted when its logit is positive (not when it exceeds 0.5).
preds = (predictions.predictions > 0.0).astype(int)

# Map indicator rows back to tuples of phase names.
pred_labels = mlb.inverse_transform(preds)

for i, (description, pred) in enumerate(zip(X_test, pred_labels)):
    print(f"Sample {i} description: {description}")
    # An empty tuple means no label was predicted; report it as 'unknown'.
    shown = pred if pred else ('unknown',)
    print(f"Predicted second phases: {shown}")
    print()

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sample 0 description: Magnesium alloy containing 0.59 wt.% Mn and 0.0024 wt.% Fe and 0.001 wt.% Ni and 0.0015 wt.% Cu and 0.0184 wt.% Si and 0.45 wt.% Sr. The alloy was prepared by the conventional gravity casting method and was in the as-cast state.
Predicted second phases: ('Mg17Sr2', 'Mn')

Sample 1 description: Magnesium alloy containing 4.53 wt.% Y and 0.49 wt.% Zr and 7.04 wt.% Gd and 1.29 wt.% Nd. The alloy was prepared by the conventional gravity casting method and was in the as-cast state.
Predicted second phases: ('Mg24(Gd,Y,Nd)5', 'Mg41(Gd,Y,Nd)5', 'Mg5(Gd,Y,Nd)')

Sample 2 description: Magnesium alloy containing 2.0 wt.% Zn and 0.4 wt.% Zr and 6.0 wt.% Gd. The alloy was homogenized at 535 °C for 12 h.
Predicted second phases: ('(Mg,Zn)3Gd',)

Sample 3 description: Magnesium alloy containing 0.81 wt.% Mn and 0.53 wt.% Zn and 3.85 wt.% Y and 8.93 wt.% Gd and 0.91 wt.% Ni. The alloy was homogenized at 500 °C for 10 h.
Predicted second phases: ('Mg12Ni(Gd,Y)', 'Mg12Zn(Gd,Y)')



In [None]:

# NOTE(review): called without arguments, so this presumably re-scores the
# Trainer's eval_dataset rather than train_dataset — confirm it is needed here.
trainer.evaluate()

train_predictions = trainer.predict(train_dataset)
# The model returns raw logits: sigmoid(z) > 0.5 is equivalent to z > 0, so a
# label is predicted when its logit is positive (not when it exceeds 0.5).
train_preds = (train_predictions.predictions > 0.0).astype(int)

# Map indicator rows back to tuples of phase names.
train_pred_labels = mlb.inverse_transform(train_preds)

for i, (description, pred) in enumerate(zip(X_train, train_pred_labels)):
    print(f"Sample {i} description: {description}")
    # An empty tuple means no label was predicted; report it as 'unknown'.
    shown = pred if pred else ('unknown',)
    print(f"Predicted second phases: {shown}")
    print()

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sample 0 description: Magnesium alloy containing 13.3 wt.% Al. The alloy was in the as-cast state.
Predicted second phases: ('Mg17Al12',)

Sample 1 description: Magnesium alloy containing 3.7 wt.% Zn and 2.06 wt.% Ca and 0.36 wt.% Gd. The alloy was in the as-cast state.
Predicted second phases: ('Ca2Mg6Zn3', 'Mg2Ca')

Sample 2 description: Magnesium alloy containing 3.0 wt.% Zn and 1.0 wt.% Y and 4.0 wt.% Cu. Direct extrusion was conducted at 310 °C with the extrusion ratio of 16:1 and a ram speed of 0.017 mm s-1.
Predicted second phases: ('Mg3Zn3Y2', 'MgZnCu')

Sample 3 description: Magnesium alloy containing 0.97 wt.% Sn and 2.08 wt.% In. The alloy was in the as-cast state.
Predicted second phases: ('Mg2Sn',)

Sample 4 description: Magnesium alloy containing 1.0 wt.% Al and 4.0 wt.% Y. The alloy was in the as-cast state.
Predicted second phases: ('Al11Y3', 'Al2Y')

Sample 5 description: Magnesium alloy containing 0.59 wt.% Mn and 1.65 wt.% Zn and 5.3 wt.% Y and 8.4 wt.% Gd. The alloy

In [None]:



# Unlabelled candidate alloys to score with the fine-tuned model.
unknown_df = pd.read_csv('alloy_data_with_dft_variance (5).csv')

# Same feature construction as for the training table: element amounts and
# processing description as strings, then one combined sentence per row.
X_unknown_alloy = unknown_df[alloy_elements].astype(str).values
X_unknown_processing = unknown_df['processing'].astype(str).values

X_unknown_alloy_text = [alloy_to_text(alloy) for alloy in X_unknown_alloy]

unknown_features = [
    str(alloy_sentence + ". " + processing_note)
    for alloy_sentence, processing_note in zip(X_unknown_alloy_text, X_unknown_processing)
]

unknown_encodings = tokenizer(unknown_features, truncation=True, padding=True, max_length=512, return_tensors='pt')

# CustomDataset needs a label row per sample; these all-zero rows are
# placeholders only. Every entry is the same list object, which is fine because
# the rows are only read.
dummy_labels = [[0]*y.shape[1]]*len(unknown_encodings['input_ids'])

unknown_dataset = CustomDataset(unknown_encodings, dummy_labels)

predictions = trainer.predict(unknown_dataset)
# The model returns raw logits: sigmoid(z) > 0.5 is equivalent to z > 0, so a
# label is predicted when its logit is positive (not when it exceeds 0.5).
preds = (predictions.predictions > 0.0).astype(int)

# Map indicator rows back to tuples of phase names.
pred_labels = mlb.inverse_transform(preds)

# Store the predicted phases in the '+'-separated form; rows with no predicted
# label are marked 'unknown'.
unknown_df['second_phase'] = ['+'.join(label) if label else 'unknown' for label in pred_labels]

unknown_df.to_csv('predicted_alloy_data.csv', index=False)

print(unknown_df[['Sn', 'Ga', 'In', 'Al', 'La', 'Mn', 'Zn', 'Y', 'Zr', 'Ca', 'Gd', 'Nd', 'Fe', 'Ni', 'Cu', 'Si', 'Sr','second_phase']])

In [None]:
# Show composition, processing description and predicted phases side by side.
columns_to_show = [
    'Sn', 'Ga', 'In', 'Al', 'La', 'Mn', 'Zn', 'Y', 'Zr',
    'Ca', 'Gd', 'Nd', 'Fe', 'Ni', 'Cu', 'Si', 'Sr',
    'processing', 'second_phase',
]
print(unknown_df[columns_to_show])

          Sn   Ga   In   Al   La   Mn   Zn    Y   Zr   Ca   Gd   Nd   Fe   Ni  \
0        0.5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1        1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2        1.5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3        2.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4        2.5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2668145  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2668146  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2668147  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2668148  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2668149  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

           Cu   Si   Sr    

In [None]:
# Write the prediction table to disk (same target file as the prediction cell).
# Requires `unknown_df` to exist in the current kernel session; on a fresh
# kernel this cell raises NameError until the prediction cell has been run.
unknown_df.to_csv('predicted_alloy_data.csv', index=False)

NameError: name 'unknown_df' is not defined