In [None]:
!pip install matminer
!pip install skorch

In [None]:
from matminer.datasets.convenience_loaders import load_mp
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

# Homework: Neural Networks

## Material Property Prediction: Bulk and Shear Modulus

Reference: Dunn, A., Wang, Q., Ganose, A. et al. <a href='https://www.nature.com/articles/s41524-020-00406-3'>Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm</a>. npj Comput Mater 6, 138 (2020).



In [None]:
# Magpie-preset element-property featurizer; it consumes the "composition"
# column that the StrToComposition step adds to the frame.
ep_feat = ElementProperty.from_preset(preset_name="magpie")

# Build the modelling table in one pipeline:
#   1. load the dataset into a pandas DataFrame,
#   2. convert the "formula" column into a composition,
#   3. create features from that composition,
#   4. drop every row that still contains a NaN value.
# NOTE(review): ignore_errors=True presumably lets featurization continue past
# entries that fail; the final dropna then removes incomplete rows - confirm.
df = (
    load_mp()
    .pipe(StrToComposition().featurize_dataframe, "formula", ignore_errors=True)
    .pipe(ep_feat.featurize_dataframe, col_id="composition", ignore_errors=True)
    .dropna(axis="index")
)
df.head()

In [None]:
# Regression targets, one row per material.
# Column order is [shear modulus, bulk modulus], i.e. y[:, 0] is G and y[:, 1] is K.
y = df[['shear modulus', 'bulk modulus']].values

# Columns removed from the model inputs: the two targets plus the remaining
# columns listed here, which are not used as features.
excluded = ['shear modulus', 'bulk modulus', 'formula', 'composition', 'mpid',
            'e_hull', 'gap pbe', 'elastic anisotropy', 'e_form']
X = df.drop(columns=excluded).values

# Standardize input data: zero mean / unit variance for each FEATURE (column),
# computed across samples (axis=0). Reducing over axis=-1 would instead
# normalize every sample across its own, differently-scaled features.
feature_mean = X.mean(axis=0, keepdims=True)
feature_std = X.std(axis=0, keepdims=True)
# A constant feature has zero spread; divide by 1 so it maps to 0 instead of NaN.
feature_std[feature_std == 0] = 1.0
X = (X - feature_mean) / feature_std

# Convert numpy arrays to float32 pytorch tensors
X = torch.tensor(X).float()
y = torch.tensor(y).float()
print(X.shape, y.shape)

## Question 1: Visualize the data
Visualize and comment on the data distributions of the two tasks, `bulk modulus` (K) and `shear modulus` (G). Then apply a log-transformation to both targets and re-plot the distributions.

## Question 2: Neural network hyperparameter optimization

Split the dataset into 90% training and 10% testing. Build a neural network with 2 hidden layers and `nn.ReLU()` activation functions. Use `skorch` to optimize the parameters of the neural network using the log-transformed data. To the best of your ability, tune the dimension of the hidden layers and the learning rate to optimize the validation score. Feel free to also make the model deeper or add regularization such as batch normalization or dropout.  

## Question 3: Evaluate performance

Determine the test set performance ($R^2$ score and mean absolute error) of your best-performing model for the shear modulus (log G) and bulk modulus (log K) predictions. Compare with the results presented in the reference (see Figure 4).