In [6]:
# beautifulsoup and requests to get content from url

from bs4 import BeautifulSoup
import requests

def get_text_from_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text()

text = get_text_from_html("https://lpdaac.usgs.gov/news/release-nasadem-data-products/")

In [7]:
import pandas as pd

# Replace 'your_excel_file.xlsx' with the actual file path of your Excel sheet.
excel_file_path = "ej_dataset.xlsx"

# Load the Excel sheet into a DataFrame.
df = pd.read_excel(excel_file_path, engine="openpyxl")
# Display the extracted information.
print(df)
#df remove none
df = df.dropna()

             Indicators                                        Description
0     Food Availability  The NASA Making Earth System Data Records for ...
1     Food Availability  The NASA Making Earth System Data Records for ...
2     Food Availability  The NASA Making Earth System Data Records for ...
3     Food Availability  The NASA Making Earth System Data Records for ...
4     Food Availability  The NASA Making Earth System Data Records for ...
..                  ...                                                ...
254      Urban Flooding  The Global Flood Hazard Frequency and Distribu...
255      Urban Flooding  Precipitation data from the GPM and TRMM missi...
256      Urban Flooding  The Precipitation Processing System (PPS) evol...
257      Urban Flooding  The Low Elevation Coastal Zone (LECZ) Urban-Ru...
258  Water Availability  This dataset contains gridded monthly global w...

[259 rows x 2 columns]


In [8]:
## encode labels
from sklearn.preprocessing import LabelEncoder

df.Indicators.unique()
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Indicators'])
# print the label mapping
for index, item in enumerate(label_encoder.classes_):
    print(item, '->', index)

Climate Change -> 0
Disasters -> 1
Extreme Heat -> 2
Food Availability -> 3
Health & Air Quality -> 4
Human Dimensions -> 5
Urban Flooding -> 6
Water Availability -> 7


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Load your domain-specific encoder model (replace 'model_name' with your model's name)
model_name = 'nasa-impact/nasa-smd-ibm-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Load your dataset into a DataFrame (replace 'your_data.csv' with your data file)
data = df

# Split the data into training and testing sets (80% for training, 20% for testing)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Indicators'])

# Use LabelEncoder to convert class labels (Indicators) to numerical labels
label_encoder = LabelEncoder()
train_data['labels'] = label_encoder.fit_transform(train_data['Indicators'])
test_data['labels'] = label_encoder.transform(test_data['Indicators'])

# Tokenize the descriptions and create dataset objects
train_encodings = tokenizer(list(train_data['Description']), return_tensors='pt')
test_encodings = tokenizer(list(test_data['Description']), return_tensors='pt')

train_dataset = CustomDataset(train_encodings, torch.tensor(train_data['labels'].tolist()))
test_dataset = CustomDataset(test_encodings, torch.tensor(test_data['labels'].tolist()))

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    output_dir="./output",
    num_train_epochs=10,
    save_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    logging_dir="./logs",
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: classification_report(p.label_ids, p.predictions.argmax(-1), output_dict=True),
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print classification report
print("Classification Report:")
print(results)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at nasa-impact/nasa-smd-ibm-v0.1 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:

# Save the trained model
model.save_pretrained('ej_classifier')  # Replace 'your_model_directory' with your desired directory
tokenizer.save_pretrained('ej_tokenizer')  # Save the tokenizer as well

('ej_tokenizer/tokenizer_config.json',
 'ej_tokenizer/special_tokens_map.json',
 'ej_tokenizer/vocab.json',
 'ej_tokenizer/merges.txt',
 'ej_tokenizer/added_tokens.json',
 'ej_tokenizer/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained('ej_classifier')  # Load from the directory where you saved it
tokenizer = AutoTokenizer.from_pretrained('ej_tokenizer')

# Get predictions
with torch.no_grad():
    outputs = model(**test_encodings)

# Extract predicted class labels
predicted_labels = torch.argmax(outputs.logits, dim=1)

predicted_class_labels = label_encoder.inverse_transform(predicted_labels.numpy())



Description: This monthly climatology data set contains a series of land surface parameters simulated from the Noah land-surface model (LSM) for Phase 2 of the North American Land Data Assimilation System (NLDAS-2).  The data are in 1/8th degree grid spacing. The temporal resolution is monthly, ranging from January to December. The NLDAS-2 monthly climatology data are the monthly data averaged over the thirty years (1980 - 2009) of the NLDAS-2 monthly data. The file format is WMO GRIB-1.
Predicted Label: Climate Change

Description: The NASA Making Earth System Data Records for Use in Research Environments (MEaSUREs) (https://earthdata.nasa.gov/community/community-data-system-programs/measures-projects) Global Food Security-support Analysis Data (GFSAD) data product provides cropland extent data over Southeast and Northeast Asia for nominal year 2015 at 30 meter resolution (GFSAD30SEACE). The monitoring of global cropland extent is critical for policymaking and provides important basel

Prediction: Climate Change, True Label: Extreme Heat
Prediction: Disasters, True Label: Food Availability
Prediction: Climate Change, True Label: Extreme Heat
Prediction: Disasters, True Label: Health & Air Quality
Prediction: Disasters, True Label: Disasters
Prediction: Disasters, True Label: Human Dimensions
Prediction: Disasters, True Label: Human Dimensions
Prediction: Disasters, True Label: Human Dimensions
Prediction: Climate Change, True Label: Health & Air Quality
Prediction: Climate Change, True Label: Health & Air Quality
Prediction: Disasters, True Label: Health & Air Quality
Prediction: Disasters, True Label: Urban Flooding
Prediction: Disasters, True Label: Health & Air Quality
Prediction: Disasters, True Label: Extreme Heat
Prediction: Disasters, True Label: Human Dimensions
Prediction: Disasters, True Label: Extreme Heat
Prediction: Climate Change, True Label: Health & Air Quality
Prediction: Disasters, True Label: Human Dimensions
Prediction: Climate Change, True Label: