# Climate Misinfo - Notebook 1

Notebook by Jenna Sparks

In [None]:
# Load some EDA libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load some prelim ML libs
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization, Dense, Embedding, Input, Concatenate
from tensorflow.keras.models import Model

In [None]:
df = pd.read_parquet("hf://datasets/QuotaClimat/frugalaichallenge-text-train/train.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## General quality check

In [None]:
df.head()

Unnamed: 0,quote,label,source,url,language,subsource,id
0,"There is clear, compelling evidence that many ...",5_science_unreliable,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
1,"For most of the Holocene (last 10k years), sea...",1_not_happening,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,hamburg_test1,
2,"China, which hosts U.N. climate talks next wee...",4_solutions_harmful_unnecessary,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
3,And the fabricated documents (which Dr. Mann a...,0_not_relevant,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
4,It's going to be 42 here today and the hottest...,1_not_happening,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,hamburg_test3,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6091 entries, 0 to 6436
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   quote      6091 non-null   object
 1   label      6091 non-null   object
 2   source     6091 non-null   object
 3   url        6091 non-null   object
 4   language   6091 non-null   object
 5   subsource  1796 non-null   object
 6   id         0 non-null      object
dtypes: object(7)
memory usage: 509.7+ KB


In [None]:
df.nunique()

Unnamed: 0,0
quote,6091
label,8
source,2
url,780
language,1
subsource,10
id,0


In [None]:
df.language.value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
en,6091


## Check out the class imbalance

In [None]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0_not_relevant,1618
5_science_unreliable,801
6_proponents_biased,782
4_solutions_harmful_unnecessary,774
1_not_happening,741
2_not_human,702
3_not_bad,386
7_fossil_fuels_needed,287


## ML Baseline

### Prep Data

In [None]:
# Encode labels as integers - categorical cross entropy loss
df['label'] = df['label'].astype('category').cat.codes
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Verify the shapes
train_df.shape, val_df.shape, test_df.shape

((4263, 7), (914, 7), (914, 7))

### IDK if this should be done

In [None]:
# Combine text fields into one for simplicity
# Text features - use all the features?
text_features = ['quote', 'source', 'url', 'language', 'subsource', 'id']

train_df['combined_text'] = train_df[text_features].fillna('').agg(' '.join, axis=1)
val_df['combined_text'] = val_df[text_features].fillna('').agg(' '.join, axis=1)
test_df['combined_text'] = test_df[text_features].fillna('').agg(' '.join, axis=1)

# Verify new column
train_df.head()

Unnamed: 0,quote,label,source,url,language,subsource,id,combined_text
6216,"On the science of global climate change, I’m a...",5,Desmog,https://www.desmog.com/robert-bryce/,en,,,"On the science of global climate change, I’m a..."
690,the glaciers of Geenland have shown substantia...,1,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,hamburg_test3,,the glaciers of Geenland have shown substantia...
5824,"This, it seems to me, is the most sensible way...",4,Desmog,https://www.desmog.com/jeremy-clarkson/,en,,,"This, it seems to me, is the most sensible way..."
1773,Earth to looney left. A record cold winter is ...,1,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,,Earth to looney left. A record cold winter is ...
602,There hasn't been any sea level rise in the Sa...,1,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,,There hasn't been any sea level rise in the Sa...


### Vectorize for text

In [None]:
# Standardize input
max_vocab_size = 10000
max_sequence_length = 200

vectorizer = TextVectorization(max_tokens=max_vocab_size, output_sequence_length=max_sequence_length)
vectorizer.adapt(train_df['combined_text'].values)

### Build Model

In [None]:
# Input for text data
text_input = Input(shape=(1,), dtype=tf.string, name="text")
x = vectorizer(text_input)
x = Embedding(input_dim=max_vocab_size, output_dim=64)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) # Reduces sequence dimension
x = Dense(32, activation='relu')(x)

In [None]:
# Output layer
output = Dense(df['label'].nunique(), activation='softmax', name="label")(x)

# Model definition
model = Model(inputs=text_input, outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Train Model

In [None]:
train_data = tf.data.Dataset.from_tensor_slices((train_df['combined_text'], train_df['label'])).batch(32)
# val_data = tf.data.Dataset.from_tensor_slices((val_df['combined_text'], val_df['label'])).batch(32)

history = model.fit(train_data, epochs=20)

Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.2452 - loss: 2.0014
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2527 - loss: 1.9643
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2531 - loss: 1.9441
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2567 - loss: 1.9044
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.3085 - loss: 1.8272
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3573 - loss: 1.7193
Epoch 7/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.3870 - loss: 1.6216
Epoch 8/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4343 - loss: 1.5367
Epoch 9/20
[1m134/134[0m [32m━━━━━━━

In [None]:
val_data = tf.data.Dataset.from_tensor_slices((val_df['combined_text'], val_df['label'])).batch(32)
val_loss, val_acc = model.evaluate(val_data)

print(f"Val Accuracy: {val_acc:.2f}")

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5276 - loss: 1.2887
Test Accuracy: 0.54
