In [10]:
import pandas as pd
import numpy as np
import torch
from IPython.utils.process import abbrev_cwd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, f1_score
import torch.nn.functional as F

df = pd.read_csv('labeled_news_dataset_9.csv')

# Select necessary columns
df = df[['_id', 'author','hashtags','verified', 'sentiment', 'cleaned_text', 'fake_news_tag']].dropna()

# Rename columns for consistency
df.rename(columns={'cleaned_text': 'text', 'fake_news_tag': 'label'}, inplace=True)

# Remove duplicates based on text
df.drop_duplicates(subset=['text'], inplace=True)

# Shuffle the dataset
df_labeled = df.sample(frac=1).reset_index(drop=True)

# Display class distribution
print("Class Distribution:")
print(df_labeled['label'].value_counts())

texts = df_labeled['text'].tolist()
labels = df_labeled['label'].tolist()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, stratify=labels)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

model_name = "distilbert-base-uncased"  # Using DistilBERT for efficiency

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize training data
train_encodings = tokenizer(
    X_train,
    truncation=True,
    padding=True,
    max_length=256
)

# Tokenize testing data
test_encodings = tokenizer(
    X_test,
    truncation=True,
    padding=True,
    max_length=256
)

print("Tokenization complete!")

class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
train_dataset = FakeNewsDataset(train_encodings, y_train)
test_dataset = FakeNewsDataset(test_encodings, y_test)

device = torch.device("cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    no_cuda=True
)

# Define metric calculation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    return {
        'accuracy': (preds == labels).mean()
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

print("Starting training...")
trainer.train()

Class Distribution:
0    850
1    593
Name: label, dtype: int64
Training samples: 1154, Test samples: 289




Tokenization complete!
Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0872,0.265369,0.916955
2,0.2909,0.224082,0.923875
3,0.097,0.264021,0.920415


TrainOutput(global_step=291, training_loss=0.2949536610490729, metrics={'train_runtime': 651.5546, 'train_samples_per_second': 5.313, 'train_steps_per_second': 0.447, 'total_flos': 66282339700944.0, 'train_loss': 0.2949536610490729, 'epoch': 3.0})

In [39]:
# Save the trained model
trainer.save_model('./trained_model_v3')

df_full = pd.read_csv('../../data/processed/tweets_with_sentiment_vader.csv')

# Ensure necessary columns are present and drop NaNs
df_full = df_full[['_id', 'cleaned_text']].dropna()

print(f"Total tweets to predict: {len(df_full)}")

# Prepare the texts for prediction
texts_full = df_full['cleaned_text'].tolist()

# Tokenize the full dataset
encodings_full = tokenizer(
    texts_full,
    truncation=True,
    padding=True,
    max_length=256
)

# Create a dataset object for the full dataset
class FullDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

full_dataset = FullDataset(encodings_full)

# Create a DataLoader for the full dataset
from torch.utils.data import DataLoader

data_loader = DataLoader(full_dataset, batch_size=12)

# Load the trained model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('./trained_model_v3', num_labels=2)
model.eval()

# Ensure the model is on CPU
device = torch.device("cpu")
model.to(device)

# Initialize lists to store predictions and probabilities
all_preds = []
all_probs = []

# Run predictions on the full dataset
import numpy as np
import torch.nn.functional as F

with torch.no_grad():
    for batch in data_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Get model outputs
        outputs = model(**batch)
        logits = outputs.logits

        # Calculate probabilities
        probs = F.softmax(logits, dim=1)

        # Get the probabilities for the 'fake news' class (assuming label '1' is fake news)
        probs_fake = probs[:, 1].cpu().numpy()

        # Get predicted labels
        preds = torch.argmax(probs, dim=1).cpu().numpy()

        # Append results to the lists
        all_preds.extend(preds)
        all_probs.extend(probs_fake)

# Add predictions to the dataframe
df_full['fake_news_pred'] = all_preds
df_full['fake_news_prob'] = all_probs

# Save the results to a CSV file
df_full.to_csv('tweets_with_fake_news_predictions_final.csv', index=False)
print("Predictions saved to 'tweets_with_fake_news_predictions_final.csv'")

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "请求的操作无法在使用用户映射区域打开的文件上执行。" })

In [12]:
# Load the predictions
df_full = pd.read_csv('tweets_with_fake_news_predictions_final.csv')

# Calculate fake news statistics
fake_news_count = df_full['fake_news_pred'].sum()
fake_news_percentage = (fake_news_count / len(df_full))

fake_news_stats = {
    "Total Entries": len(df_full),
    "Fake News Count": fake_news_count,
    "Fake News Percentage": fake_news_percentage
}

print(pd.DataFrame([fake_news_stats]))

   Total Entries  Fake News Count  Fake News Percentage
0         158902              559              0.003518


In [13]:
import pandas as pd

df_full = pd.read_csv('tweets_with_fake_news_predictions.csv')

fake_news_count = df_full['fake_news_pred'].sum()
print(f"Total Fake News Count: {fake_news_count}")

highest_prob_fake_news = df_full[df_full['fake_news_pred'] == 1].nlargest(1, 'fake_news_prob')
print(f"Highest Probability Fake News:\n{highest_prob_fake_news[['cleaned_text', 'fake_news_prob']]}")

lowest_prob_fake_news = df_full[df_full['fake_news_pred'] == 1].nsmallest(1, 'fake_news_prob')
print(f"Lowest Probability Fake News:\n{lowest_prob_fake_news[['cleaned_text', 'fake_news_prob']]}")

highest_prob_real_news = df_full[df_full['fake_news_pred'] == 0].nlargest(1, 'fake_news_prob')
print(f"Highest Probability Real News (misclassified as fake):\n{highest_prob_real_news[['cleaned_text', 'fake_news_prob']]}")

lowest_prob_real_news = df_full[df_full['fake_news_pred'] == 0].nsmallest(1, 'fake_news_prob')
print(f"Lowest Probability Real News:\n{lowest_prob_real_news[['cleaned_text', 'fake_news_prob']]}")

Total Fake News Count: 14753
Highest Probability Fake News:
                                            cleaned_text  fake_news_prob
82980  question anyone following along australiafires...        0.999426
Lowest Probability Fake News:
                                             cleaned_text  fake_news_prob
147294  hmmm bushfire donation scottyfommarketing hori...         0.50005
Highest Probability Real News (misclassified as fake):
                                             cleaned_text  fake_news_prob
148868  sound like liarinchief prayersforaustralia aus...        0.499995
Lowest Probability Real News:
                                            cleaned_text  fake_news_prob
60361  firefighter need funding resource fight bushfi...        0.009818


In [15]:
import pandas as pd

# Load the predictions CSV file (adjust the file path as needed)
df_full = pd.read_csv('tweets_with_fake_news_predictions_final.csv')

# Define the intervals for probabilities
bin_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Use pandas cut function to categorize the fake news probabilities into bins
df_full['probability_bin'] = pd.cut(df_full['fake_news_prob'], bins=bin_edges)

# Calculate the frequency for each bin
probability_distribution = df_full['probability_bin'].value_counts().sort_index()

# Display the distribution in a tabular format
print(probability_distribution)


(0.0, 0.1]    22648
(0.1, 0.2]    80999
(0.2, 0.3]    49742
(0.3, 0.4]     4916
(0.4, 0.5]       38
(0.5, 0.6]       12
(0.6, 0.7]        7
(0.7, 0.8]        9
(0.8, 0.9]        7
(0.9, 1.0]      524
Name: probability_bin, dtype: int64


In [50]:
# Deliver to front end

import pandas as pd
import json
import numpy as np

df_fake_news = pd.read_csv('tweets_with_fake_news_predictions_final.csv')
df_original = pd.read_csv('../../data/processed/tweets_with_sentiment_vader.csv')

# Combine the dataset based on cleaned_text column
df_combined = pd.merge(df_fake_news, df_original, on='cleaned_text')

# Topic dict
topics = {
    1: "Wildlife and Environmental Impact",
    2: "Fundraising and Community Support",
    3: "Political Criticism and Government Response",
    4: "Climate Change Debate",
    5: "Emotional and Spiritual Reactions",
    6: "Geographical and Location-Based Information",
    7: "Regional Air Quality and Environmental Conditions",
    8: "Wildlife Devastation",
    9: "Emergency Information and Public Safety",
    10: "Health and Mental Well-being"
}

# Country dict
country_names = {
    'AF': 'Afghanistan', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa', 'AD': 'Andorra',
    'AO': 'Angola', 'AI': 'Anguilla', 'AQ': 'Antarctica', 'AG': 'Antigua and Barbuda', 'AR': 'Argentina',
    'AM': 'Armenia', 'AW': 'Aruba', 'AT': 'Austria', 'AZ': 'Azerbaijan', 'BS': 'Bahamas',
    'BH': 'Bahrain', 'BD': 'Bangladesh', 'BB': 'Barbados', 'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize',
    'BJ': 'Benin', 'BM': 'Bermuda', 'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina',
    'BW': 'Botswana', 'BR': 'Brazil', 'IO': 'British Indian Ocean Territory', 'BN': 'Brunei Darussalam',
    'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'KH': 'Cambodia', 'CM': 'Cameroon', 'CA': 'Canada',
    'CV': 'Cape Verde', 'KY': 'Cayman Islands', 'CF': 'Central African Republic', 'TD': 'Chad', 'CL': 'Chile',
    'CN': 'China', 'CX': 'Christmas Island', 'CC': 'Cocos (Keeling) Islands', 'CO': 'Colombia', 'KM': 'Comoros',
    'CG': 'Congo', 'CD': 'Congo, Democratic Republic', 'CK': 'Cook Islands', 'CR': 'Costa Rica', 
    'CI': "Côte d'Ivoire", 'HR': 'Croatia', 'CU': 'Cuba', 'CY': 'Cyprus', 'CZ': 'Czech Republic',
    'DK': 'Denmark', 'DJ': 'Djibouti', 'DM': 'Dominica', 'DO': 'Dominican Republic', 'EC': 'Ecuador', 
    'EG': 'Egypt', 'SV': 'El Salvador', 'GQ': 'Equatorial Guinea', 'ER': 'Eritrea', 'EE': 'Estonia', 
    'ET': 'Ethiopia', 'FK': 'Falkland Islands', 'FO': 'Faroe Islands', 'FJ': 'Fiji', 'FI': 'Finland', 'FR': 'France',
    'GF': 'French Guiana', 'PF': 'French Polynesia', 'GA': 'Gabon', 'GM': 'Gambia', 'GE': 'Georgia', 'DE': 'Germany',
    'GH': 'Ghana', 'GI': 'Gibraltar', 'GR': 'Greece', 'GL': 'Greenland', 'GD': 'Grenada', 'GP': 'Guadeloupe', 
    'GU': 'Guam', 'GT': 'Guatemala', 'GG': 'Guernsey', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana', 
    'HT': 'Haiti', 'VA': 'Holy See (Vatican City State)', 'HN': 'Honduras', 'HK': 'Hong Kong', 'HU': 'Hungary', 
    'IS': 'Iceland', 'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran', 'IQ': 'Iraq', 'IE': 'Ireland', 'IM': 'Isle of Man',
    'IL': 'Israel', 'IT': 'Italy', 'JM': 'Jamaica', 'JP': 'Japan', 'JE': 'Jersey', 'JO': 'Jordan', 'KZ': 'Kazakhstan',
    'KE': 'Kenya', 'KI': 'Kiribati', 'KR': 'South Korea', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan', 'LA': 'Laos',
    'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia', 'LY': 'Libya', 'LI': 'Liechtenstein', 
    'LT': 'Lithuania', 'LU': 'Luxembourg', 'MO': 'Macao', 'MK': 'North Macedonia', 'MG': 'Madagascar', 
    'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali', 'MT': 'Malta', 'MH': 'Marshall Islands',
    'MQ': 'Martinique', 'MR': 'Mauritania', 'MU': 'Mauritius', 'YT': 'Mayotte', 'MX': 'Mexico', 'FM': 'Micronesia',
    'MD': 'Moldova', 'MC': 'Monaco', 'MN': 'Mongolia', 'ME': 'Montenegro', 'MS': 'Montserrat', 'MA': 'Morocco', 
    'MZ': 'Mozambique', 'MM': 'Myanmar', 'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands', 
    'NC': 'New Caledonia', 'NZ': 'New Zealand', 'NI': 'Nicaragua', 'NE': 'Niger', 'NG': 'Nigeria', 'NU': 'Niue', 
    'NF': 'Norfolk Island', 'MP': 'Northern Mariana Islands', 'NO': 'Norway', 'OM': 'Oman', 'PK': 'Pakistan', 
    'PW': 'Palau', 'PS': 'Palestine', 'PA': 'Panama', 'PG': 'Papua New Guinea', 'PY': 'Paraguay', 'PE': 'Peru', 
    'PH': 'Philippines', 'PL': 'Poland', 'PT': 'Portugal', 'PR': 'Puerto Rico', 'QA': 'Qatar', 'RO': 'Romania',
    'RU': 'Russia', 'RW': 'Rwanda', 'WS': 'Samoa', 'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia',
    'SN': 'Senegal', 'RS': 'Serbia', 'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore', 'SK': 'Slovakia',
    'SI': 'Slovenia', 'SB': 'Solomon Islands', 'SO': 'Somalia', 'ZA': 'South Africa', 'ES': 'Spain', 'LK': 'Sri Lanka', 
    'SD': 'Sudan', 'SR': 'Suriname', 'SE': 'Sweden', 'CH': 'Switzerland', 'SY': 'Syria', 'TW': 'Taiwan', 'TJ': 'Tajikistan',
    'TZ': 'Tanzania', 'TH': 'Thailand', 'TL': 'Timor-Leste', 'TG': 'Togo', 'TK': 'Tokelau', 'TO': 'Tonga', 
    'TT': 'Trinidad and Tobago', 'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan', 'TC': 'Turks and Caicos Islands', 
    'TV': 'Tuvalu', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates', 'GB': 'United Kingdom', 'US': 'United States', 
    'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu', 'VE': 'Venezuela', 'VN': 'Vietnam', 'WF': 'Wallis and Futuna', 
    'EH': 'Western Sahara', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
    # Australia States and Territories (AU_STATES and AU_ALL are for front end calculation)
    'AU-NSW': 'New South Wales, Australia', 'AU-VIC': 'Victoria, Australia', 'AU-QLD': 'Queensland, Australia', 
    'AU-SA': 'South Australia, Australia', 'AU-WA': 'Western Australia, Australia', 'AU-TAS': 'Tasmania, Australia',
    'AU-NT': 'Northern Territory, Australia', 'AU-ACT': 'Australian Capital Territory, Australia'
    , 'AU':  'Australia'
}

# Find the region code based on location column
def map_region_code(location):
    country_code = location.split(',')[-1].strip()
    region_code = [code for code, name in country_names.items() if name == location]
    return region_code[0] if region_code else 'Unknown'

df_combined['region_code'] = df_combined['location'].apply(map_region_code)
df_combined['region_name'] = df_combined['location']

# Remove the time section and keep only the date section
df_combined['created_at'] = pd.to_datetime(df_combined['created_at']).dt.date

# Aggregate data by date and region
grouped_data = df_combined.groupby(['created_at', 'region_code', 'region_name']).agg(
    tweet_count=('cleaned_text', 'count'),
    fake_news_count=('fake_news_pred', 'sum')
).reset_index()

# Calculate the percentage of fake news
grouped_data['fake_news_ratio'] = (grouped_data['fake_news_count'] / grouped_data['tweet_count']) * 100

# Map the topic number to the corresponding name
df_combined['dominant_topic'] = df_combined['dominant_topic'].apply(lambda t: topics.get(t, "Unknown topic"))

# Create a new column for identifying the fake news topic
fake_news_topics = df_combined.groupby(['created_at', 'region_code', 'region_name'])['dominant_topic'].apply(lambda x: list(x.unique())).reset_index(name='fake_news_topics')
grouped_data = pd.merge(grouped_data, fake_news_topics, on=['created_at', 'region_code', 'region_name'])

# Format to .json file
output = {
    "metadata": {
        "start_date": str(df_combined['created_at'].min()),
        "end_date": str(df_combined['created_at'].max())
    },
    "data": []
}

# Cluster by day and format to JSON
for date, group in grouped_data.groupby('created_at'):
    locations = []
    for _, row in group.iterrows():
        locations.append({
            "region_code": row['region_code'],
            "region_name": row['region_name'],
            "tweet_count": int(row['tweet_count']),
            "fake_news_count": int(row['fake_news_count']),
            "fake_news_ratio": round(row['fake_news_ratio'], 4),
            "fake_news_topics": row['fake_news_topics']
        })
    output['data'].append({
        "date": str(date), 
        "locations": locations
    })


# Save the file
def convert_numpy_types(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.datetime64):
        return str(obj)
    else:
        return obj

with open('fake_news_data.json', 'w') as json_file:
    json.dump(output, json_file, indent=4, default=convert_numpy_types)

print("JSON file saved as 'fake_news_data.json'")


JSON file saved as 'fake_news_data.json'


In [51]:
# Check the result

import json

with open('fake_news_data.json', 'r') as json_file:
    data = json.load(json_file)

australia_data = []
for entry in data['data']:
    locations = [loc for loc in entry['locations'] if loc['region_code'].startswith('AU-')]
    if locations:
        australia_data.append({
            "date": entry['date'],
            "locations": locations
        })

print(json.dumps(australia_data[:15], indent=4))


[
    {
        "date": "2019-05-31",
        "locations": [
            {
                "region_code": "AU-TAS",
                "region_name": "Tasmania, Australia",
                "tweet_count": 1,
                "fake_news_count": 0,
                "fake_news_ratio": 0.0,
                "fake_news_topics": [
                    "Wildlife and Environmental Impact"
                ]
            }
        ]
    },
    {
        "date": "2019-09-07",
        "locations": [
            {
                "region_code": "AU-VIC",
                "region_name": "Victoria, Australia",
                "tweet_count": 1,
                "fake_news_count": 0,
                "fake_news_ratio": 0.0,
                "fake_news_topics": [
                    "Wildlife Devastation"
                ]
            }
        ]
    },
    {
        "date": "2019-09-15",
        "locations": [
            {
                "region_code": "AU-NSW",
                "region_name": "New South Wales, Au