In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset
from torch.utils.data import get_worker_info
from torch.utils.data import RandomSampler
from collections import Counter
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer



In [4]:
def data_iterator(csv_file='E:/ML/DS_fake_news/fake_news_cleaned.csv'):
    data = pd.read_csv(csv_file, usecols=['content', 'type'], chunksize=10000)
    label_map = {'bias': 0,
                        'clickbait': 0,
                        'conspiracy': 0,
                        'fake': 1,
                        'hate': 1,
                        'junksci': 1,
                        'political': 0,
                        'reliable': 0,
                        'rumor': 0,
                        'satire': 0,
                        'unreliable': 1}
    for chunk in data:
        # throw away rows with missing type
        chunk = chunk.dropna(subset=['type'])
        # drop rows with 'unknown' type
        chunk = chunk[chunk['type'] != 'unknown']
        chunk['type'] = chunk['type'].map(label_map)
        yield chunk

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(n_features=2**20, stop_words='english')



In [6]:
data = data_iterator()

for chunk in data:
    # transform the text to tf-idf
    tfidf = vectorizer.transform(chunk['content'])
    labels = chunk['type']
    # convert labels to tensor
    labels = torch.tensor(labels.values)
    break

In [38]:
# get chunk 2
for chunk in data:
    # transform the text to tf-idf
    tfidf = vectorizer.transform(chunk['content'])
    labels = chunk['type']
    # convert labels to tensor
    labels = torch.tensor(labels.values)
    break

In [7]:
X_data = tfidf
y_data = labels

In [8]:
label_map = {'bias': 0,
                        'clickbait': 1,
                        'conspiracy': 2,
                        'fake': 3,
                        'hate': 4,
                        'junksci': 5,
                        'political': 6,
                        'reliable': 7,
                        'rumor': 8,
                        'satire': 9,
                        'unreliable': 10}
# flip label_map
label_map = {v: k for k, v in label_map.items()}
# statistics with label names
label_count = Counter(y_data.numpy())
# map keys to label dict
label_count = {label_map[k]: v for k, v in label_count.items()}
print(label_count)

{'bias': 4908, 'clickbait': 4983}


In [9]:
label_count = Counter(y_data.numpy())
print(label_count)

# remove labels with less than 2 samples
for label in label_count:
    if label_count[label] < 11:
        X_data = X_data[y_data != label]
        y_data = y_data[y_data != label]

label_count = Counter(y_data.numpy())
print(label_count)

Counter({1: 4983, 0: 4908})
Counter({1: 4983, 0: 4908})


In [10]:
# split the dataset using sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42, stratify=y_data)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [11]:
# convert to coo matrix
X_train = X_train.tocoo()
X_test = X_test.tocoo()
X_val = X_val.tocoo()

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# convert to torch tensors
X_train = torch.sparse_coo_tensor(torch.LongTensor([X_train.row, X_train.col]), torch.FloatTensor(X_train.data), X_train.shape).to(device)
X_test = torch.sparse_coo_tensor(torch.LongTensor([X_test.row, X_test.col]), torch.FloatTensor(X_test.data), X_test.shape).to(device)
X_val = torch.sparse_coo_tensor(torch.LongTensor([X_val.row, X_val.col]), torch.FloatTensor(X_val.data), X_val.shape).to(device)

y_train = y_train.to(device)
y_test = y_test.to(device)
y_val = y_val.to(device)

  X_train = torch.sparse_coo_tensor(torch.LongTensor([X_train.row, X_train.col]), torch.FloatTensor(X_train.data), X_train.shape).to(device)


In [15]:
from torch import nn
model = nn.Sequential(
             nn.Linear(X_train.shape[1], 64),
             nn.ReLU(),
             nn.Dropout(0.2),
             nn.Linear(64, len(set(y_train))),
             nn.LogSoftmax(dim=1)).to(device)
# Define the loss
criterion = nn.NLLLoss()
# Forward pass, log  
logps = model(X_train)
# Calculate the loss with the logits and the labels
loss = criterion(logps, y_train)
loss.backward()
# Optimizers need parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

In [229]:
epochs = 100
for e in range(epochs):
    optimizer.zero_grad()
    output = model.forward(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        model.eval()
        log_ps = model(X_test)
        test_loss = criterion(log_ps, y_test)
        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
    
    print(f"Epoch {e+1}/{epochs}.. ",
            f"Train loss: {loss:.3f}.. ",
            f"Test loss: {test_loss:.3f}.. ",
            f"Test accuracy: {test_accuracy:.3f}")

Epoch 1/100..  Train loss: 2.423..  Test loss: 2.210..  Test accuracy: 0.148
Epoch 2/100..  Train loss: 2.225..  Test loss: 1.955..  Test accuracy: 0.280
Epoch 3/100..  Train loss: 1.961..  Test loss: 1.692..  Test accuracy: 0.632
Epoch 4/100..  Train loss: 1.687..  Test loss: 1.461..  Test accuracy: 0.693
Epoch 5/100..  Train loss: 1.445..  Test loss: 1.282..  Test accuracy: 0.747
Epoch 6/100..  Train loss: 1.254..  Test loss: 1.153..  Test accuracy: 0.768
Epoch 7/100..  Train loss: 1.114..  Test loss: 1.061..  Test accuracy: 0.774
Epoch 8/100..  Train loss: 1.010..  Test loss: 0.990..  Test accuracy: 0.792
Epoch 9/100..  Train loss: 0.927..  Test loss: 0.933..  Test accuracy: 0.793
Epoch 10/100..  Train loss: 0.857..  Test loss: 0.884..  Test accuracy: 0.797
Epoch 11/100..  Train loss: 0.796..  Test loss: 0.841..  Test accuracy: 0.798
Epoch 12/100..  Train loss: 0.740..  Test loss: 0.803..  Test accuracy: 0.803
Epoch 13/100..  Train loss: 0.690..  Test loss: 0.770..  Test accuracy: 0

In [None]:
torch.save(model.state_dict(), f'./models/tf-idf-{chunk}.pth')
        print(f'Saved model at chunk {chunk}')


In [16]:
# load model from file
model.load_state_dict(torch.load('./tf-idf-100.pth'))

<All keys matched successfully>

In [17]:
# test accuracy
with torch.no_grad():
    model.eval()
    log_ps = model(X_train)
    test_loss = criterion(log_ps, y_train)
    ps = torch.exp(log_ps)
    top_p, top_class = ps.topk(1, dim=1)
    equals = top_class == y_train.view(*top_class.shape)
    test_accuracy = torch.mean(equals.float())

In [18]:
print(f"Test loss: {test_loss:.3f}.. ", f"Test accuracy: {test_accuracy:.3f}")

Test loss: 2.358..  Test accuracy: 0.507


In [None]:
# %%
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import pandas as pd
from torch import nn

# %%
def data_iterator(csv_file='E:/ML/DS_fake_news/fake_news_cleaned.csv'):
    data = pd.read_csv(csv_file, usecols=['content', 'type'], chunksize=2000)
    label_map = {'bias': 0,
                        'clickbait': 0,
                        'conspiracy': 0,
                        'fake': 1,
                        'hate': 1,
                        'junksci': 1,
                        'political': 0,
                        'reliable': 0,
                        'rumor': 0,
                        'satire': 0,
                        'unreliable': 1}
    for chunk in data:
        # throw away rows with missing type
        chunk = chunk.dropna(subset=['type'])
        # drop rows with 'unknown' type
        chunk = chunk[chunk['type'].isin(label_map.keys())]
        chunk['type'] = chunk['type'].map(label_map)
        yield chunk

# %%
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(n_features=2**20, stop_words='english')



# %%
data = data_iterator()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
first = True
chunk_num = 0
for chunk in data:
    # transform the text to tf-idf
    try:
        tfidf = vectorizer.transform(chunk['content'])
    except:
        chunk_num += 1
        continue
    labels = chunk['type']
    # convert labels to tensor
    labels = torch.tensor(labels.values)


    # %%
    X_data = tfidf
    y_data = labels

    # %%
    # count each label
    label_count = Counter(y_data.numpy())

    # remove labels with less than 2 samples
    for label in label_count:
        if label_count[label] < 2:
            X_data = X_data[y_data != label]
            y_data = y_data[y_data != label]
    label_count = Counter(y_data.numpy())
    # check if any of the labels have 0 samples
    if len(label_count) < 2:
        chunk_num += 1
        continue
    # if the two classes are not balanced, skip chunk
    label_count = Counter(y_data.numpy())
    print(label_count)
    if label_count[0]/label_count[1] < 0.5 or label_count[0]/ label_count[1] > 2:
        chunk_num += 1
        continue


    print(Counter(y_data.numpy()))

    # %%
    # split the dataset using sklearn
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42, stratify=y_data)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

    # %%
    # convert to coo matrix
    X_train = X_train.tocoo()
    X_test = X_test.tocoo()
    X_val = X_val.tocoo()

    # %%

    # convert to torch tensors
    X_train = torch.sparse_coo_tensor(torch.LongTensor([X_train.row, X_train.col]), torch.FloatTensor(X_train.data), X_train.shape).to(device)
    X_test = torch.sparse_coo_tensor(torch.LongTensor([X_test.row, X_test.col]), torch.FloatTensor(X_test.data), X_test.shape).to(device)
    X_val = torch.sparse_coo_tensor(torch.LongTensor([X_val.row, X_val.col]), torch.FloatTensor(X_val.data), X_val.shape).to(device)

    y_train = y_train.to(device)
    y_test = y_test.to(device)
    y_val = y_val.to(device)

    # %%
    # run once only

    if first:
        model = nn.Sequential(
                    nn.Linear(X_train.shape[1], 128),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                    nn.Linear(128, 64),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                    nn.Linear(64, len(set(y_train))),
                    nn.LogSoftmax(dim=1)).to(device)
        # Define the loss
        criterion = nn.NLLLoss()
        # Forward pass, log  
        logps = model(X_train)
        # Calculate the loss with the logits and the labels
        loss = criterion(logps, y_train)
        loss.backward()
        # Optimizers need parameters to optimize and a learning rate
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        first = False

    # %%
    epochs = 20
    for e in range(epochs):
        optimizer.zero_grad()
        output = model.forward(X_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            model.eval()
            log_ps = model(X_test)
            test_loss = criterion(log_ps, y_test)
            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == y_test.view(*top_class.shape)
            test_accuracy = torch.mean(equals.float())
        
        print(f"Epoch {e+1}/{epochs}.. ",
                f"Train loss: {loss:.3f}.. ",
                f"Test loss: {test_loss:.3f}.. ",
                f"Test accuracy: {test_accuracy:.3f}")
    chunk_num += 1
    print(f'Finished chunk {chunk_num}')
    # save model every 10 chunks
    if chunk_num % 100 == 0:
        torch.save(model.state_dict(), f'./tf-idf-{chunk_num}.pth')
        print(f'Saved model at chunk {chunk_num}')
torch.save(model.state_dict(), f'./tf-idf-{finished}.pth')
print(f'Saved model at chunk {finished}')