In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset
from torch.utils.data import get_worker_info
from torch.utils.data import RandomSampler
from collections import Counter
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer



In [31]:
def data_iterator(csv_file='E:/ML/DS_fake_news/fake_news_cleaned.csv'):
    data = pd.read_csv(csv_file, usecols=['content', 'type'], chunksize=2000)
    label_map = {'bias': 0,
                        'clickbait': 1,
                        'conspiracy': 2,
                        'fake': 3,
                        'hate': 4,
                        'junksci': 5,
                        'political': 6,
                        'reliable': 7,
                        'rumor': 8,
                        'satire': 9,
                        'unreliable': 10}
    for chunk in data:
        # throw away rows with missing type
        chunk = chunk.dropna(subset=['type'])
        # drop rows with 'unknown' type
        chunk = chunk[chunk['type'] != 'unknown']
        chunk['type'] = chunk['type'].map(label_map)
        yield chunk

In [44]:
import pickle
# loading vocabulary from pickle file
vocab = pickle.load(open('E:/ML/DS_fake_news/vocab.pkl', 'rb'))
#convert to dictionary
vocab = {word: i for word, i in vocab}
# remove words that appear less than 2000 times
vocab = [word for word in vocab if vocab[word] > 1000]
# replace value with index
vocab = {word: i for i, word in enumerate(vocab)}


In [48]:
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(n_features=2**16)



In [94]:
data = data_iterator()

for chunk in data:
    # transform the text to tf-idf
    tfidf = vectorizer.transform(chunk['content'])
    labels = chunk['type']
    break

In [96]:


X_data = tfidf
y_data = labels

In [97]:
# split the dataset using sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42, stratify=y_data)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [102]:
# convert to coo matrix
X_train = X_train.tocoo()
X_test = X_test.tocoo()
X_val = X_val.tocoo()

In [104]:
# convert to torch tensors
X_train = torch.sparse_coo_tensor(torch.LongTensor([X_train.row, X_train.col]), torch.FloatTensor(X_train.data), X_train.shape)
X_test = torch.sparse_coo_tensor(torch.LongTensor([X_test.row, X_test.col]), torch.FloatTensor(X_test.data), X_test.shape)
X_val = torch.sparse_coo_tensor(torch.LongTensor([X_val.row, X_val.col]), torch.FloatTensor(X_val.data), X_val.shape)

In [None]:
from torch import nn
model = nn.Sequential(
             nn.Linear(X_train.shape[1], 64),
             nn.ReLU(),
             nn.Linear(64, len(set(y_train))),
             nn.LogSoftmax(dim=1))
# Define the loss
criterion = nn.NLLLoss()
# Forward pass, log  
logps = model(X_train)
# Calculate the loss with the logits and the labels
loss = criterion(logps, y_train)
loss.backward()
# Optimizers need parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
epochs = 50
for e in range(epochs):
    optimizer.zero_grad()
    output = model.forward(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        model.eval()
        log_ps = model(X_test)
        test_loss = criterion(log_ps, y_test)
        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
    
    print(f"Epoch {e+1}/{epochs}.. ",
            f"Train loss: {loss:.3f}.. ",
            f"Test loss: {test_loss:.3f}.. ",
            f"Test accuracy: {test_accuracy:.3f}")

Epoch 1/50..  Train loss: 7.141..  Test loss: 7.004..  Test accuracy: 0.430
Epoch 2/50..  Train loss: 6.946..  Test loss: 6.800..  Test accuracy: 0.447
Epoch 3/50..  Train loss: 6.739..  Test loss: 6.583..  Test accuracy: 0.442
Epoch 4/50..  Train loss: 6.520..  Test loss: 6.354..  Test accuracy: 0.432
Epoch 5/50..  Train loss: 6.288..  Test loss: 6.113..  Test accuracy: 0.421
Epoch 6/50..  Train loss: 6.044..  Test loss: 5.861..  Test accuracy: 0.420
Epoch 7/50..  Train loss: 5.789..  Test loss: 5.598..  Test accuracy: 0.434
Epoch 8/50..  Train loss: 5.524..  Test loss: 5.327..  Test accuracy: 0.471
Epoch 9/50..  Train loss: 5.250..  Test loss: 5.050..  Test accuracy: 0.497
Epoch 10/50..  Train loss: 4.971..  Test loss: 4.771..  Test accuracy: 0.512
Epoch 11/50..  Train loss: 4.689..  Test loss: 4.493..  Test accuracy: 0.526
Epoch 12/50..  Train loss: 4.410..  Test loss: 4.221..  Test accuracy: 0.545
Epoch 13/50..  Train loss: 4.137..  Test loss: 3.960..  Test accuracy: 0.547
Epoch 14

In [None]:
# update the data to the next 
