In [15]:
from sklearn.model_selection import train_test_split
def split_data(input, answers, test_size=0.1, val_size=0.1):
    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(input, answers, test_size=test_size+val_size, random_state=42, stratify=answers)
    # split the test data into test and validation
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_size/(test_size+val_size), random_state=42, stratify=y_test)
    return X_train, X_test, y_train, y_test, X_val, y_val

In [25]:
import pandas as pd
#loading data
skiprows = pd.read_parquet('E:/ML/DS_fake_news/skip_list.parquet')
df = pd.read_csv('E:/ML/DS_fake_news/fake_news_cleaned.csv', nrows=10000, usecols=['content', 'type'], skiprows=skiprows['bad_row_index'].tolist())
df = df[df['content'].apply(lambda x: isinstance(x, str))]


In [26]:

# clean the data
from cleantext import clean
import swifter
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )

from os import cpu_count
from joblib import Parallel, delayed
# clean the text
def clean_column(series):
    # parallelized operation
    return Parallel(n_jobs=cpu_count())(delayed(clean_text)(s) for s in series)

df['content'] = clean_column(df['content'])

# tokenization
from nltk.tokenize import word_tokenize
import nltk

def tokenize_column(series):
    # parallelized operation
    return Parallel(n_jobs=cpu_count())(delayed(word_tokenize)(s) for s in series)

df['content'] = pd.Series(tokenize_column(df['content']))

# stopword removal
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(s):
    return [w for w in s if not w in stop_words]

def remove_stopwords_column(series):
    return Parallel(n_jobs=cpu_count())(delayed(remove_stopwords)(s) for s in series)

df['content'] = pd.Series(remove_stopwords_column(df['content']))

# lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize(s):
    return [lemmatizer.lemmatize(w) for w in s]
def lemmatize_column(series):
    return Parallel(n_jobs=cpu_count())(delayed(lemmatize)(s) for s in series)

df['content'] = pd.Series(lemmatize_column(df['content']))

# remove punctuation
import string
def remove_punctuation(s):
    return [w for w in s if w not in string.punctuation]

def remove_punctuation_column(series):
    return Parallel(n_jobs=cpu_count())(delayed(remove_punctuation)(s) for s in series)

df['content'] = pd.Series(remove_punctuation_column(df['content']))



In [27]:
# make a word embedding for each word in the vocab
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

vocab = pd.read_pickle('E:/ML/DS_fake_news/vocab.pkl')
# filter out words that appear less than 2000 times
vocab = [(word, count) for word, count in vocab if count > 2000]
# add stopwords to the vocab
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))

#for word in stop_words:
#    vocab.append((word, 0))

vocab = [word for word, count in vocab]
word_to_ix = {vocab[i]: i for i in range(len(vocab))}
vocab = set(vocab)
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10


In [28]:
from nltk import word_tokenize
# computing ngrams
def ngrams(input, n):
    ngrams = [
        (
            [input[i - j - 1] for j in range(CONTEXT_SIZE)],
            input[i]
        )
        for i in range(CONTEXT_SIZE, len(input))]
    
    return ngrams

In [29]:
import torch.nn as nn

# define the model
class NGramLanguageModeler(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        #print embedding properties
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

# define loss function
losses = []
loss_function = nn.NLLLoss()
# run loss function on gpu
#if torch.cuda.is_available():
#    loss_function.cuda()

# initialize the model
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# gpu support
#if torch.cuda.is_available():
#    model.cuda()
#    print('Using GPU')


In [30]:

# train the model
for epoch in range(10):
    total_loss = 0
    for context, target in ngrams(df['content'][0], CONTEXT_SIZE):
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words using cuda
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        print(target)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["trump"]])

least
quantum
level
theory
recently
confirmed
set
researcher
finally
mean
test
john
wheeler
's
delayed-choice


KeyError: 'delayed-choice'

In [9]:
#splitting data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(bow, df['type'], test_size=0.1, val_size=0.1)

In [10]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [12]:
# multi layer classifier with pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# convert to tensors
X_train = torch.from_numpy(X_train.toarray()).float()
X_test = torch.from_numpy(X_test.toarray()).float()
X_val = torch.from_numpy(X_val.toarray()).float()
y_train = torch.from_numpy(y_train.values).long()
y_test = torch.from_numpy(y_test.values).long()
y_val = torch.from_numpy(y_val.values).long()


In [None]:

# create model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(10000, 1000)
        self.fc2 = nn.Linear(1000, 100)
        self.fc3 = nn.Linear(100, 4)
        self.dropout = nn.Dropout(0.2)
    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.fc3(x)
        return x
    
net = Net()
# use gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
# print where the model is running on (cpu or gpu)
print('running on', device)


In [None]:



# train model
batch_size = 100
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
epochs = 10
for e in range(epochs):
    running_loss = 0
    for images, labels in train_loader:
        optimizer.zero_grad()
        output = net(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(train_loader)}")

