In [37]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import re

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchmetrics import Accuracy
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split


### Подгружаем данные и модель Word2Vec

In [4]:
df_fake = pd.read_csv('data/Fake.csv')
df_true = pd.read_csv('data/True.csv')
df_fake['target'] = 0
df_true['target'] = 1
df = pd.concat([df_fake, df_true], ignore_index=True)
df = df.drop(['subject', 'date'], axis=1)
print(df.shape)
df.head()

(44898, 3)


Unnamed: 0,title,text,target
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


In [3]:
#Загрузим модель W2V
model_w2v = Word2Vec.load('output/models/model_w2v.model')
len(model_w2v.wv.key_to_index)

32257

### Создадим датасеты для train и valid, а также сделаем CustomDataloader для нейронки

In [10]:
df_train, df_valid = train_test_split(df, test_size=0.3, train_size=0.7, random_state=42, shuffle=True)
df_train.shape, df_valid.shape

((31428, 3), (13470, 3))

#### Test tokenize raw data

In [53]:
def tokenize(text_: str)->list:
        text_ = text_.replace("\n", " ") 
        tokenized_text = []
        # Итерируемся по каждому предложению в фалйе
        for i in sent_tokenize(text_):
            temp = []
            # Токенизируем предложения в слова
            sentence = i.lower()
            sentence = re.sub('[^a-zA-Z]', ' ', sentence)
            sentence=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",sentence)
            sentence=re.sub("(\\d|\\W)+"," ",sentence)
            
            for j in word_tokenize(sentence):
                temp.append(j.lower())
        
            tokenized_text.append(temp)
        return tokenized_text

In [94]:
def vectorize_text(row: pd.Series):
    series_list = []
    for idx in row.index:
        tokens = tokenize(row[idx])
        vector = np.zeros(model_w2v.vector_size)
        for sentence in tokens:
            for word in sentence:
                try:
                    vector += model_w2v.wv.get_vector(word)
                except:
                    pass
        
        features = pd.Series(data=vector, index=[idx+f'_{i}' for i in range(len(vector))])
        series_list.append(features)
        
    return pd.concat(series_list)

In [97]:
df_train[['title', 'text']].head(10).apply(vectorize_text, axis=1)

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,text_10,text_11,text_12,text_13,text_14,text_15,text_16,text_17,text_18,text_19
13970,-20.561163,1.705486,-2.813859,8.285939,-9.937233,36.796398,-0.300022,5.2739,5.686247,1.478673,...,-207.849375,-399.984856,128.510403,-106.04697,-1340.851631,332.76874,595.763102,-295.093025,-601.274971,-337.85213
41668,-39.902468,12.084899,-1.709602,-13.218677,-21.592071,-4.526084,-21.932969,1.423887,-1.258491,4.497511,...,79.655821,-28.165098,-99.04281,25.112562,-217.632894,-113.776709,-8.50068,-127.412714,-324.369708,-84.265618
26810,-48.529369,16.988462,0.704686,-2.346718,-13.877482,-7.049728,-13.525976,10.662228,13.997027,-3.210051,...,511.014646,298.127093,414.678216,103.258333,-384.669279,-409.957141,268.284043,-191.122515,-542.920393,70.823093
30967,-1.258231,14.229122,16.629794,5.558938,6.001331,38.216741,27.653911,6.158491,7.341925,2.906823,...,491.475686,-181.986034,67.300425,-266.28901,-209.300461,-355.221548,287.658472,-63.994527,144.846943,316.676905
26072,-42.545065,-6.151385,-44.947698,9.563522,-18.862579,-32.311658,4.719734,3.072655,15.712891,19.545215,...,-105.818652,76.588979,608.738456,488.648363,-525.601861,-112.235006,-9.093687,-1085.552581,-109.915153,-267.367391
7209,-26.003996,3.622691,19.844227,32.363963,7.756209,-20.215562,8.282713,12.835868,19.285243,-2.806621,...,230.882876,-31.972784,604.556259,264.453843,-518.395541,66.800142,264.832368,-433.785902,302.314514,265.590722
8575,-5.880494,21.249021,-1.470611,25.700341,-25.608693,19.32941,-8.853274,-21.062501,-10.99327,58.789575,...,191.248024,-392.947424,-14.272134,-65.698409,-516.6172,339.01604,450.008702,-9.012203,560.351363,112.212919
41384,-20.171666,-18.079981,6.122062,-10.063215,-2.608655,-0.822229,-16.906326,10.794742,10.210679,26.09218,...,161.258695,-236.590131,134.691902,-9.783641,38.266353,-140.475202,308.873206,-456.21343,-660.571145,92.515608
34361,-22.001598,-20.279675,-16.31998,18.673759,14.521915,-16.555194,15.947908,-13.456891,30.76441,19.614207,...,116.012522,452.386276,577.084155,218.051598,-421.430609,-458.197249,264.858352,-712.613166,383.070314,631.153352
29971,6.60097,1.272936,-11.103766,-26.819343,-4.800324,0.352383,28.931379,-26.712205,-10.233462,11.537558,...,1153.336848,821.638676,334.598121,-711.812696,-109.027652,-847.780918,627.2539,-638.205925,-886.325826,228.381205


#### Create CustomDataset

In [145]:
class CustomDataset(Dataset):

    def __init__(self, data_: pd.DataFrame, model_w2v_: Word2Vec):
        
        self.model = model_w2v_
        self.data = data_.reset_index(drop=True)
        
        self.vecorized_data = self.data[['title', 'text']].apply(self.vectorize_text, axis=1)
        self.target = self.data['target']
        self.columns = self.vecorized_data.columns.values
        return
    
    def tokenize(self, text_: str)->list:
        text_ = text_.replace("\n", " ") 
        tokenized_text = []
        # Итерируемся по каждому предложению в фалйе
        for i in sent_tokenize(text_):
            temp = []
            # Токенизируем предложения в слова
            sentence = i.lower()
            sentence = re.sub('[^a-zA-Z]', ' ', sentence)
            sentence=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",sentence)
            sentence=re.sub("(\\d|\\W)+"," ",sentence)
            
            for j in word_tokenize(sentence):
                temp.append(j.lower())
        
            tokenized_text.append(temp)
        return tokenized_text
    
    def vectorize_text(self, row: pd.Series):
        series_list = []
        for idx in row.index:
            tokens = self.tokenize(row[idx])
            vector = np.zeros(self.model.vector_size)
            for sentence in tokens:
                for word in sentence:
                    try:
                        vector += self.model.wv.get_vector(word)
                    except:
                        pass
            
            features = pd.Series(data=vector, index=[idx+f'_{i}' for i in range(len(vector))])
            series_list.append(features)
            
        return pd.concat(series_list)

    def __len__(self):
        return len(self.vecorized_data)

    def __getitem__(self, idx):
        #row = self.vecorized_data.take([idx], axis=0)
        #row = {col: torch.tensor(row[col].values, dtype=torch.float32) for i, col in enumerate(self.columns)}
        row = self.vecorized_data.iloc[idx].values
        return row, np.float32(self.target[idx])

In [150]:
BATCH_SIZE = 64

In [151]:
#TRAIN
train_dataset = CustomDataset(df_train, model_w2v)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
print('Create train dataset')

Create train dataset


In [152]:
#VALID
valid_dataset = CustomDataset(df_valid, model_w2v)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
print('Create test dataset')

Create test dataset


In [153]:
#Save tokenized texts
with open('output/train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)

#Save tokenized texts
with open('output/valid_loader.pkl', 'wb') as f:
    pickle.dump(valid_loader, f)

### CNN

In [154]:
#Load tokenized texts
with open('output/train_loader.pkl', 'rb') as f:
    loaded_train_loader = pickle.load(f)

#Load tokenized texts
with open('output/valid_loader.pkl', 'rb') as f:
    loaded_valid_loader = pickle.load(f)

In [333]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [393]:
class CNN(nn.Module):
    def __init__(self, kernel=3, len_features=None):
        super().__init__()
        self.conv1 = nn.Conv1d(BATCH_SIZE, BATCH_SIZE, kernel)
        self.bn1 = nn.BatchNorm1d(len_features-kernel+1)
        self.fc1 = nn.Linear(len_features-kernel+1, int(len_features/2))
        self.fc2 = nn.Linear(int(len_features/2), int(len_features/4))
        self.fc3 = nn.Linear(int(len_features/4), 2)
        self.soft_max = nn.Softmax(1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.tanh(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.soft_max(x)
        return x

In [489]:
EPOCHS = 10
LEARNING_RATE = 0.001

model_cnn = CNN(kernel=10, 
                len_features=len(loaded_train_loader.dataset.columns)).float().to(device=device)

loss = torch.nn.BCELoss().to(device)
accuracy = Accuracy(task='binary').to(device)
optimizer = optim.Adam(model_cnn.parameters(), lr=LEARNING_RATE)

print(model_cnn)

step = 0
for epoch in range(EPOCHS):
    model_cnn.train()
    for features, label in loaded_train_loader:
        
        optimizer.zero_grad()
        output = model_cnn(features.float().to(device))
        
        # Calculate error and backpropagate
        loss_val = loss(output, torch.stack([label, ((label-1)*(-1)).int()]).T.to(device))
        output = torch.sigmoid(output).to(device)
        loss_val.backward()                
        acc = accuracy(torch.argmin(output, dim=1).cpu(), torch.tensor(label, dtype=torch.int64)).item()
        
        # Update weights with gradients
        optimizer.step()
        step += 1

        if step % 50 == 0:
            print('\nEPOCH %d STEP %d : train_loss: %f train_acc: %f' %(epoch, step, loss_val.item(), acc))
            #print(torch.argmin(output, dim=1).cpu(), torch.tensor(label, dtype=torch.int64), sep='\n')
            
    # Run validation
    running_loss = []
    valid_scores = []
    valid_labels = []
    model_cnn.eval()
    with torch.no_grad():
        for features, label in valid_loader:
            output = model_cnn(features.float().to(device))
            # Calculate error and backpropagate
            try:
                loss_val = loss(output, torch.stack([label, ((label-1)*(-1)).int()]).T.to(device))
            except ValueError:
                print(output, label)
            
            running_loss.append(loss_val.item())
            valid_scores.extend(torch.argmin(output, dim=1).cpu())
            valid_labels.extend(label)

    valid_accuracy = accuracy(torch.tensor(valid_scores), torch.tensor(valid_labels)).item()
    print('valid_loss: %f valid_acc: %f' % (np.mean(running_loss), valid_accuracy))    

CNN(
  (conv1): Conv1d(64, 64, kernel_size=(10,), stride=(1,))
  (bn1): BatchNorm1d(31, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=31, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (soft_max): Softmax(dim=1)
)


  acc = accuracy(torch.argmin(output, dim=1).cpu(), torch.tensor(label, dtype=torch.int64)).item()



EPOCH 0 STEP 50 : train_loss: 0.681552 train_acc: 0.593750

EPOCH 0 STEP 100 : train_loss: 0.693022 train_acc: 0.500000

EPOCH 0 STEP 150 : train_loss: 0.672920 train_acc: 0.593750

EPOCH 0 STEP 200 : train_loss: 0.704829 train_acc: 0.406250

EPOCH 0 STEP 250 : train_loss: 0.681679 train_acc: 0.562500

EPOCH 0 STEP 300 : train_loss: 0.652214 train_acc: 0.656250

EPOCH 0 STEP 350 : train_loss: 0.645225 train_acc: 0.609375

EPOCH 0 STEP 400 : train_loss: 0.635160 train_acc: 0.593750

EPOCH 0 STEP 450 : train_loss: 0.678392 train_acc: 0.562500
valid_loss: 0.577851 valid_acc: 0.697321

EPOCH 1 STEP 500 : train_loss: 0.517160 train_acc: 0.765625

EPOCH 1 STEP 550 : train_loss: 0.509399 train_acc: 0.718750

EPOCH 1 STEP 600 : train_loss: 0.551452 train_acc: 0.718750

EPOCH 1 STEP 650 : train_loss: 0.498925 train_acc: 0.734375

EPOCH 1 STEP 700 : train_loss: 0.523480 train_acc: 0.750000

EPOCH 1 STEP 750 : train_loss: 0.556316 train_acc: 0.718750

EPOCH 1 STEP 800 : train_loss: 0.520104 trai

In [435]:
def tokenize(text_: str)->list:
        text_ = text_.replace("\n", " ") 
        tokenized_text = []
        # Итерируемся по каждому предложению в фалйе
        for i in sent_tokenize(text_):
            temp = []
            # Токенизируем предложения в слова
            sentence = i.lower()
            sentence = re.sub('[^a-zA-Z]', ' ', sentence)
            sentence=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",sentence)
            sentence=re.sub("(\\d|\\W)+"," ",sentence)
            
            for j in word_tokenize(sentence):
                temp.append(j.lower())
        
            tokenized_text.append(temp)
        return tokenized_text
    
def vectorize_text(row: pd.Series):
    series_list = []
    for idx in row.index:
        tokens = tokenize(row[idx])
        vector = np.zeros(model_w2v.vector_size)
        for sentence in tokens:
            for word in sentence:
                try:
                    vector += model_w2v.wv.get_vector(word)
                except:
                    pass
        
        features = pd.Series(data=vector, index=[idx+f'_{i}' for i in range(len(vector))])
        series_list.append(features)
        
    return pd.concat(series_list)

In [479]:
def predict(data: pd.DataFrame, model_cnn: nn.Module):
    vecorized_data = data[['title', 'text']].apply(vectorize_text, axis=1)
    input_vector = vecorized_data.values
    pred = model_cnn.cpu().forward(torch.tensor(input_vector).float())
    return torch.argmin(pred, dim=1).cpu().tolist()

In [505]:
df_to_test = df.sample(BATCH_SIZE)
df_to_test['predict'] = predict(df_to_test, model_cnn)
df_to_test

Unnamed: 0,title,text,target,predict
3873,Russia Brags About Helping Trump Win As Our E...,Russia and Vladimir Putin got what they wanted...,0,0
8555,SNL Parodies All-White Oscars In Brutal Smack...,"Last night, Saturday Night Live delivered a br...",0,0
33716,Senate proposal on encryption gives judges bro...,WASHINGTON (Reuters) - A bipartisan group of U...,1,1
10692,DONALD TRUMP JR Slams Kathy Griffin for Playin...,Donald Trump Jr slammed Kathy Griffin for play...,0,0
12284,TRUMP ADVISOR: “If election results are overtu...,,0,0
...,...,...,...,...
28271,"Trump, Koch brothers at odds over 'Trumpcare' ...",WASHINGTON (Reuters) - Republicans considering...,1,1
30683,Merkel silent on fourth term despite glowing w...,BERLIN (Reuters) - U.S. President Barack Obama...,1,1
5475,A Whopping 0% Of Black Voters In Ohio And Pen...,"That s right, according to the latest NBC News...",0,0
2226,Obama Just Gave The PERFECT Response To Donal...,When former President George W. Bush left offi...,0,0


In [506]:
acc = (df_to_test.target == df_to_test.predict).sum() / len(df_to_test) * 100
print(f'Точность на случайной выборке составила: {acc}%')

Точность на случайной выборке составила: 89.0625%
