# MIND_textCNN Part1

### Import package

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable

import tqdm
import os
import time
import re
import numpy as np
import pandas as pd
import string
import time
import random
import snowballstemmer
import collections
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from itertools import chain
from sklearn.metrics import accuracy_score

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

import matplotlib.pyplot as plt 
%matplotlib inline

### Import data

In [70]:
train_news = pd.read_csv("data/Small_data_set/MINDsmall_train/news.tsv", sep="\t", header=None)
train_behaviors = pd.read_csv("data/Small_data_set/MINDsmall_train/behaviors.tsv", sep="\t", header=None)

dev_news = pd.read_csv("data/Small_data_set/MINDsmall_dev/news.tsv", sep="\t", header=None)
dev_behaviors = pd.read_csv("data/Small_data_set/MINDsmall_dev/behaviors.tsv", sep="\t", header=None)

In [71]:
columns = ["News_ID","Category","Subcategory","News_Title","News_Abstrct",
           "News_Url","Entities_in_News_Title","Entities_in_News_Abstract"]
train_news.columns = columns
train_news.head(3)

Unnamed: 0,News_ID,Category,Subcategory,News_Title,News_Abstrct,News_Url,Entities_in_News_Title,Entities_in_News_Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [72]:
columns = ["Impression ID","User ID","Impression Time","User Click History","Impression New"]
train_behaviors.columns = columns
train_behaviors.head(3)

Unnamed: 0,Impression ID,User ID,Impression Time,User Click History,Impression New
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...


# Cleaning Data

In [73]:
def nlp_preprocessing(text):
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    # Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text


example = "Facebook and Google news law passed in Australia"
nlp_preprocessing(text=example)

'facebook google news law passed australia'

### train_news  (news)

In [74]:
train_news.head(1)

Unnamed: 0,News_ID,Category,Subcategory,News_Title,News_Abstrct,News_Url,Entities_in_News_Title,Entities_in_News_Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]


In [75]:
# sum Category, Subcategory, News_Title, and News_Abstrct into "body"
train_news['body'] = train_news["Category"] + " " +train_news["Subcategory"]  + " " + train_news["News_Title"]  + " " +train_news["News_Abstrct"]
news = train_news[['News_ID','body']]
news.head(3)

Unnamed: 0,News_ID,body
0,N55528,lifestyle lifestyleroyals The Brands Queen Eli...
1,N19639,health weightloss 50 Worst Habits For Belly Fa...
2,N61837,news newsworld The Cost of Trump's Aid Freeze ...


### train_behaviors (behaviors)

In [76]:
train_behaviors.head(1)

Unnamed: 0,Impression ID,User ID,Impression Time,User Click History,Impression New
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0


In [118]:
behaviors = train_behaviors.copy()
behaviors['Impression New'] = behaviors['Impression New'].apply(lambda x: x.split())
behaviors = behaviors.explode('Impression New')
behaviors['label'] = behaviors['Impression New'].apply(lambda x: x.split("-")[1])
behaviors['Impression New'] = behaviors['Impression New'].apply(lambda x: x.split("-")[0])

In [119]:
behaviors.head(3)

Unnamed: 0,Impression ID,User ID,Impression Time,User Click History,Impression New,label
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689,1
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N35729,0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678,0


### Treatment of imbalance sample 
Another important feature of the MIND data is the imbalance category sample, with a very low positive sample rate of only 4%.

There are many strategies to deal with samples with  imbalance category sample , such as

- Positive sample weighting
- Positive sample oversampling
- Negative sample downsampling

Considering the training sample size is very large,  the model calculation is time-consuming, and other factors and experimental results. We used a random sampling of negative samples without put-back, dividing the negative samples of the original training data into 5 parts, and constructing 5 different training datasets with all positive samples, Dataset0-4, each with a positive sample rate of about 16.5%.

In [120]:
Positive_Sample_Rate = behaviors.groupby("label").count()
Positive_Sample_Rate = Positive_Sample_Rate/behaviors.shape[0]
Positive_Sample_Rate = Positive_Sample_Rate.reset_index()
Positive_Sample_Rate = Positive_Sample_Rate.iloc[:,0:2]
Positive_Sample_Rate.columns = ['label','percentage']
Positive_Sample_Rate

Unnamed: 0,label,percentage
0,0,0.959554
1,1,0.040446


### Shuffle the behaviors_0 data to 10 part

In [131]:
behaviors['label'] = behaviors['label'].astype('int')
behaviors_1 = behaviors[behaviors['label']==1]
behaviors_0 = behaviors[behaviors['label']==0]

from sklearn.utils import shuffle
behaviors_0 = shuffle(behaviors_0, random_state=0)

In [138]:
count = int(behaviors_0.shape[0]/10)
dataset0 = behaviors_0.iloc[0:count,:]
dataset1 = behaviors_0.iloc[count*1:count*2,:]
dataset2 = behaviors_0.iloc[count*2:count*3,:]
dataset3 = behaviors_0.iloc[count*3:count*4,:]
dataset4 = behaviors_0.iloc[count*4:count*5,:]

dataset0 = pd.concat([dataset0, behaviors_1])

In [140]:
Positive_Sample_Rate_0 = dataset0.groupby("label").count()
Positive_Sample_Rate_0 = Positive_Sample_Rate_0/dataset0.shape[0]
Positive_Sample_Rate_0 = Positive_Sample_Rate_0.reset_index()
Positive_Sample_Rate_0 = Positive_Sample_Rate_0.iloc[:,0:2]
Positive_Sample_Rate_0.columns = ['label','percentage']
Positive_Sample_Rate_0

Unnamed: 0,label,percentage
0,0,0.703478
1,1,0.296522


### Combine news data into log behaviors

In [144]:
news.head(1)

Unnamed: 0,News_ID,body
0,N55528,lifestyle lifestyleroyals The Brands Queen Eli...


In [148]:
from sklearn.utils import shuffle
dataset0 = shuffle(dataset0, random_state=0)
dataset0.head(3)

Unnamed: 0,Impression ID,User ID,Impression Time,User Click History,Impression New,label
18714,18715,U30284,11/10/2019 12:39:52 PM,N22279 N39074 N20263 N39690 N10977 N50985 N240...,N48657,1
45814,45815,U33584,11/11/2019 10:14:59 PM,N28936 N45020 N55925 N48369 N15270 N7930 N2959...,N62395,1
2971,2972,U38515,11/13/2019 8:36:50 AM,N1150 N55846 N1569 N40962 N36739 N8419 N5696 N...,N13907,0


In [153]:
dataset0['news_lst'] = dataset0['User Click History'] + " " + dataset0['Impression New']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset0['news_lst'] = dataset0['User Click History'] + " " + dataset0['Impression New']


In [160]:
news.to_csv("news.csv", encoding="utf-8") 
dataset0.to_csv("dataset0.csv", encoding="utf-8") 

### word embedding

Since this dataset is very small, there is a risk of overfitting if we do word embedding with this dataset, and the model will not be generalized, so we pass in a pre-trained word embedding. The data used glove-twitter-100 with 387MB and 100 dimensions.

In [None]:
data0 = dataset0[['news_lst','label']]

(797054, 7)