In [41]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.lm import Vocabulary
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from cleantext import clean
nltk.download('punkt') 
nltk.download('stopwords') 



[nltk_data] Downloading package punkt to /home/andreas-linus-thalund-
[nltk_data]     midtgaard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andreas-linus-
[nltk_data]     thalund-midtgaard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Part 1

### Task 1

In [42]:
#load data
dataPath = "../data/"
newsSample = pd.read_csv(dataPath + "news_sample.csv")
nsdf = pd.DataFrame(newsSample)
nsdf = nsdf.reset_index(drop=True)  # Reset index??
print(nsdf.info())   # Check column types and missing values



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        250 non-null    int64  
 1   id                250 non-null    int64  
 2   domain            250 non-null    object 
 3   type              238 non-null    object 
 4   url               250 non-null    object 
 5   content           250 non-null    object 
 6   scraped_at        250 non-null    object 
 7   inserted_at       250 non-null    object 
 8   updated_at        250 non-null    object 
 9   title             250 non-null    object 
 10  authors           170 non-null    object 
 11  keywords          0 non-null      float64
 12  meta_keywords     250 non-null    object 
 13  meta_description  54 non-null     object 
 14  tags              27 non-null     object 
 15  summary           0 non-null      float64
dtypes: float64(2), int64(2), object(12)
memory u

In [43]:
# unique lable values
unique_values = nsdf['type'].unique()
print(unique_values)

['unreliable' 'fake' 'clickbait' 'conspiracy' 'reliable' 'bias' 'hate'
 'junksci' 'political' nan 'unknown']


In [44]:
#nan and unknown removed as they seem useless when training a classifier
nsdf = nsdf.dropna(subset=['type'])
nsdf = nsdf.loc[nsdf['type']!='unknown']
newunique_values = nsdf['type'].unique()
print(newunique_values)


['unreliable' 'fake' 'clickbait' 'conspiracy' 'reliable' 'bias' 'hate'
 'junksci' 'political']


Cleaning and Preprocessing

In [45]:
def clean_text_help(text):
    if isinstance(text, str):
        return clean(text, lower=True, replace_with_url="<URL>", replace_with_email="<EMAIL>", replace_with_number="<NUMBER>")
    return text  # Return unchanged if not a string

def cleanText(data, column):
    data = data.reset_index(drop=True)  # Reset index
    data[column] = data[column].apply(clean_text_help)  # Apply function
    return data

In [46]:
nsdf_cleaned = cleanText(nsdf, 'content')

In [47]:
#Tokenize the text function
def tokenize_text_help(text):
    if isinstance(text, str):
        return word_tokenize(text)
    return text  # Return unchanged if not a string

def tokenizeText(data, column):
    data[column] = data[column].apply(tokenize_text_help)  # Apply function
    return data

In [48]:
#function for removeing stopwords
def remove_stopwords_help(text):
    stop_words = set(stopwords.words('english'))  # Load stopwords
    if isinstance(text, str):
        return [word for word in text.at[0, 'content'] if not word.lower() in stop_words]
    return text  # Return unchanged if not a string

def remove_stopwords(data, column):
    data[column] = data[column].apply(remove_stopwords_help)  # Apply function
    return data

In [49]:
#funtion for populating vocabulary
def populate_vocabulary(data):
    N = data.shape[0]  # Get the number of rows
    allWords = []
    for i in range(N):
        if isinstance(data.at[i, 'content'], str):  # Ensure it's a string
            allWords.append(data.at[i, 'content'])
    return Vocabulary(allWords, unk_cutoff=2)

#langsom k√∏rertid men kunne ikke finde ud af det med apply. Nogne med en god ide??


In [50]:
nsdf_tokenized = tokenizeText(nsdf_cleaned, 'content')                  #tokenizing
nsdf_preprocessed = remove_stopwords(nsdf_tokenized, 'content')           #removing stopwords
print(nsdf_preprocessed.at[0, 'content'])

['sometimes', 'the', 'power', 'of', 'christmas', 'will', 'make', 'you', 'do', 'wild', 'and', 'wonderful', 'things', '.', 'you', 'do', 'not', 'need', 'to', 'believe', 'in', 'the', 'holy', 'trinity', 'to', 'believe', 'in', 'the', 'positive', 'power', 'of', 'doing', 'good', 'for', 'others', '.', 'the', 'simple', 'act', 'of', 'giving', 'without', 'receiving', 'is', 'lost', 'on', 'many', 'of', 'us', 'these', 'days', ',', 'as', 'worries', 'about', 'money', 'and', 'success', 'hold', 'us', 'back', 'from', 'giving', 'to', 'others', 'who', 'are', 'in', 'need', '.', 'one', 'congregation', 'in', 'ohio', 'was', 'moved', 'to', 'action', 'by', 'the', 'power', 'of', 'a', 'sermon', 'given', 'at', 'their', 'church', 'on', 'christmas', 'eve', '.', 'the', 'pastor', 'at', 'grand', 'lake', 'united', 'methodist', 'church', 'in', 'celina', ',', 'ohio', 'gave', 'an', 'emotional', 'sermon', 'about', 'the', 'importance', 'of', 'understanding', 'the', 'message', 'of', 'jesus', '.', 'for', 'many', 'religious', 'pe

In [52]:
#size of vocabulary
vocabulary = populate_vocabulary(nsdf_preprocessed)

### Task 2

In [None]:
#load data

fakeNewsCorpus = pd.read_csv(dataPath + "995,000_rows.csv")
#Hva saten er den der unnamed???
print(fakeNewsCorpus.head())

Visualization

### Task 3

In [43]:
print(fakeNewsCorpus.info())   # Check column types and missing values
fndf = pd.DataFrame(fakeNewsCorpus)
fndf = fndf.reset_index(drop=True)  # Reset index??


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995000 entries, 0 to 994999
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        994999 non-null  object 
 1   id                994993 non-null  object 
 2   domain            994989 non-null  object 
 3   type              947214 non-null  object 
 4   url               994989 non-null  object 
 5   content           994988 non-null  object 
 6   scraped_at        994987 non-null  object 
 7   inserted_at       994987 non-null  object 
 8   updated_at        994987 non-null  object 
 9   title             986394 non-null  object 
 10  authors           552243 non-null  object 
 11  keywords          0 non-null       float64
 12  meta_keywords     956210 non-null  object 
 13  meta_description  469894 non-null  object 
 14  tags              230919 non-null  object 
 15  summary           0 non-null       float64
 16  source            21

### Cleaning and Preprocessing

In [48]:
print("Pandas DataFrame:")
display(fndf.iloc[0])


Pandas DataFrame:


Unnamed: 0                                                        732
id                                                          7444726.0
domain                                             nationalreview.com
type                                                        political
url                 http://www.nationalreview.com/node/152734/%E2%...
content             Plus one article on Google Plus\n\n(Thanks to ...
scraped_at                                 2017-11-27T01:14:42.983556
inserted_at                                2018-02-08 19:18:34.468038
updated_at                                 2018-02-08 19:18:34.468066
title                                              Iran News Round Up
authors                                                           NaN
keywords                                                          NaN
meta_keywords       ['National Review', 'National Review Online', ...
meta_description                                                  NaN
tags                

In [19]:
# unique lable values
unique_values = fndf['type'].unique()
print(unique_values)

['political' 'fake' 'satire' 'reliable' 'conspiracy' 'unreliable' 'bias'
 'rumor' 'unknown' nan 'clickbait' 'hate' 'junksci'
 '2018-02-10 13:43:39.521661']


In [None]:
#hard to know how to classify nan and unknown, so removed for now
fndf = fndf.dropna(subset=['type'])
fndf = fndf.loc[fndf['type']!='unknown']

newunique_values = fndf['type'].unique()
print(newunique_values)

['political' 'fake' 'satire' 'reliable' 'conspiracy' 'unreliable' 'bias'
 'rumor' 'clickbait' 'hate' 'junksci' '2018-02-10 13:43:39.521661']


In [51]:
# groups (reliable) as truenews 1 and (all others) in fakenews 0 
    #note this is naive and should be reconsidered later
fndf['type'] = fndf['type'].replace(r'^reliable$', '1', regex=True)  # Only replaces exact 'reliable' with 1
fndf['type'] = fndf['type'].replace(r'^(?!1$).+', '0', regex=True)   # Replace everything except '1' with '0'
#fndf['type'] = fndf['type'].fillna('0')

newunique_values = fndf['type'].unique()
print(newunique_values)
fndf.shape[0]


['0' '1']


903680

## Task 4

In [61]:
# Splitting into test, train and validation
X_train, X_valtest, y_train, y_valtest = train_test_split(fndf['content'], fndf['type'], test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=42)

print("train size:", y_train.shape)
print("val size:", y_val.shape)
print("test size:", y_test.shape)


train size: (722944,)
val size: (90368,)
test size: (90368,)


# Part 2