In [None]:
# prompt: please install all the below using pip

#!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras clean_text pandarallel




In [61]:
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.lm import Vocabulary
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from cleantext import clean
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from pandarallel import pandarallel
import ast
# Configure pandarallel (with progress bars); this must run before any
# `.parallel_apply` call further down in the notebook.
pandarallel.initialize(progress_bar=True)

# Download the NLTK resources used below: the tokenizer models needed by
# word_tokenize and the stopword lists read via nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

import os
# True when the COLAB_RELEASE_TAG environment variable is set to a non-empty
# value, which is used here as the marker for running on Google Colab.
IN_COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))


INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to /home/kipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kipp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/kipp/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Only when running on Colab: mount Google Drive under /content/drive.
# The import lives inside the branch because google.colab is only needed on Colab.
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

# Part 1

### Task 1

In [5]:
# Load the news sample. The data directory depends on where the notebook runs.
dataPath = "../data/"
if IN_COLAB:
  dataPath = "/content/drive/MyDrive/"
# read_csv already produces a fresh 0..N-1 RangeIndex, so no reset_index is needed.
nsdf = pd.read_csv(dataPath + "news_sample.csv")
# Keep an independent snapshot of the untouched data. A plain `nsdf_raw = nsdf`
# would only be a second name for the same frame and would change together
# with it whenever a later step assigns into the frame in place.
nsdf_raw = nsdf.copy()
nsdf.info()   # Column types and missing values (info() prints by itself and returns None)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        250 non-null    int64  
 1   id                250 non-null    int64  
 2   domain            250 non-null    object 
 3   type              238 non-null    object 
 4   url               250 non-null    object 
 5   content           250 non-null    object 
 6   scraped_at        250 non-null    object 
 7   inserted_at       250 non-null    object 
 8   updated_at        250 non-null    object 
 9   title             250 non-null    object 
 10  authors           170 non-null    object 
 11  keywords          0 non-null      float64
 12  meta_keywords     250 non-null    object 
 13  meta_description  54 non-null     object 
 14  tags              27 non-null     object 
 15  summary           0 non-null      float64
dtypes: float64(2), int64(2), object(12)
memory u

In [6]:
# Unique label values in the `type` column (missing values show up as nan)
unique_values = nsdf['type'].unique()
print(unique_values)

['unreliable' 'fake' 'clickbait' 'conspiracy' 'reliable' 'bias' 'hate'
 'junksci' 'political' nan 'unknown']


In [7]:
# Rows whose label is missing or 'unknown' are removed: they give a classifier
# no usable target.
nsdf = nsdf.dropna(subset=['type'])
# .copy() turns the filtered slice into an independent frame, so that the
# in-place column assignment done later by the preprocessing functions does
# not raise pandas' SettingWithCopyWarning.
nsdf = nsdf.loc[nsdf['type']!='unknown'].copy()
newunique_values = nsdf['type'].unique()
print(newunique_values)


['unreliable' 'fake' 'clickbait' 'conspiracy' 'reliable' 'bias' 'hate'
 'junksci' 'political']


Cleaning and Preprocessing

In [8]:
def cleanText(data, column):
    """Normalise the text in ``data[column]`` in place and return ``data``.

    For every string value: runs of whitespace are collapsed, dates written as
    ``DD[-/.]MM[-/.]YY(YY)`` or ``DD Mon YY(YY)`` are replaced by ``<DATE>``,
    and the result is passed to ``cleantext.clean`` with lower-casing,
    line-break and punctuation removal, and URL / e-mail / number replacement
    switched on. Non-string values (e.g. NaN) are left unchanged.

    Args:
        data: pandas DataFrame; modified in place.
        column: name of the text column to clean.

    Returns:
        The same ``data`` object.
    """
    # The 4-digit year alternative comes first: with the 2-digit one first,
    # "01/02/2020" matched only up to "20" and became "<DATE>20". The digit
    # look-arounds keep the pattern from firing inside a longer number.
    numeric_date = r"(?<!\d)(0[1-9]|[1-2][0-9]|3[0-1])[-/.]?(0[1-9]|1[0-2])[-/.]?([0-9]{4}|[0-9]{2})(?!\d)"
    worded_date = r"(?<!\d)(0[1-9]|[1-2][0-9]|3[0-1])\s([A-Za-z]{3})\s([0-9]{4}|[0-9]{2})(?!\d)"

    def clean_text_help(text):
        if not isinstance(text, str):
            return text  # Return unchanged if not a string
        # Remove excess whitespace
        text = re.sub(r"\s+", " ", text).strip()
        # Replace dates
        text = re.sub(numeric_date, "<DATE>", text)  # Replace date type 1
        text = re.sub(worded_date, "<DATE>", text)  # Replace date type 2
        # no_urls / no_emails / no_numbers must be enabled explicitly; without
        # them the replace_with_* placeholders were passed but never applied.
        # NOTE(review): no_punct=True presumably also strips the angle brackets
        # of the placeholders (leaving e.g. "url", "date") — confirm.
        return clean(text, lower=True, no_line_breaks=True, no_urls=True, no_emails=True, no_numbers=True, no_punct=True, replace_with_url="<URL>", replace_with_email="<EMAIL>", replace_with_number="<NUM>", replace_with_digit="<NUM>")

    data[column] = data[column].parallel_apply(clean_text_help)  # Apply function
    return data

In [9]:
# Tokenize the text in a column with NLTK's word_tokenize
def tokenizeText(data, column):
    """Replace every string in ``data[column]`` by its list of tokens.

    The column is overwritten in place; values that are not strings are kept
    exactly as they are. Returns the same ``data`` object.
    """
    def tokenize_text_help(text):
        # Only strings are tokenized; anything else passes straight through.
        return word_tokenize(text) if isinstance(text, str) else text

    tokenized_column = data[column].parallel_apply(tokenize_text_help)
    data[column] = tokenized_column
    return data

In [11]:
# Function for removing stopwords
def remove_stopwords(data, column):
    """Drop English stopwords from the tokens in ``data[column]`` (in place).

    Each value is expected to be a list of tokens, as produced by
    ``tokenizeText``. A plain string is split on whitespace first. Any other
    value (e.g. NaN) is left unchanged. The comparison is case-insensitive.

    Args:
        data: pandas DataFrame; modified in place.
        column: name of the column holding the tokens.

    Returns:
        The same ``data`` object, with stopword-free token lists in the column.
    """
    # Built once here instead of once per row.
    stop_words = frozenset(stopwords.words('english'))

    def remove_stopwords_help(text):
        if isinstance(text, str):
            # Previously this branch indexed the string with `.at`, which
            # raised AttributeError; fall back to whitespace tokens instead.
            text = text.split()
        if isinstance(text, (list, tuple)):
            return [word for word in text if word.lower() not in stop_words]
        return text  # Return unchanged if neither tokens nor a string

    data[column] = data[column].parallel_apply(remove_stopwords_help)  # Apply function
    return data

In [13]:
# Function for populating the vocabulary
def populate_vocabulary(data):
    """Build an ``nltk.lm.Vocabulary`` from the words in ``data['content']``.

    Token lists are used as they are; plain strings are tokenized with
    ``word_tokenize``. Other values (e.g. NaN) are skipped. Words seen fewer
    than ``unk_cutoff=2`` times are treated as unknown by the vocabulary.

    Args:
        data: pandas DataFrame with a 'content' column.

    Returns:
        nltk.lm.Vocabulary counting individual words.
    """
    allWords = []
    # Iterate over the values directly: looking rows up with .at[i, ...] for
    # i in range(N) fails as soon as the index is not exactly 0..N-1.
    for content in data['content']:
        if isinstance(content, str):
            # Add the words, not the whole document as one single "word".
            allWords.extend(word_tokenize(content))
        elif isinstance(content, (list, tuple)):
            allWords.extend(content)
    return Vocabulary(allWords, unk_cutoff=2)

# Slow runtime, but we could not work out how to do this with apply. Does anyone have a good idea?
# We could simply use value_counts() instead.


In [14]:
# Build a frequency distribution over all tokens of a column
def getFrequency(data, column, preTokenized):
    """Return an ``nltk.FreqDist`` of the tokens found in ``data[column]``.

    Args:
        data: mapping/DataFrame such that ``data[column]`` is iterable.
        column: name of the column to count.
        preTokenized: 1 if the column already holds token lists (strings are
            then split on whitespace as a fallback), 0 if it holds raw strings
            that still have to go through ``word_tokenize``.

    Returns:
        nltk.FreqDist of token counts. For any other ``preTokenized`` value
        the string "Wrong preTokenize input" is returned instead (kept as is
        for backward compatibility with existing callers).
    """
    if preTokenized not in (0, 1):
        return "Wrong preTokenize input"

    allWords = []
    for text in data[column]:
        if isinstance(text, str):
            # Raw strings: cheap whitespace split when the column is supposed
            # to be tokenized already, full tokenization otherwise.
            allWords.extend(text.split() if preTokenized else word_tokenize(text))
        elif preTokenized and isinstance(text, list):
            allWords.extend(text)

    return FreqDist(allWords)

# Fast alternative for untokenized text: returns a pandas Series mapping each
# whitespace-separated word to its number of occurrences.
def getFreq(data, column):
    """Count whitespace-separated words over all strings in ``data[column]``."""
    words_per_row = data[column].str.split()
    all_words = words_per_row.explode()
    return all_words.value_counts()

In [15]:
# Function for stemming tokens
def dataStemming(data, column):
    """Porter-stem the tokens in ``data[column]`` (in place).

    A list of tokens is stemmed token by token. A plain string is stemmed as
    one single word (unchanged from before). Any other value is returned
    unchanged.

    Args:
        data: pandas DataFrame; modified in place.
        column: name of the column holding the tokens.

    Returns:
        The same ``data`` object.
    """
    ps = PorterStemmer()

    def dataStemming_help(text):
        if isinstance(text, str):
            return ps.stem(text)
        if isinstance(text, (list, tuple)):
            # After tokenization each cell is a list; previously such cells
            # fell through and were never stemmed.
            return [ps.stem(word) for word in text]
        return text  # Return unchanged if neither tokens nor a string

    data[column] = data[column].parallel_apply(dataStemming_help)  # Apply function
    return data

In [16]:
# One big function to process data:
def processData(data, column):
    """Clean, tokenize, stopword-filter and stem ``data[column]`` in place.

    Every value of the column must be a string. It is replaced by a list of
    stemmed tokens produced by these stages, applied in order:

    1. ``clean_text_help``: collapse whitespace, replace dates by ``<DATE>``,
       then run ``cleantext.clean`` (lower-casing, punctuation removal and
       URL / e-mail / number replacement).
    2. ``tokenize_text_help``: NLTK ``word_tokenize``.
    3. ``remove_stopwords_help``: drop NLTK's English stopwords.
    4. ``dataStemming_help``: Porter-stem every remaining token.

    Args:
        data: pandas DataFrame; modified in place.
        column: name of the text column to process.

    Returns:
        The same ``data`` object, with ``data[column]`` holding ``list[str]``.

    Raises:
        TypeError: if a value in the column is not a string (e.g. NaN, or a
            row that has already been processed into a token list).
    """
    # The 4-digit year alternative comes first: with the 2-digit one first,
    # "01/02/2020" matched only up to "20" and became "<DATE>20". The digit
    # look-arounds keep the pattern from firing inside a longer number.
    numeric_date = r"(?<!\d)(0[1-9]|[1-2][0-9]|3[0-1])[-/.]?(0[1-9]|1[0-2])[-/.]?([0-9]{4}|[0-9]{2})(?!\d)"
    worded_date = r"(?<!\d)(0[1-9]|[1-2][0-9]|3[0-1])\s([A-Za-z]{3})\s([0-9]{4}|[0-9]{2})(?!\d)"

    # Loaded once per call instead of once per article.
    stop_words = frozenset(stopwords.words('english'))
    ps = PorterStemmer()

    def clean_text_help(text):
        # str -> str
        if not isinstance(text, str):
            raise TypeError("Clean_text passed non-string")
        # Remove excess whitespace
        text = re.sub(r"\s+", " ", text).strip()
        # Replace dates
        # NOTE(review): clean() runs with no_punct=True afterwards, so the
        # angle brackets are presumably stripped (token "date") — confirm.
        text = re.sub(numeric_date, "<DATE>", text)  # Replace date type 1
        text = re.sub(worded_date, "<DATE>", text)  # Replace date type 2
        return clean(text, lower=True, no_line_breaks=True, no_numbers=True, no_emails=True, no_urls=True, no_punct=True, replace_with_url="__URL__", replace_with_email="__EMAIL__", replace_with_number="__NUM__", replace_with_digit="__NUM__")

    def tokenize_text_help(text):
        # str -> list[str]
        return word_tokenize(text)

    def remove_stopwords_help(tokens):
        # list[str] -> list[str]; the text is already lower-cased by clean()
        return [token for token in tokens if token not in stop_words]

    def dataStemming_help(tokens):
        # list[str] -> list[str]
        return [ps.stem(token) for token in tokens]

    # Plain lists are passed between the stages; building a pandas Series for
    # every single article added overhead without changing the result.
    stages = (
        clean_text_help,
        tokenize_text_help,
        remove_stopwords_help,
        dataStemming_help,
    )

    def run_stages(text):
        for stage in stages:
            text = stage(text)
        return text

    data[column] = data[column].parallel_apply(run_stages)
    return data

In [17]:
"""
nsdf_cleaned = cleanText(nsdf, 'content')
nsdf_tokenized = tokenizeText(nsdf_cleaned, 'content')                  #tokenizing
nsdf_cleaned_tokenized_nostopwords = remove_stopwords(nsdf_tokenized, 'content')           #removing stopwords
nsdf_preprocessed = dataStemming(nsdf_cleaned_tokenized_nostopwords, 'content')
"""
# processData assigns into `nsdf` and returns that same object, so
# `nsdf_processed` and `nsdf` are two names for one frame. The in-place
# dropna / reset_index below therefore also change `nsdf`.
# NOTE(review): re-running this cell passes token lists back into processData,
# which raises TypeError("Clean_text passed non-string"); re-run the loading
# and filtering cells first.
nsdf_processed = processData(nsdf, 'content')
nsdf_processed.dropna(subset=['content'], inplace=True)  # Drop rows with no content
nsdf_processed.reset_index(drop=True, inplace=True)  # Reset index
print(nsdf_processed.at[0, 'content'])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=58), Label(value='0 / 58'))), HBox…

['sometim', 'power', 'christma', 'make', 'wild', 'wonder', 'thing', 'need', 'believ', 'holi', 'triniti', 'believ', 'posit', 'power', 'good', 'other', 'simpl', 'act', 'give', 'without', 'receiv', 'lost', 'mani', 'us', 'day', 'worri', 'money', 'success', 'hold', 'us', 'back', 'give', 'other', 'need', 'one', 'congreg', 'ohio', 'move', 'action', 'power', 'sermon', 'given', 'church', 'christma', 'eve', 'pastor', 'grand', 'lake', 'unit', 'methodist', 'church', 'celina', 'ohio', 'gave', 'emot', 'sermon', 'import', 'understand', 'messag', 'jesu', 'mani', 'religi', 'peopl', 'messag', 'jesu', 'help', 'other', 'make', 'sure', 'peopl', 'suffer', 'get', 'help', 'need', 'enjoy', 'life', 'littl', 'bit', 'sermon', 'realli', 'generos', 'look', 'like', 'live', 'jesu', 'live', 'long', 'time', 'ago', 'act', 'gener', 'fashion', 'time', 'would', 'gener', 'act', 'look', 'like', 'time', 'focu', 'sermon', 'potenc', 'sermon', 'lost', 'congreg', 'move', 'take', 'action', 'sermon', 'end', 'congreg', 'decid', 'tak

In [18]:
# Token total before preprocessing.
# getFreq gives one count per distinct whitespace-separated word; summing the
# counts yields the total number of words in the raw text (not the number of
# distinct words).
print("word frequency pre preprocessing")
word_frequency_pre = getFreq(nsdf_raw, 'content').sum()
print(word_frequency_pre)

word frequency pre preprocessing
170462


In [19]:
# Token total after preprocessing (cleaning, stopword removal, stemming).
print("word frequency post preprocessing")
# getFreq expects strings to split, but the processed column holds token
# lists, so the lists are exploded and counted directly here.
word_frequency_post = nsdf_processed["content"].explode().value_counts().sum()
print(word_frequency_post)

word frequency post preprocessing
86069


In [19]:
#word frequency post stemming
#print("word frequency post stemming")
#word_frequency_postStem = getFrequency(nsdf_.processed, 'content', 1)
#print(sum(word_frequency_postStem.values()))

In [20]:
#pre = word_frequency_pre
#post = word_frequency_post
#print("Reduction rate of the vocabulary size after removing stopwords:")
#print(abs(pre -post))
#print("Further  reduction rate of the vocabulary size after stemming")
#postStem = sum(word_frequency_post.values())
#print(abs(post -postStem))

### Task 2

In [None]:
# Load the large corpus file
fakeNewsCorpus = pd.read_csv(dataPath + "995,000_rows.csv")
# What is the "Unnamed: 0" column? NOTE(review): presumably a row index that
# was written into the CSV when it was exported — confirm.
print(fakeNewsCorpus.head())
#fakeNewsCorpus['content'].duplicated()
# Keep one row per distinct article text, drop rows without content, and
# renumber the remaining rows from 0.
news_noDup = fakeNewsCorpus.drop_duplicates(subset=['content']).dropna(subset=['content']).reset_index(drop=True)

  Unnamed: 0         id               domain        type  \
0        732  7444726.0   nationalreview.com   political   
1       1348  6213642.0    beforeitsnews.com        fake   
2       7119  3867639.0     dailycurrant.com      satire   
3       1518  9560791.0          nytimes.com    reliable   
4       9345  2059625.0  infiniteunknown.net  conspiracy   

                                                 url  \
0  http://www.nationalreview.com/node/152734/%E2%...   
1  http://beforeitsnews.com/economy/2012/06/the-c...   
2  http://dailycurrant.com/2016/01/18/man-awoken-...   
3  https://query.nytimes.com/gst/fullpage.html?re...   
4  http://www.infiniteunknown.net/2011/09/14/100-...   

                                             content  \
0  Plus one article on Google Plus\n\n(Thanks to ...   
1  The Cost Of The Best Senate Banking Committee ...   
2  Man Awoken From 27-Year Coma Commits Suicide A...   
3  WHEN Julia Geist was asked to draw a picture o...   
4  – 100 Compiled Stud

In [None]:
# Cleaning: run the whole preprocessing pipeline on the de-duplicated corpus.
# processData assigns into news_noDup and returns it, so news_processed is the
# same object as news_noDup.
news_processed = processData(news_noDup, 'content')
# Cache the result as line-delimited JSON (one record per line) in the data directory.
news_processed.to_json(dataPath + "news_processed.json", orient='records', lines=True)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=101614), Label(value='0 / 101614')…

### Task 3

In [None]:
# Use the processed corpus from Task 2 when it is still in memory; otherwise
# load the cached copy that the Task 2 cell wrote to news_processed.json.
# (The previous check looked at `fakeNewsCorpus`, which holds the raw,
# unprocessed CSV whenever Task 2 has run in this kernel.)
try:
    news_processed
except NameError:
    news_processed = pd.read_json(dataPath + "news_processed.json", orient='records', lines=True)

# No bare `except` / `finally` here: if the cache cannot be read, the original
# error is raised directly instead of being masked by a follow-up NameError.
fndf = news_processed.reset_index(drop=True)  # Reset index
fndf.info()   # Column types and missing values (info() prints by itself and returns None)

In [18]:
# Sanity check: list the rows whose label is the empty string.
print(fndf.loc[fndf['type']==''])

Empty DataFrame
Columns: [Unnamed: 0, id, domain, type, url, content, scraped_at, inserted_at, updated_at, title, authors, keywords, meta_keywords, meta_description, tags, summary, source]
Index: []


### Cleaning and Preprocessing

In [18]:
# Show all columns of the first row to inspect one record by eye.
print("Pandas DataFrame:")
display(fndf.iloc[0])



Pandas DataFrame:


Unnamed: 0                                                      732.0
id                                                          7444726.0
domain                                             nationalreview.com
type                                                        political
url                 http://www.nationalreview.com/node/152734/%E2%...
content             ['plu', 'one', 'articl', 'googl', 'plu', 'than...
scraped_at                                 2017-11-27T01:14:42.983556
inserted_at                                2018-02-08 19:18:34.468038
updated_at                                 2018-02-08 19:18:34.468066
title                                              Iran News Round Up
authors                                                           NaN
keywords                                                          NaN
meta_keywords       ['National Review', 'National Review Online', ...
meta_description                                                  NaN
tags                

## Observations about dataset

In [None]:
# Split the corpus by binary label and print the token distribution per class.
# NOTE(review): the comparisons with '1' / '0' only match once `type` has been
# recoded to those strings. In this notebook that recoding appears in a later
# cell (Part 2, Task 0), so on a top-to-bottom run both subsets are presumably
# empty here — confirm the intended cell order.
relib_news = fndf.loc[fndf['type'] == '1']
fake_news = fndf.loc[fndf['type'] == '0']

# Number of rows per class: reliable : fake
print(relib_news.shape[0], " : ", fake_news.shape[0])

# explode() puts one token per row (assuming `content` holds token lists);
# value_counts() then gives the frequency of every token.
print("distribution for real news")
print(relib_news["content"].explode().value_counts())
print("distribution for fake news")
print(fake_news["content"].explode().value_counts())

print("distribution for all news")
print(fndf["content"].explode().value_counts())

0    ['plu', 'one', 'articl', 'googl', 'plu', 'than...
dtype: object

## Task 4

In [63]:
# Splitting into train (80%), validation (10%) and test (10%)
# First hold out 20% of the rows; X_* are the article contents, y_* the labels.
X_train, X_valtest, y_train, y_valtest = train_test_split(fndf['content'], fndf['type'], test_size=0.2, random_state=42)
# Then halve the held-out part: one half becomes the test set, the other the validation set.
X_test, X_val, y_test, y_val = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=42)
# Report the size of each split
print("train size:", y_train.shape)
print("val size:", y_val.shape)
print("test size:", y_test.shape)


NameError: name 'fndf' is not defined

# Part 2

## Task 0, splitting labels into reliable and unreliable.

In [None]:
# Unique label values before filtering
unique_values = fndf['type'].unique()
print(unique_values)
# It is hard to know how to classify rows with a missing or 'unknown' label,
# so they are removed for now.
fndf = fndf.dropna(subset=['type'])
# reset_index gives the filtered rows a clean 0..N-1 index and returns an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
fndf = fndf.loc[fndf['type']!='unknown'].reset_index(drop=True)

newunique_values = fndf['type'].unique()
print(newunique_values)


# Binary target: '1' = real news, '0' = fake news.
# reliable, clickbait and political are all, by their contents, factually
# correct (albeit possibly politically motivated), so they are deemed "real";
# every other label is deemed "fake".
# Note: this grouping is naive and should be reconsidered later.
fndf['type'] = fndf['type'].replace(r'^(reliable|clickbait|political)$', '1', regex=True)
# Everything that is not exactly '1' becomes '0' (an existing '0' stays '0',
# so re-running the cell does not change the result).
fndf['type'] = fndf['type'].replace(r'^(?!1$).+', '0', regex=True)

newunique_values = fndf['type'].unique()
print(newunique_values)
fndf.shape[0]

## Task 1 - Simple logistic regression model.

Get the top 10000 words, and how often they occur in each article

In [66]:
# TODO: standardise on fndf? This cell still works on the small sample frame nsdf.

# Frequency of every token over all articles; value_counts() sorts by count,
# highest first, so the slice keeps the (up to) 10000 most frequent tokens.
counts = nsdf['content'].explode().value_counts()
top = counts[:10000]

#print(pd.Series(nsdf["content"][0]).value_counts())
# Sanity check: how often the overall most frequent token occurs in the first article.
print(pd.Series(nsdf["content"][0]).value_counts()[top.keys()[0]])

def CountFreq(words, vocab):
    """Count how often each vocabulary word occurs in one article.

    Args:
        words: the tokens of a single article (list or Series of str).
        vocab: the words to count, as a pandas Series, Index or plain list.

    Returns:
        pandas Series with one integer per vocabulary word, in vocabulary
        order; words that do not occur in the article get 0.
    """
    # Accept lists as well: a Series passed in keeps its index unchanged.
    vocab = pd.Series(vocab)
    words_in_article = pd.Series(words).value_counts()
    # .get(word, 0) supplies 0 for vocabulary words missing from this article.
    return vocab.apply(lambda word: words_in_article.get(word, 0))

print(nsdf["content"])
# Build the vocabulary Series once, instead of rebuilding it inside the lambda
# for every single article.
top_words = pd.Series(top.keys())
# One row per article, one column per vocabulary word: occurrences of that word in the article.
rowsFreq = nsdf["content"].parallel_apply(lambda row: CountFreq(row, top_words))
print(rowsFreq)
# Column sums: total occurrences of each vocabulary word over all articles.
print(rowsFreq.sum())


1
0      [sometim, power, christma, make, wild, wonder,...
1      [awaken, num, strand, dna, reconnect, movi, re...
2      [never, hike, alon, friday, 13th, fan, film, u...
3      [rare, shark, caught, scientist, left, blunder...
4      [donald, trump, unnerv, abil, abil, creat, rea...
                             ...                        
227    [disturb, sugar, daddysugar, babi, relationshi...
228    [press, news, ©️, peopl, power, news, ®️, wel,...
229    [senat, vote, late, thursday, start, work, leg...
230    [prison, rahm, god, work, mani, other, headlin...
231    [num, use, item, tini, home, headlin, bitcoin,...
Name: content, Length: 232, dtype: object


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=58), Label(value='0 / 58'))), HBox…

     0     1     2     3     4     5     6     7     8     9     ...  9990  \
0       1     1     2     3     4     3     0     0     0     1  ...     0   
1       9     1     0     1     0     1     0     1     1     0  ...     0   
2       7     0     0     1     0     1     0     0     0     0  ...     0   
3       4     2     0     2     0     2     0     0     0     2  ...     0   
4       9     0     0     1     2     1     0     5     0     0  ...     0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
227     1     0     0     0     0     0     0     1     0     0  ...     0   
228     4     0     1     0     1     1     1     0     2     0  ...     0   
229     1     0     0     0     0     0     0     0     0     2  ...     0   
230    18     5     0     3     7     1     2     1     4     8  ...     0   
231     1     2     0     0     0     1     0     1     6     0  ...     0   

     9991  9992  9993  9994  9995  9996  9997  9998  9999  
0  

Creating the logistic regression model

In [62]:
def _as_tokens(doc):
    """Return the tokens of one article: token lists as is, strings split on whitespace."""
    return doc if isinstance(doc, list) else str(doc).split()

# LogisticRegression needs a numeric feature matrix, not a Series of token
# lists. Encode every article as a binary bag-of-words vector; the vocabulary
# (capped at 10000 tokens) is learned from the training split only, so nothing
# from the test split leaks into the features.
bow_vectorizer = CountVectorizer(analyzer=_as_tokens, binary=True, max_features=10000)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

linReg = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
linReg.fit(X_train_bow, y_train)

y_pred = linReg.predict(X_test_bow)
# The labels are the strings '1' (real) and '0' (fake), so the positive class
# has to be named explicitly; the default pos_label=1 (an int) is not among them.
# This assumes the label recoding from Part 2, Task 0 ran before the split.
f1 = f1_score(y_test, y_pred, pos_label='1')

# Print results
print(f"F1 Score: {f1:.4f}")
print(f"Hyperparameters: max_iter=1000, solver='liblinear', binary bag-of-words")


NameError: name 'X_train' is not defined