# Fake news detection
Karolina Mączka, Tymoteusz Urban

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import sklearn

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tymek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tymek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Getting to know the data

In [4]:
# Load the preprocessed news dataset; the first CSV row holds the column names.
df = pd.read_csv(
    'PreProcessedData.csv',
    sep=',',
    header=0,
)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,Ground Label
0,0,Ann Coulter Make Believes She Has ‘Gay Friend...,"It s hard to believe, but Donald Trump does ha...",fake
1,1,Rating: Moody‘s verbessert Ausblick für Russla...,bankensektor Der russische Staat werde die Ban...,fake
2,2,CAN WE ADD DIRTY MONEY ‘LAUNDERING’ To The Oba...,A member of the House Intelligence Committee i...,fake
3,3,Republicans on Obamacare repeal: 'We're going ...,WASHINGTON (Reuters) - House of Representative...,true
4,4,"Trump, on possible DACA deal, says border wall...",WASHINGTON (Reuters) - U.S. President Donald T...,true


In [6]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69045 entries, 0 to 69044
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    69045 non-null  int64 
 1   title         68365 non-null  object
 2   text          68978 non-null  object
 3   Ground Label  69045 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.1+ MB


In [7]:
# Tidy up: keep only the columns used downstream and give the label column a short name.
# The rename is chained onto the column selection instead of being applied with
# `inplace=True` to the selected slice, which can trigger pandas' SettingWithCopyWarning.
df = df[['title', 'text', 'Ground Label']].rename(columns={'Ground Label': 'label'})
df.head()

Unnamed: 0,title,text,label
0,Ann Coulter Make Believes She Has ‘Gay Friend...,"It s hard to believe, but Donald Trump does ha...",fake
1,Rating: Moody‘s verbessert Ausblick für Russla...,bankensektor Der russische Staat werde die Ban...,fake
2,CAN WE ADD DIRTY MONEY ‘LAUNDERING’ To The Oba...,A member of the House Intelligence Committee i...,fake
3,Republicans on Obamacare repeal: 'We're going ...,WASHINGTON (Reuters) - House of Representative...,true
4,"Trump, on possible DACA deal, says border wall...",WASHINGTON (Reuters) - U.S. President Donald T...,true


In [8]:
# Count how many rows carry each label to check the class balance.
df['label'].value_counts()

fake    42159
true    26886
Name: label, dtype: int64

## Train test split

In [9]:
# Split off the test set: 30% of the rows are held out, the rest is kept for training/validation.
X_rest, X_test, y_rest, y_test = train_test_split(
    df[['title', 'text']],
    df['label'],
    test_size=0.30,
    random_state=42,
)

In [8]:
# Label counts in the part that was not held out for testing.
y_rest.value_counts()

fake    29459
true    18872
Name: label, dtype: int64

In [9]:
# Split the non-test rows again: 30% of them become the validation set.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_rest,
    y_rest,
    test_size=0.30,
    random_state=42,
)

In [10]:
# Print the label counts of the training split, then of the validation split.
for split_labels in (y_train, y_validate):
    print(split_labels.value_counts())

fake    20645
true    13186
Name: label, dtype: int64
fake    8814
true    5686
Name: label, dtype: int64


Stosunek liczby etykiet fake do true jest we wszystkich zbiorach niemal taki sam, więc możemy przyjąć, że jest to dobre rozdzielenie danych.

In [None]:
# Persist the held-out test set (features and label side by side).
testing_df = pd.concat([X_test, y_test], axis=1)
# `index` must be the boolean False: the string 'False' is truthy, so the row index
# was still being written to the file as an extra unnamed column.
testing_df.to_csv('testing_data.csv', index=False)

In [11]:
# Save the validation frame (features and label side by side).
validate_df = pd.concat([X_validate, y_validate], axis=1)
# `index` must be the boolean False: the string 'False' is truthy, so the row index
# was still being written to the file as an extra unnamed column.
validate_df.to_csv('fakenws_validation.csv', index=False)
validate_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14500 entries, 55024 to 57779
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   14368 non-null  object
 1   text    14486 non-null  object
 2   label   14500 non-null  object
dtypes: object(3)
memory usage: 453.1+ KB


In [30]:
# Re-attach the labels to the training features so rows can be dropped consistently.
df2 = pd.concat(
    [X_train, y_train],
    axis='columns',
)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33831 entries, 17623 to 8544
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   33497 non-null  object
 1   text    33800 non-null  object
 2   label   33831 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


## Data Preprocessing

### NaNs

In [31]:
# There are very few nulls, so the affected rows can simply be dropped.
df2 = df2.dropna()
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33466 entries, 17623 to 8544
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   33466 non-null  object
 1   text    33466 non-null  object
 2   label   33466 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [None]:
# transformer function
def handleNa(df):
    """Return a copy of ``df`` without the rows that contain missing values.

    The input frame is left untouched: dropping in place would silently shrink
    the caller's data every time the transformer is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows of ``df`` with no NaN in any column.
    """
    return df.dropna()

### Removing stopwords

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tymek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tymek\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Stopword removal and lemmatization
stop = stopwords.words('english')
# Set copy of the stopword list: membership is tested once per token, and a set
# lookup is constant-time whereas scanning the list is linear.
stop_set = frozenset(stop)
lemmatizer = WordNetLemmatizer()


def remove_stopwords(s):
    """Lower-case the tokens, drop English stopwords and lemmatize what remains.

    Parameters
    ----------
    s : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        Lemmatized lower-case tokens that are not in the stopword list,
        in their original order.
    """
    lowered = (word.lower() for word in s)
    return [lemmatizer.lemmatize(word) for word in lowered if word not in stop_set]

In [None]:
# Punctuation removal
def remove_punctuations(s):
    """Return ``s`` with every character that is neither a word character nor whitespace removed."""
    return re.sub(r'[^\w\s]', '', s)

In [23]:
# Strip punctuation from both text columns, then split each entry into word tokens.
for col in ['title', 'text']:
    df2[col] = df2[col].apply(remove_punctuations).apply(word_tokenize)

In [25]:
# Drop stopwords and lemmatize the token lists of both text columns.
for col in ('title', 'text'):
    df2[col] = df2[col].apply(remove_stopwords)

In [26]:
import textblob
from textblob import TextBlob
def subjectivity(text):
    """Return the TextBlob sentiment subjectivity value computed for ``text``."""
    return TextBlob(text).sentiment.subjectivity
def list2string(text):
    """Join the tokens in ``text`` into one string, separated by single spaces."""
    return " ".join(text)

In [27]:
# Continue on a copy so that df2 keeps the tokenized (list-valued) columns.
df3 = df2.copy()

In [29]:
# Turn the token lists back into space-separated strings.
for col in ("text", "title"):
    df3[col] = df3[col].apply(list2string)
df3.head()

Unnamed: 0,title,text,label
17623,western world respect law longer exists,western world respect law longer exists two im...,fake
61073,watch president obama unleashes trump say terr...,interview cbs face nation sunday morning presi...,fake
8102,guy punched moldylocks speaks violent antifa f...,nathan domingo founder identity europa guy see...,fake
67016,say amazon tell customer forced federal govern...,come limiting free speech nothing lawless gove...,fake
67779,antigun zealot katie couric hit 12 million def...,little miss sunshine fooled lot people year be...,fake


In [6]:
# transformer function
def removeStopwords(df):
    """Clean every column of ``df``: strip punctuation, tokenize, drop stopwords, re-join.

    Each cell goes through ``remove_punctuations`` -> ``word_tokenize`` ->
    ``remove_stopwords`` -> ``list2string``. The work is done on a copy so the
    caller's frame is not overwritten, and in a single pass per column instead
    of four.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold raw text.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, each holding the cleaned text.
    """
    def clean(value):
        # Same order of steps as the exploratory cells of the notebook.
        tokens = word_tokenize(remove_punctuations(value))
        return list2string(remove_stopwords(tokens))

    cleaned = df.copy()
    for col in cleaned:
        cleaned[col] = cleaned[col].apply(clean)
    return cleaned

In [None]:
### Extracting English

In [34]:
def deleteUrl(text):
    """Return ``text`` with every substring matched by the URL pattern removed."""
    # Case-insensitive pattern: a scheme ("http://", "https://"), a "www." prefix or a
    # "domain.tld/" prefix, followed by the rest of the address.
    url_pattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    stripped = re.sub(url_pattern, '', text)
    return stripped

In [35]:
# Remove links from the text and store the length of what is left
# (rows are not dropped here; this cell only computes the length).
text_without_links = df3['text'].apply(deleteUrl)
df3['text'] = text_without_links
df3['textlength'] = text_without_links.apply(len)

In [36]:
# Build a shorter version of the text to use for language detection
def shortenText(text, max_chars=60):
    """Return at most the first ``max_chars`` characters of ``text``.

    Parameters
    ----------
    text : str
        Text to truncate.
    max_chars : int, optional
        Maximum number of characters to keep. Defaults to 60, the value that
        was previously hard-coded, so existing single-argument calls behave
        exactly as before.

    Returns
    -------
    str
        ``text`` itself when it is not longer than ``max_chars``, otherwise its prefix.
    """
    return text[:max_chars]
# Keep the truncated text in its own column.
df3['shorttext'] = df3['text'].apply(shortenText)


In [None]:
# Show the rows whose shortened text has fewer than 10 characters.
df3[df3['shorttext'].apply(len) < 10]

In [None]:
# Drop texts of at most 10 characters (only lengths > 10 are kept); according to the
# authors' inspection these were all of the kind 'watch this (some url)'.
long_enough = df3['shorttext'].apply(len) > 10
df3 = df3[long_enough]

In [38]:
# Preview the frame with the added length and shortened-text columns.
df3.head()

Unnamed: 0,title,text,label,textlength,shorttext
17623,western world respect law longer exists,western world respect law longer exists two im...,fake,237,western world respect law longer exists two im...
61073,watch president obama unleashes trump say terr...,interview cbs face nation sunday morning presi...,fake,1025,interview cbs face nation sunday morning presi...
8102,guy punched moldylocks speaks violent antifa f...,nathan domingo founder identity europa guy see...,fake,1310,nathan domingo founder identity europa guy see...
67016,say amazon tell customer forced federal govern...,come limiting free speech nothing lawless gove...,fake,940,come limiting free speech nothing lawless gove...
67779,antigun zealot katie couric hit 12 million def...,little miss sunshine fooled lot people year be...,fake,3260,little miss sunshine fooled lot people year be...


In [39]:
# Continue on a copy so df3 is not changed by the language-detection steps.
df4 = df3.copy()

In [41]:
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes the detected languages (and therefore the rows kept later)
# reproducible from run to run.
DetectorFactory.seed = 0
df4['lang'] = df4['shorttext'].apply(detect)

In [42]:
# The model is meant to work on English text, so keep only rows detected as English.
is_english = df4['lang'] == 'en'
df4 = df4[is_english]

In [44]:
# Drop the helper columns and renumber the rows from 0.
model_columns = ['title', 'text', 'label']
df4 = df4[model_columns].reset_index(drop=True)
df4

Unnamed: 0,title,text,label
0,western world respect law longer exists,western world respect law longer exists two im...,fake
1,watch president obama unleashes trump say terr...,interview cbs face nation sunday morning presi...,fake
2,guy punched moldylocks speaks violent antifa f...,nathan domingo founder identity europa guy see...,fake
3,say amazon tell customer forced federal govern...,come limiting free speech nothing lawless gove...,fake
4,antigun zealot katie couric hit 12 million def...,little miss sunshine fooled lot people year be...,fake
...,...,...,...
29292,state attorney lied baltimore police probable ...,marilyn mosby held press conference today said...,fake
29293,russia denies plane killed civilian syria deir...,moscow reuters russia defence ministry denied ...,true
29294,hungary wall versus u wall,oped catherine j frompovich currently european...,fake
29295,pastor caught trump spiritual advisor stealing...,seems donald trump spiritual advisor exactly e...,fake


In [12]:
#transformer function
def dropLanguages(df):
    """Strip URLs from ``text`` and keep only the rows detected as English.

    Steps: remove URLs from the ``text`` column, shorten each text with
    ``shortenText``, drop rows whose shortened text has 10 or fewer characters,
    run ``detect`` on the shortened text and keep the rows labelled ``'en'``.

    The previous version had no ``return`` statement, so it always returned
    ``None``; it also wrote the cleaned text and helper columns onto the
    caller's frame and assigned into a filtered slice. This version works on a
    copy, keeps the helpers as standalone Series and returns the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``text``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``, URL-free ``text`` and only
        the rows detected as English.
    """
    result = df.copy()
    result['text'] = result['text'].apply(deleteUrl)

    # Helper values live in separate Series, so no temporary columns need dropping.
    short = result['text'].apply(shortenText)
    long_enough = short.apply(len) > 10
    result, short = result[long_enough], short[long_enough]

    is_english = short.apply(detect) == 'en'
    return result[is_english]

In [45]:
# Save the cleaned training data (without the row index).
df4.to_csv('train_data.csv', index=False)

In [46]:
# Feature columns of the cleaned training frame.
X_train2 = df4[['title', 'text']]

In [None]:
# TODO: Should title and text be merged? If so, before or after these transformers? Or maybe right at the very start?

In [50]:
# Bag of words: number of occurrences of each term in a given text.
# With ngram_range=(1,3) the terms are word 1-grams, 2-grams and 3-grams.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(ngram_range=(1,3))
X_train_counts = count_vect.fit_transform(X_train2['text'])

In [53]:
# Convert the raw term counts into TF-IDF weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [52]:
# Dimensions of the TF-IDF matrix: (number of documents, number of n-gram features).
X_train_tfidf.shape

(29297, 10089649)

## Pipeline

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline


def selectText(df):
    """Return the ``text`` column of ``df`` so the vectorizer gets one document per row."""
    return df['text']


# Wrap the plain cleaning functions so they can be used as pipeline steps.
na_transformer = FunctionTransformer(handleNa)
stopwords_transformer = FunctionTransformer(removeStopwords)
language_transformer = FunctionTransformer(dropLanguages)
text_selector = FunctionTransformer(selectText)

pipe = Pipeline([
    ("na", na_transformer),
    ("stopwords", stopwords_transformer),
    ("language", language_transformer),
    # CountVectorizer iterates over its input to obtain the documents; iterating
    # over a DataFrame yields its column labels, not its rows, so the frame must
    # be reduced to the text column first (the same column vectorized above).
    ("select_text", text_selector),
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])