In [119]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/ray/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ray/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ray/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Merging data 

In [120]:
fake_news = pd.read_csv("Fakenews.csv",  encoding='utf-8',   encoding_errors='replace')
real_news = pd.read_csv("south_african_news_articles.csv")

In [121]:
# dropping duplicates
fake_news.drop_duplicates()
real_news.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,title,content,source,label
0,0,JMPD officials suspended after death of EFF c...,An armoured security guards vehicle w...,Times,1
1,1,"Hawks seize rifles, pistols at 'firearms trai...",Hawks confiscated firearms and ammuni...,Times,1
2,2,Acsa chief information officer suspended over...,ACSA has placed their chief informati...,Times,1
3,3,ID refers Matshela Koko magistrate for invest...,Former Eskom boss Matshela Koko celeb...,Times,1
4,4,Ace Magashule's former PA back in SA after ex...,Ace Magashule's former personal assis...,Times,1
...,...,...,...,...,...
2900,2900,Broad political spectrum in unity government a...,Chief Justice Raymond Zondo on Wednesday swore...,Mail and Guardian,1
2901,2901,The growing popularity of cricket in South Africa,Cricket has always been a beloved sport in Sou...,Mail and Guardian,1
2902,2902,Not so sweet: Sugar industry stakeholders figh...,A disagreement has erupted in the sugar indust...,Mail and Guardian,1
2903,2903,Woman swallowed whole by python in Indonesia,A woman was found dead inside the belly of a s...,Mail and Guardian,1


In [122]:
# merging the two data frams
combined = pd.concat([fake_news, real_news], axis=0, ignore_index=True)

combined.head()

Unnamed: 0.1,title,content,source,label,Unnamed: 0
0,Pitso Burst Into Fight With His Player Agent O...,It?s not a secret that Sibusiso Vilakazi is c...,hinnews.com,0,
1,Pretoria police caught in human trafficking an...,"According to report, Some criminals may have ...",hinnews.com,0,
2,Kaizer Chiefs Players Reveals Why Chiefs Playe...,Ex-Kaizer Chiefs hardman Tinashe Nengomasha h...,hinnews.com,0,
3,Malema and Ndlozi to be prosecuted for assault...,Lobby group AfriForum announced in a statemen...,hinnews.com,0,
4,Woman survives after being shot 11 times by he...,A Detroit woman is lucky to be alive after sh...,hinnews.com,0,


In [123]:
combined.drop(["Unnamed: 0"], inplace=True, axis=1)

combined.head()

Unnamed: 0,title,content,source,label
0,Pitso Burst Into Fight With His Player Agent O...,It?s not a secret that Sibusiso Vilakazi is c...,hinnews.com,0
1,Pretoria police caught in human trafficking an...,"According to report, Some criminals may have ...",hinnews.com,0
2,Kaizer Chiefs Players Reveals Why Chiefs Playe...,Ex-Kaizer Chiefs hardman Tinashe Nengomasha h...,hinnews.com,0
3,Malema and Ndlozi to be prosecuted for assault...,Lobby group AfriForum announced in a statemen...,hinnews.com,0
4,Woman survives after being shot 11 times by he...,A Detroit woman is lucky to be alive after sh...,hinnews.com,0


In [135]:
# checking for nan values
combined.isna().sum()


title        0
content      0
source       0
label        0
full_text    0
dtype: int64

since there is limited data in the dataset, i am gonna make a new column called full_text which will be the title and content combined. I will then clean this field 

#### Making Full_text column 

In [134]:
combined["full_text"] = combined["title"] + "." + combined["content"]
combined["full_text"]

0       Pitso Burst Into Fight With His Player Agent O...
1       Pretoria police caught in human trafficking an...
2       Kaizer Chiefs Players Reveals Why Chiefs Playe...
3       Malema and Ndlozi to be prosecuted for assault...
4       Woman survives after being shot 11 times by he...
                              ...                        
3715    Broad political spectrum in unity government a...
3716    The growing popularity of cricket in South Afr...
3717    Not so sweet: Sugar industry stakeholders figh...
3718    Woman swallowed whole by python in Indonesia.A...
3719    From water to police: Senzo Mchunu’s legacy an...
Name: full_text, Length: 3710, dtype: object

In [136]:
# double checking nan values 
combined.isna().sum()

title        0
content      0
source       0
label        0
full_text    0
dtype: int64

## Cleaning the DataFrame

### Text Normalizing 

In [126]:
normalize = lambda document: document.lower()

### Removing unwanted characters 

In [127]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def remove_unwanted(document):

    # remove user mentions
    document = re.sub("@[A-Za-z0-9_]+"," ", document)
    # remove URLS
    document = re.sub(r'http\S+', ' ', document)
    # remove hashtags
    document = re.sub("#[A-Za-z0-9_]+","", document)
    # remove emoji's
    document = remove_emoji(document)
    # remove punctuation
    document = re.sub("[^0-9A-Za-z ]", "" , document)
    # remove double spaces
    document = document.replace('  ',"")
    
    return document.strip()

### Removing stop words

In [128]:
def remove_words(tokens):
    stopwords = nltk.corpus.stopwords.words('english') # also supports german, spanish, portuguese, and others!
    stopwords = [remove_unwanted(word) for word in stopwords] # remove puntcuation from stopwords
    cleaned_tokens = [token for token in tokens if token not in stopwords]
    return cleaned_tokens

### Lemmatization

In [129]:
lemma = WordNetLemmatizer()

def lemmatize(tokens):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

### Stemming

In [130]:
stem = PorterStemmer()

def stemmer(tokens):
    stemmed_tokens = [stem.stem(token) for token in tokens]
    return stemmed_tokens

### Pipeline 

In [131]:
def pipeline(document, rule = 'lemmatize'):
    # first lets normalize the document
    document = normalize(document)
    # now lets remove unwanted characters
    document = remove_unwanted(document)
    # create tokens
    tokens = document.split()
    # remove unwanted words
    tokens = remove_words(tokens)
    # lemmatize or stem or
    if rule == 'lemmatize':
        tokens = lemmatize(tokens)
    elif rule == 'stem':
        tokens = stemmer(tokens)
    else:
        print(f"{rule} Is an invalid rule. Choices are 'lemmatize' and 'stem'")
    
    return " ".join(tokens)

In [137]:
df_clean = combined.copy()

df_clean["full_text"] = df_clean["full_text"] .apply(lambda doc: pipeline(doc))
df_clean.head()

Unnamed: 0,title,content,source,label,full_text
0,Pitso Burst Into Fight With His Player Agent O...,It?s not a secret that Sibusiso Vilakazi is c...,hinnews.com,0,pitso burst fight player agent players decisio...
1,Pretoria police caught in human trafficking an...,"According to report, Some criminals may have ...",hinnews.com,0,pretoria police catch human traffic kidnap acc...
2,Kaizer Chiefs Players Reveals Why Chiefs Playe...,Ex-Kaizer Chiefs hardman Tinashe Nengomasha h...,hinnews.com,0,kaizer chiefs players reveal chiefs players su...
3,Malema and Ndlozi to be prosecuted for assault...,Lobby group AfriForum announced in a statemen...,hinnews.com,0,malema ndlozi prosecute assault police officer...
4,Woman survives after being shot 11 times by he...,A Detroit woman is lucky to be alive after sh...,hinnews.com,0,woman survive shoot 11 time wife detroit woman...


In [139]:
df_clean.to_csv("Clean_News.csv", index=False)