#### 1. Importing libraries and datasets

In [287]:
# Import libraries

# Basic libraries
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline


# NLTK libraries and the corpora they need at runtime
import re
import string
import nltk
nltk.download("stopwords")
# WordNetLemmatizer (section 2e) reads the WordNet corpus; without this download
# a fresh environment raises LookupError on the first lemmatize() call.
# NOTE(review): some NLTK releases also ask for "omw-1.4" - confirm for the pinned version.
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer


#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rimay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [288]:
# Load the two scraped subreddit dumps from the notebook's working directory

ONION_CSV = './theonion.csv'          # r/TheOnion posts, treated as fake news
NOT_ONION_CSV = './nottheonion.csv'   # r/nottheonion posts, treated as true news

df_onion = pd.read_csv(ONION_CSV)
df_not_onion = pd.read_csv(NOT_ONION_CSV)

In [289]:
# Print the shape and the column summary of the fake-news (r/TheOnion) dataset

print("The shape of the  data is (row, column):" + str(df_onion.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
df_onion.info()

The shape of the  data is (row, column):(1343, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1343 entries, 0 to 1342
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subreddit     1343 non-null   object
 1   author        1343 non-null   object
 2   domain        1343 non-null   object
 3   title         1343 non-null   object
 4   num_comments  1343 non-null   int64 
 5   score         1343 non-null   int64 
 6   timestamp     1343 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 73.6+ KB
None


In [290]:
# Print the shape and the column summary of the true-news (r/nottheonion) dataset

print("The shape of the  data is (row, column):" + str(df_not_onion.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
df_not_onion.info()

The shape of the  data is (row, column):(9997, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9997 entries, 0 to 9996
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subreddit     9997 non-null   object
 1   author        9997 non-null   object
 2   domain        9997 non-null   object
 3   title         9997 non-null   object
 4   num_comments  9997 non-null   int64 
 5   score         9997 non-null   int64 
 6   timestamp     9997 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 546.8+ KB
None


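From the information above, the r/TheOnion dataset contains 1,343 fake news headlines, while the r/nottheonion dataset consists of 9,997 true news headlines. Both datasets end on 31 December 2021; note that the date-range check in section 2a shows r/TheOnion starting on 1 January 2021 but r/nottheonion starting only on 29 October 2021.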

Description of columns:

* subreddit - the forum the post belongs to, either `r/theonion` or `r/nottheonion`
* author - the person who made the post
* domain - the domain of the site hosting the news content
* title - the news headline
* num_comments - number of comments made by readers
* score - number of upvotes minus the number of downvotes
* timestamp - the time the news was posted (a Unix timestamp in the raw data)

#### 2. Text Preprocessing

We need to perform the following text cleaning:

##### 2a. Checking the datasets

In [291]:
# Turn the epoch-seconds timestamps into calendar dates (time of day is dropped)
# NOTE(review): the raw column is overwritten, so this cell is presumably not
# safe to re-run without reloading the CSVs - confirm.
onion_dates = pd.to_datetime(df_onion['timestamp'], unit='s').dt.date
not_onion_dates = pd.to_datetime(df_not_onion['timestamp'], unit='s').dt.date
df_onion['timestamp'] = onion_dates
df_not_onion['timestamp'] = not_onion_dates

# Report the first and last posting date covered by each subreddit dump
print("TheOnion start date:", onion_dates.min())
print("TheOnion end date:", onion_dates.max())
print("Nottheonion start date:", not_onion_dates.min())
print("Nottheonion end date:", not_onion_dates.max())

TheOnion start date: 2021-01-01
TheOnion end date: 2021-12-31
Nottheonion start date: 2021-10-29
Nottheonion end date: 2021-12-31


In [292]:
# Count missing values per column, shown with one output column per dataset

null_counts = pd.DataFrame(
    [df_onion.isna().sum(), df_not_onion.isna().sum()],
    index=["TheOnion", "notheonion"],
)
null_counts.T

Unnamed: 0,TheOnion,notheonion
subreddit,0,0
author,0,0
domain,0,0
title,0,0
num_comments,0,0
score,0,0
timestamp,0,0


In [293]:
# Drop duplicate rows: r/TheOnion rows whose title repeats another row's title.
# The frame is modified in place and its index is not reset afterwards.
# NOTE(review): this runs before text_cleaning, so titles that only become
# identical after cleaning are not caught here - confirm whether that matters.

df_onion.drop_duplicates(subset='title', inplace=True)

In [294]:
# Drop duplicate rows: r/nottheonion rows whose title repeats another row's title.
# The frame is modified in place and its index is not reset afterwards.
# NOTE(review): de-duplication is per frame only; the head() previews further down
# appear to show the same mcdonald’s headline in both frames - confirm whether
# titles shared between the two datasets should be handled.

df_not_onion.drop_duplicates(subset='title', inplace=True)

##### 2b. Title-Punctuation Cleaning

In [295]:
#  Make text lowercase, remove text in square brackets, links, HTML-like tags,
#  ASCII punctuation, words containing digits and surplus whitespace.

def text_cleaning(text):
    """Normalise a headline for downstream tokenisation.

    Steps, in order: lowercase; drop ``[...]`` spans; drop http(s)/www links;
    drop ``<...>`` tags; drop ASCII punctuation; drop every word that contains
    a digit; collapse whitespace runs (including newlines and tabs) to a single
    space and trim both ends.

    Only the characters in ``string.punctuation`` are removed, so typographic
    characters such as the curly apostrophe in "mcdonald’s" pass through.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Newlines become separators instead of being deleted, so the words on
    # either side of a line break are not fused into one token.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [296]:
# Clean every r/TheOnion headline in place, then preview the first rows
df_onion['title'] = df_onion['title'].apply(text_cleaning)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp
0,TheOnion,mothershipq,theonion.com,surgeon kind of pissed patient seeing her defo...,0,1,2021-12-31
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked from building drivethrough ...,1,1,2021-12-31
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes it delights at...,0,1,2021-12-31
4,TheOnion,dwaxe,theonion.com,homeowner trying to smoke out snakes accidenta...,0,1,2021-12-31


In [297]:
# Clean every r/nottheonion headline in place, then preview the first rows
df_not_onion['title'] = df_not_onion['title'].apply(text_cleaning)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp
0,nottheonion,Taco_duck68,wral.com,man attempts to pay for car with rap steals po...,0,1,2021-12-31
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past as sex worker ...,1,1,2021-12-31
2,nottheonion,Lopsided_File_1642,facebook.com,log into facebook,1,1,2021-12-31
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked from building drivethrough ...,0,1,2021-12-31
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies at on new y...,0,1,2021-12-31


##### 2c. Title-Stop words removing

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. We would not want these words to take up space in our database or to take up valuable processing time. We can remove them easily by storing a list of the words that we consider to be stop words. NLTK (Natural Language Toolkit) in Python has lists of stop words for 16 different languages. [Source](https://www.geeksforgeeks.org/removing-stop-words-nltk-python/) : Geeks for Geeks

In [298]:
stop = stopwords.words('english')
# Testing membership against the list scans every stop word for every title
# word; a set gives the same answers with constant-time lookups. `stop` itself
# stays a list for any later use.
stop_set = set(stop)
# NOTE(review): titles arrive with ASCII punctuation already stripped, so
# contractions look like "dont" here - presumably not matched by apostrophised
# stop-word entries; confirm whether that matters.
df_onion['title'] = df_onion['title'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31


In [299]:
stop = stopwords.words('english')
# Testing membership against the list scans every stop word for every title
# word; a set gives the same answers with constant-time lookups. `stop` itself
# stays a list for any later use.
stop_set = set(stop)
# NOTE(review): titles arrive with ASCII punctuation already stripped, so
# contractions look like "dont" here - presumably not matched by apostrophised
# stop-word entries; confirm whether that matters.
df_not_onion['title'] = df_not_onion['title'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31


##### 2d. Title - Tokenization

Tokenization is the process of splitting a text object into smaller units known as tokens. Examples of tokens can be words, characters, numbers, symbols, or n-grams

In [300]:
def tokenize(text):
    """Split ``text`` on whitespace and return the lowercase tokens.

    Empty input, or input with leading/trailing whitespace, never produces
    empty-string tokens: ``tokenize("")`` is ``[]``.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        Lowercase tokens in their original order.
    """
    # str.split() with no argument splits on whitespace runs and discards the
    # empty pieces that re.split leaves at the edges of the string.
    return text.lower().split()

In [301]:
# Whitespace-tokenise each r/TheOnion title into a new `tokens` column
df_onion['tokens'] = df_onion['title'].apply(tokenize)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31,"[surgeon, kind, pissed, patient, seeing, defor..."
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ..."
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31,"[gwyneth, paltrow, touts, new, diamondencruste..."
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31,"[artist, crafting, music, box, hopes, delights..."
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31,"[homeowner, trying, smoke, snakes, accidentall..."


In [302]:
# Whitespace-tokenise each r/nottheonion title into a new `tokens` column
df_not_onion['tokens'] = df_not_onion['title'].apply(tokenize)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31,"[man, attempts, pay, car, rap, steals, potato,..."
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31,"[former, ufc, fighter, reveals, past, sex, wor..."
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31,"[log, facebook]"
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ..."
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31,"[legendary, actress, betty, white, dies, new, ..."


In [303]:
# Instantiate tokenizer: a token is an a-z letter followed by one or more word
# characters, so single-character words are dropped and a token ends at the
# first non-word character (e.g. "mcdonald’s" yields just "mcdonald").
tokenizer = RegexpTokenizer(r'[a-z]\w+')

# Run tokenizer.
def tokenize_1(text):
    """Tokenise ``text`` with the module-level regexp ``tokenizer``.

    The text is lowercased before matching. The pattern only accepts tokens
    that start with a lowercase a-z letter, so lowercasing afterwards would
    cut the leading capital off a word ("Hello" would become "ello").

    Parameters
    ----------
    text : str
        The text to tokenise.

    Returns
    -------
    list of str
        Lowercase tokens of at least two characters.
    """
    return tokenizer.tokenize(text.lower())

In [304]:
# Regexp-tokenise each r/TheOnion title into a new `tokens_1` column
df_onion['tokens_1'] = df_onion['title'].apply(tokenize_1)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31,"[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor..."
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro..."
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31,"[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, touts, new, diamondencruste..."
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31,"[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hopes, delights..."
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31,"[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snakes, accidentall..."


In [305]:
# Regexp-tokenise each r/nottheonion title into a new `tokens_1` column
df_not_onion['tokens_1'] = df_not_onion['title'].apply(tokenize_1)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31,"[man, attempts, pay, car, rap, steals, potato,...","[man, attempts, pay, car, rap, steals, potato,..."
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31,"[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor..."
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31,"[log, facebook]","[log, facebook]"
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro..."
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31,"[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dies, new, ..."


##### 2e. Title - Lemmatization

Lemmatization is the process of reducing a word to its base form (lemma); for example, run, running, runs and ran are all derived from the same word, run. Lemmatization is broadly similar to stemming, which removes a prefix or suffix such as -ing, -s or -es from a word. The NLTK library is used to lemmatize the words.

In [306]:

def lemmatize(text):
    """Lemmatise every token in ``text`` with NLTK's WordNetLemmatizer.

    No part-of-speech argument is passed, so the lemmatizer's default is used
    for every token.

    Parameters
    ----------
    text : iterable of str
        The tokens to lemmatise.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [307]:
# Lemmatise the regexp tokens of each r/TheOnion title into `lemma_words`

df_onion['lemma_words'] = df_onion['tokens_1'].apply(lemmatize)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31,"[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor..."
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro..."
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31,"[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, tout, new, diamondencrusted..."
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31,"[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hope, delight, ..."
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31,"[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snake, accidentally..."


In [308]:
# Lemmatise the regexp tokens of each r/nottheonion title into `lemma_words`
df_not_onion['lemma_words'] = df_not_onion['tokens_1'].apply(lemmatize)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31,"[man, attempts, pay, car, rap, steals, potato,...","[man, attempts, pay, car, rap, steals, potato,...","[man, attempt, pay, car, rap, steal, potato, c..."
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31,"[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor..."
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31,"[log, facebook]","[log, facebook]","[log, facebook]"
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro..."
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31,"[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dy, new, ye..."


##### 2f. Convert words to feature vectors

In [309]:
# Rebuild one space-separated string per row, to use as clean text input for vectors

def return_sentences(tokens):
    """Join ``tokens`` into a single string separated by single spaces.

    Parameters
    ----------
    tokens : iterable of str
        The words to join.

    Returns
    -------
    str
        The joined text; an empty string when ``tokens`` is empty.
    """
    return " ".join(tokens)

In [310]:
# Join each r/TheOnion row's lemmas back into a `clean_text` string
df_onion['clean_text'] = df_onion['lemma_words'].apply(return_sentences)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words,clean_text
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31,"[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor...",surgeon kind pissed patient seeing deformed fa...
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro...",mcdonald blocked building drivethrough rome an...
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31,"[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, tout, new, diamondencrusted...",gwyneth paltrow tout new diamondencrusted trep...
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31,"[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hope, delight, ...",artist crafting music box hope delight least o...
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31,"[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snake, accidentally...",homeowner trying smoke snake accidentally burn...


In [311]:
# Join each r/nottheonion row's lemmas back into a `clean_text` string
df_not_onion['clean_text'] = df_not_onion['lemma_words'].apply(return_sentences)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words,clean_text
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31,"[man, attempts, pay, car, rap, steals, potato,...","[man, attempts, pay, car, rap, steals, potato,...","[man, attempt, pay, car, rap, steal, potato, c...",man attempt pay car rap steal potato chip truck
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31,"[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor...",former ufc fighter reveals past sex worker fun...
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31,"[log, facebook]","[log, facebook]","[log, facebook]",log facebook
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro...",mcdonald blocked building drivethrough rome an...
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31,"[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dy, new, ye...",legendary actress betty white dy new year eve ...


##### 2g. Drop rows whose clean_text is empty (NA)

In [312]:
def change_to_nan(value):
    """Return NaN for an empty string and any other value unchanged.

    Nothing is printed: the only value that reaches the NaN branch is the
    empty string, so echoing it would just add blank lines to the cell output.

    Parameters
    ----------
    value : any
        The cell value to inspect.

    Returns
    -------
    float or any
        ``np.nan`` when ``value == ''``, otherwise ``value`` itself.
    """
    return np.nan if value == '' else value

In [313]:
# Mark empty cleaned r/TheOnion titles as missing so they can be dropped afterwards
df_onion['clean_text'] = df_onion['clean_text'].map(change_to_nan)







In [314]:
# Remove, in place, the r/TheOnion rows whose clean_text is NaN, then preview the result.
# Only one column is listed in `subset`, so how='all' selects the same rows as 'any' would.
df_onion.dropna(subset=['clean_text'], how='all',inplace=True)
df_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words,clean_text
0,TheOnion,mothershipq,theonion.com,surgeon kind pissed patient seeing deformed fa...,0,1,2021-12-31,"[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor...","[surgeon, kind, pissed, patient, seeing, defor...",surgeon kind pissed patient seeing deformed fa...
1,TheOnion,-ImYourHuckleberry-,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,1,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro...",mcdonald blocked building drivethrough rome an...
2,TheOnion,dwaxe,theonion.com,gwyneth paltrow touts new diamondencrusted tre...,0,1,2021-12-31,"[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, touts, new, diamondencruste...","[gwyneth, paltrow, tout, new, diamondencrusted...",gwyneth paltrow tout new diamondencrusted trep...
3,TheOnion,dwaxe,theonion.com,artist crafting music box hopes delights least...,0,1,2021-12-31,"[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hopes, delights...","[artist, crafting, music, box, hope, delight, ...",artist crafting music box hope delight least o...
4,TheOnion,dwaxe,theonion.com,homeowner trying smoke snakes accidentally bur...,0,1,2021-12-31,"[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snakes, accidentall...","[homeowner, trying, smoke, snake, accidentally...",homeowner trying smoke snake accidentally burn...


In [315]:
# Mark empty cleaned r/nottheonion titles as missing so they can be dropped afterwards
df_not_onion['clean_text'] = df_not_onion['clean_text'].map(change_to_nan)

































































In [316]:
# Remove, in place, the r/nottheonion rows whose clean_text is NaN, then preview the result.
# Only one column is listed in `subset`, so how='all' selects the same rows as 'any' would.
df_not_onion.dropna(subset=['clean_text'], how='all',inplace=True)
df_not_onion.head()

Unnamed: 0,subreddit,author,domain,title,num_comments,score,timestamp,tokens,tokens_1,lemma_words,clean_text
0,nottheonion,Taco_duck68,wral.com,man attempts pay car rap steals potato chip truck,0,1,2021-12-31,"[man, attempts, pay, car, rap, steals, potato,...","[man, attempts, pay, car, rap, steals, potato,...","[man, attempt, pay, car, rap, steal, potato, c...",man attempt pay car rap steal potato chip truck
1,nottheonion,BlackNingaa,bloodyelbow.com,former ufc fighter reveals past sex worker fun...,1,1,2021-12-31,"[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor...","[former, ufc, fighter, reveals, past, sex, wor...",former ufc fighter reveals past sex worker fun...
2,nottheonion,Lopsided_File_1642,facebook.com,log facebook,1,1,2021-12-31,"[log, facebook]","[log, facebook]","[log, facebook]",log facebook
3,nottheonion,SkinnyWhiteGirl19,theartnewspaper.com,mcdonald’s blocked building drivethrough romes...,0,1,2021-12-31,"[mcdonald’s, blocked, building, drivethrough, ...","[mcdonald, blocked, building, drivethrough, ro...","[mcdonald, blocked, building, drivethrough, ro...",mcdonald blocked building drivethrough rome an...
4,nottheonion,kids-cake-and-crazy,kjrh.com,legendary actress betty white dies new years e...,0,1,2021-12-31,"[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dies, new, ...","[legendary, actress, betty, white, dy, new, ye...",legendary actress betty white dy new year eve ...


In [317]:
# Write both processed frames to CSV files, without the row index
# NOTE(review): the list-valued columns (tokens, tokens_1, lemma_words) are
# presumably written as their string representation - confirm how the CSVs are read back.

for frame, csv_name in (
    (df_onion, 'processed_onion.csv'),
    (df_not_onion, 'processed_notonion.csv'),
):
    frame.to_csv(csv_name, index=False)