In [2]:
%reset -f

In [3]:
#imports
import pandas as pd
import numpy as np
import nltk
import re
import os
from tqdm import tqdm, tqdm_pandas
#tqdm_pandas(tqdm())
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt 
import matplotlib as mpl

def setup_mpl():
    mpl.rcParams['font.family'] = 'Times New Roman'
    return
setup_mpl()

# Text preprocessing

In [4]:
# Preprocessing function

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# Keep numbers / Remove numbers / Substitute numbers with token


def preprocess(text):
    #Common english stop words
    stop_words = set(stopwords.words('english'))
    # Substituting urls with 
    url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    text = re.sub(r'http\S+', '#URL#', str(text)) # Maybe change to single-character-symbol -> shingles
    #Tokenize using nltk
    word_tokens = word_tokenize(text.lower())
    #removing none letter characters and stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words and w.isalpha()]
    #Conduct stemming
    processed_text = [porter.stem(t) for t in filtered_sentence]
    return processed_text

### SMS spam: 

In [5]:
PATH = "../data/regular/spam.csv"
SMS = pd.read_csv(PATH)[['v1','v2']]
SMS = SMS.rename(columns={'v1': 'label', 'v2': 'text'})

label = SMS['label'].values
text = SMS['text'].values

In [6]:
SMS['tokens'] = [preprocess(x) for x in tqdm(SMS['text'])]
SMS['str_tokens'] = [' '.join(x) for x in tqdm(SMS['tokens'])]
# Removing rows of emtpty tokens
SMS = SMS[SMS['tokens'].astype(bool)]
binary_dict = {'ham': 0, 'spam': 1}
SMS['binary'] = [binary_dict[x] for x in tqdm(SMS['label'])]
SMS.to_csv('../data/clean/clean_spam.csv', index=False)


100%|██████████| 5572/5572 [00:02<00:00, 2104.88it/s]
100%|██████████| 5572/5572 [00:00<00:00, 1374502.26it/s]
100%|██████████| 5551/5551 [00:00<00:00, 2184516.94it/s]


### Email spam

In [7]:
PATH = "../data/regular/completeSpamAssassin.csv"
EMAIL = pd.read_csv(PATH, index_col = 'Unnamed: 0')

EMAIL.rename(columns = {'Label':'label', 'Body':'text'}, inplace = True)

EMAIL['label'] = EMAIL['label'].replace(0, 'ham')
EMAIL['label'] = EMAIL['label'].replace(1, 'spam')

# Removing 'empty' rows
EMAIL.text = np.where(EMAIL.text.isin(['empty']), np.nan, EMAIL.text)
EMAIL = EMAIL.dropna().reset_index().drop(columns=['index'])

In [8]:
EMAIL['tokens'] = [preprocess(x) for x in tqdm(EMAIL['text'])]
EMAIL['str_tokens'] = [' '.join(x) for x in tqdm(EMAIL['tokens'])]
EMAIL = EMAIL[EMAIL['tokens'].astype(bool)]
binary_dict = {'ham': 0, 'spam': 1}
EMAIL['binary'] = [binary_dict[x] for x in tqdm(EMAIL['label'])]
EMAIL.to_csv('../data/clean/clean_completeSpamAssassin.csv', index=False)

100%|██████████| 5512/5512 [00:28<00:00, 196.27it/s]
100%|██████████| 5512/5512 [00:00<00:00, 198809.87it/s]
100%|██████████| 5507/5507 [00:00<00:00, 1789158.18it/s]


In [9]:
EMAIL

Unnamed: 0,text,label,tokens,str_tokens,binary
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,spam,"[save, life, insur, spend, life, quot, save, e...",save life insur spend life quot save ensur fam...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam,"[fight, risk, cancer, url, slim, guarante, los...",fight risk cancer url slim guarante lose lb da...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam,"[fight, risk, cancer, url, slim, guarante, los...",fight risk cancer url slim guarante lose lb da...,1
3,##############################################...,spam,"[adult, club, offer, free, membership, instant...",adult club offer free membership instant acces...,1
4,I thought you might like these:\n1) Slim Down ...,spam,"[thought, might, like, slim, guarante, lose, l...",thought might like slim guarante lose lb day u...,1
...,...,...,...,...,...
5507,----------------------------------------------...,ham,"[isilo, tm, palm, os, pocket, pc, window, ente...",isilo tm palm os pocket pc window enter isilo ...,0
5508,"EFFector Vol. 15, No. 35 November ...",ham,"[effector, vol, novemb, ren, public, electron,...",effector vol novemb ren public electron fronti...,0
5509,\nWe have extended our Free seat sale until Th...,ham,"[extend, free, seat, sale, thursday, novemb, d...",extend free seat sale thursday novemb detail s...,0
5510,___ ___ ...,ham,"[insignific, matter, heavili, overemphasis, hu...",insignific matter heavili overemphasis hugh mt...,0


## Job postings

In [10]:
PATH = "../data/regular/Job_postings.csv"
JOB = pd.read_csv(PATH, usecols=['company_profile', 'description', 'fraudulent'])

JOB["text"] = JOB["company_profile"] + " " + JOB["description"]

JOB.rename(columns = {'fraudulent':'label'}, inplace = True)

JOB['label'] = JOB['label'].replace(0, 'fraudulent')
JOB['label'] = JOB['label'].replace(1, 'genuine')

JOB = JOB.drop(columns=['company_profile', 'description'])
JOB = JOB.dropna()

JOB
# Removing 'empty' rows
#EMAIL.Body = np.where(EMAIL.Body.isin(['empty']), np.nan, EMAIL.Body)
#EMAIL = EMAIL.dropna().reset_index().drop(columns=['index'])

Unnamed: 0,label,text
0,fraudulent,"We're Food52, and we've created a groundbreaki..."
1,fraudulent,"90 Seconds, the worlds Cloud Video Production ..."
2,fraudulent,Valor Services provides Workforce Solutions th...
3,fraudulent,Our passion for improving quality of life thro...
4,fraudulent,SpotSource Solutions LLC is a Global Human Cap...
...,...,...
17872,fraudulent,Flite delivers ad innovation at scale to the w...
17875,fraudulent,Vend is looking for some awesome new talent to...
17876,fraudulent,WebLinc is the e-commerce platform and service...
17877,fraudulent,We Provide Full Time Permanent Positions for m...


In [None]:
JOB['tokens'] = [preprocess(x) for x in tqdm(JOB['text'])]
JOB['str_tokens'] = [' '.join(x) for x in tqdm(JOB['tokens'])]
JOB = JOB[JOB['tokens'].astype(bool)]
binary_dict = {'genuine': 0, 'fraudulent': 1}
JOB['binary'] = [binary_dict[x] for x in tqdm(JOB['label'])]
JOB.to_csv('../data/clean/clean_Job_postings.csv', index=False)

 67%|██████▋   | 9777/14572 [01:00<00:35, 136.09it/s]

In [None]:
JOB

## NEWS

In [None]:
PATH = "../data/regular/NEWS.csv"
NEWS = pd.read_csv(PATH, usecols=['text', 'label'])

NEWS['tokens'] = [preprocess(x) for x in tqdm(NEWS['text'])]
NEWS['str_tokens'] = [' '.join(x) for x in tqdm(NEWS['tokens'])]
NEWS = NEWS[NEWS['tokens'].astype(bool)]
binary_dict = {'real': 0, 'fake': 1}
NEWS['binary'] = [binary_dict[x] for x in tqdm(NEWS['label'])]
NEWS.to_csv('../data/clean/clean_News.csv', index=False)