In [31]:
import re
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
def load_data(path):
    """Read a CSV file and normalise its column names to lower case.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every column label lower-cased.
    """
    frame = pd.read_csv(path)
    # Lower-casing the headers lets later code address columns uniformly
    # (e.g. 'Message' and 'message' both become 'message').
    return frame.rename(columns=str.lower)

def merge_data(ling_data, evron_data):
    """Combine both corpora into one frame with ``message`` and ``label`` columns.

    The ``evron_data`` frame carries a textual ``category`` column; it is
    converted to a numeric ``label`` (1 where the category equals ``'spam'``,
    0 otherwise). ``ling_data`` must already provide a ``label`` column.

    Neither input frame is modified: the label column is built on a copy, so
    re-running the cell or inspecting ``evron_data`` afterwards shows the
    original columns.

    Parameters
    ----------
    ling_data : pandas.DataFrame
        Frame containing at least ``message`` and ``label`` columns.
    evron_data : pandas.DataFrame
        Frame containing at least ``message`` and ``category`` columns.

    Returns
    -------
    pandas.DataFrame
        ``evron_data`` rows followed by ``ling_data`` rows, restricted to the
        ``message`` and ``label`` columns. Index labels are carried over from
        the inputs, so they may repeat.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    # .assign returns a new frame, leaving the caller's evron_data untouched.
    evron_part = evron_data.assign(
        label=(evron_data['category'] == 'spam').astype('int')
    )[['message', 'label']]

    ling_part = ling_data[['message', 'label']]
    return pd.concat([evron_part, ling_part])

def filter_data(data, ham_divisor=4, random_state=None):
    """Down-sample the ham class, then shuffle the rows.

    Keeps the first ``len(ham) // ham_divisor`` ham rows (``label == 0``) in
    their original order together with every spam row (``label == 1``).
    Rows whose label is neither 0 nor 1 are dropped. The result is shuffled
    and given a fresh ``0..n-1`` index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``label`` column.
    ham_divisor : int, default 4
        Ham rows are reduced to one ``ham_divisor``-th of their count
        (integer division). The default keeps a quarter.
    random_state : int, numpy.random.Generator or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        shuffle reproducible across kernel restarts; ``None`` keeps the
        non-deterministic shuffle.

    Returns
    -------
    pandas.DataFrame
        The reduced, shuffled frame with a reset index.

    Raises
    ------
    ValueError
        If ``ham_divisor`` is smaller than 1.
    """
    if ham_divisor < 1:
        raise ValueError("ham_divisor must be a positive integer")

    ham_rows = data[data['label'] == 0]
    spam_rows = data[data['label'] == 1]

    # Integer division: any remainder rows of the ham class are discarded.
    ham_kept = ham_rows.iloc[:len(ham_rows) // ham_divisor]

    balanced = pd.concat([ham_kept, spam_rows])
    # frac=1 draws every row once without replacement, i.e. a pure shuffle.
    shuffled = balanced.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

def remove_stopwords_alphanum(messages):
    """Strip non-alphanumeric characters and English stop words from sentences.

    Parameters
    ----------
    messages : list of list of str
        One entry per message; each message is a list of sentence strings.

    Returns
    -------
    list of list of str
        Same nesting as the input. Each sentence is rebuilt from its
        surviving tokens joined by single spaces (an emptied sentence
        becomes ``''``).

    Notes
    -----
    The stop-word comparison is case-sensitive against NLTK's English list,
    which is lower case, so callers should lower-case the text first.
    """
    # Build the lookup once; calling stopwords.words() per token re-reads the
    # corpus list every time and makes membership checks linear.
    stop_words = set(stopwords.words('english'))

    new_messages = []
    for message in messages:
        cleaned_sents = []
        for sent in message:
            kept = []
            for token in word_tokenize(sent):
                # re.sub works on strings, so clean token by token.
                token = re.sub(r'[^a-zA-Z0-9\s]', '', token)
                # Pure-punctuation tokens collapse to '' and are dropped.
                if token and token not in stop_words:
                    kept.append(token)
            cleaned_sents.append(' '.join(kept))
        new_messages.append(cleaned_sents)
    return new_messages

def stem_lemmatize(messages):
    """Stem and then lemmatize every word of every sentence.

    Each token is passed through ``PorterStemmer.stem`` and the result
    through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    messages : list of list of str
        One entry per message; each message is a list of sentence strings.

    Returns
    -------
    list of list of str
        Same nesting as the input, with each sentence rebuilt from its
        normalised tokens joined by single spaces.
    """
    stem = PorterStemmer()
    lem = WordNetLemmatizer()

    new_messages = []
    for message in messages:
        # Use a separate accumulator: rebinding `message` here would empty
        # the sequence being iterated and silently skip every sentence.
        new_sents = []
        for sent in message:
            words = [lem.lemmatize(stem.stem(word)) for word in word_tokenize(sent)]
            new_sents.append(' '.join(words))
        new_messages.append(new_sents)
    return new_messages


def preprocess(data):
    """Lower-case the messages and split each one into sentences.

    Side effect: the ``message`` column of ``data`` is replaced in place by
    its lower-cased version.

    Stop-word removal and stemming are not applied here; the returned
    sentences are only lower-cased and sentence-split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing string ``message`` values and a ``label`` column.

    Returns
    -------
    tuple of (list of list of str, list)
        ``messages`` — one list of sentence strings per row, in row order —
        and ``labels`` — the ``label`` column as a plain list aligned with
        ``messages``.
    """
    data['message'] = data['message'].str.lower()

    labels = data['label'].tolist()
    messages = [sent_tokenize(message) for message in data['message'].tolist()]

    # Hand both results back; without a return the work above is lost.
    return messages, labels



In [28]:
# Relative locations of the two raw corpora.
ling_path = './data/Ling-spam/messages.csv'
evron_path = './data/evron/mail_data.csv'

# Read both sources, then merge them and down-sample ham into the working set.
ling_data, evron_data = load_data(ling_path), load_data(evron_path)
data = filter_data(merge_data(ling_data, evron_data))

In [29]:
# Preview the first five rows of the merged, shuffled dataset.
data.head(n=5)

Unnamed: 0,message,label
0,all you do is advertise this 800 number ! that...,1
1,If he started searching he will get job in few...,0
2,Do u ever get a song stuck in your head for no...,0
3,Auction round 4. The highest bid is now £54. N...,1
4,FREE for 1st week! No1 Nokia tone 4 ur mob eve...,1


In [23]:
# Class balance after filtering (label 1 = spam, label 0 = everything else).
data.loc[:, 'label'].value_counts()

label
0    1809
1    1228
Name: count, dtype: int64

In [6]:
# Preview the first five rows of the evron corpus.
evron_data.head(n=5)

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
