In [1]:
import pandas as pd
import string


In [2]:
# Mount Google Drive inside the Colab runtime so that files under
# '/content/drive' can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the labelled ham/spam text dataset from the mounted Drive folder.
data = pd.read_csv('/content/drive/My Drive/spamdata.csv')

In [4]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data['label'].value_counts(normalize=True)

label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [6]:
# Pre-processing walkthrough on a single message:
# take the text stored under index label 0 and lower-case it.
cleaned = data.loc[0, 'text'].lower()

In [7]:
cleaned

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [8]:
punctuations=string.punctuation

In [9]:
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Delete every character listed in `punctuations` from the message.
cleaned = cleaned.translate(str.maketrans("", "", punctuations))

In [12]:
cleaned

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [13]:
from spacy.lang.en import English

In [14]:
nlp=English()

In [15]:
my_doc=nlp(cleaned)

In [16]:
my_doc

go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat

In [17]:
# Collect the text of each token in the tokenised message.
token_list = [token.text for token in my_doc]

In [18]:
token_list

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [19]:
#import stop words
from spacy.lang.en.stop_words import STOP_WORDS

In [20]:
# Keep only the tokens whose vocabulary entry is not flagged as a stop word.
filtered_sentence = [word for word in token_list if not nlp.vocab[word].is_stop]

# Show the tokens before and after stop-word removal.
print(token_list)
print(filtered_sentence)

# `cleaned` now holds the list of kept tokens; it is joined back into a
# single string in the following cell.
cleaned = filtered_sentence

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [22]:
# Join the kept tokens back into one string. Reading from
# `filtered_sentence` rather than from `cleaned` itself keeps this cell safe
# to re-run: joining an already-joined string would insert a space between
# every character.
cleaned = " ".join(filtered_sentence)

In [23]:
cleaned

'jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [24]:
# Preprocessing function
def clean_text(text, nlp_pipeline=None):
    """Lower-case a message, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw message. Anything that is not a ``str`` (for example ``None`` or
        the ``NaN`` pandas uses for missing cells) is treated as an empty
        message.
    nlp_pipeline : callable, optional
        Callable that turns a string into an iterable of tokens exposing
        ``.text`` and ``.is_stop`` (a spaCy ``Language`` object does).
        Defaults to the notebook-level ``nlp`` at call time.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # Resolve the pipeline lazily so the default follows whatever `nlp`
    # is bound to when the function is called.
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline

    # Lower-case, then delete every ASCII punctuation character in one pass.
    stripped = text.lower().translate(str.maketrans("", "", string.punctuation))

    # Tokenise once and read the stop-word flag straight off each token
    # instead of looking every token text up in the vocabulary again.
    kept = [token.text for token in pipeline(stripped) if not token.is_stop]

    return " ".join(kept)

In [25]:
# Run the preprocessing function over every message and store the result
# alongside the raw text, then preview the frame.
data["cleaned"] = data["text"].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


# Feature Engineering and Model Building

In [28]:
# Meta features describing the size of each message.

# Whitespace-separated word counts of the raw and of the cleaned text.
data["word_count"] = data["text"].apply(lambda message: len(message.split()))
data["word_count_cleaned"] = data["cleaned"].apply(lambda message: len(message.split()))

# Length of the cleaned text, with and without its space characters.
data["char_count"] = data["cleaned"].apply(len)
data["char_count_without_spaces"] = data["cleaned"].apply(
    lambda message: len(message.replace(" ", ""))
)

# Number of whitespace-separated tokens of the cleaned text for which
# str.isdigit() is true.
data["num_dig"] = data["cleaned"].apply(
    lambda message: sum(token.isdigit() for token in message.split())
)

In [29]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


In [30]:
# Counting nouns and verbs

In [31]:
import spacy

# Load the en_core_web_sm pipeline.
# NOTE(review): this rebinds the notebook-level name `nlp`, which
# clean_text reads at call time, so any later call to clean_text will run
# through this pipeline instead of the English() one created earlier.
nlp = spacy.load("en_core_web_sm")

# Parse the cleaned text stored under index label 0.
document = nlp(data.loc[0, 'cleaned'])

In [32]:
document

jurong point crazy available bugis n great world la e buffet cine got amore wat

In [33]:
# POS tag (token.tag_) of every token in the parsed message.
all_tags = [token.tag_ for token in document]

In [34]:
all_tags

['NNP',
 'VBP',
 'NNP',
 'JJ',
 'NNP',
 'CC',
 'JJ',
 'NN',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'VBD',
 'NNP',
 'NN']

In [None]:
# TODO: count the nouns and verbs in all_tags