In [1]:
"""
dataset:

https://www.kaggle.com/team-ai/spam-text-message-classification/version/1
"""

'\ndataset:\n\nhttps://www.kaggle.com/team-ai/spam-text-message-classification/version/1\n'

In [2]:
import pandas as pd
import re
from nltk import ngrams, download
from nltk.corpus import stopwords
from collections import Counter

# Fetch the NLTK "stopwords" corpus so that stopwords.words("english") can be
# loaded further down; NLTK reports when the package is already up to date.
download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the spam/ham message CSV and give its two columns short names.
csv_path = "/content/SPAM text message 20170820 - Data.csv"
column_names = ["category", "sentence"]

df = pd.read_csv(csv_path)
df.columns = column_names

df.head(3)

Unnamed: 0,category,sentence
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# Normalise the raw messages: drop every character that is neither a word
# character nor whitespace, then lowercase what is left.
punctuation = re.compile(r"[^\w\s]")


def _normalise(text):
  """Return `text` lowercased, with non-word, non-whitespace characters removed."""
  return punctuation.sub("", text).lower()


df["clean_sentence"] = list(map(_normalise, df["sentence"]))
df.head(3)

Unnamed: 0,category,sentence,clean_sentence
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...


In [5]:
stop_words = set(stopwords.words("english"))

# Split each cleaned sentence on whitespace and keep only the tokens that are
# not NLTK English stop words; every row ends up holding a list of tokens.
# NOTE(review): clean_sentence has already had punctuation stripped, so
# apostrophe-bearing stop words (e.g. "don't") presumably never match their
# stripped forms ("dont") -- confirm whether that is intended.
df["stop_words_removed"] = [
  [word for word in sentence.split() if word not in stop_words]
  for sentence in df["clean_sentence"]
]

# removing empty rows
# A list is never NA, so notna() kept every row; test the token count instead
# to drop messages that consisted solely of stop words.
df = df[df["stop_words_removed"].map(len) > 0]

df.head(3)

Unnamed: 0,category,sentence,clean_sentence,stop_words_removed
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."


In [6]:
# Preview: the 5-grams produced from the first message's token list.
first_tokens = df["stop_words_removed"].iloc[0]

list(ngrams(first_tokens, 5))

[('go', 'jurong', 'point', 'crazy', 'available'),
 ('jurong', 'point', 'crazy', 'available', 'bugis'),
 ('point', 'crazy', 'available', 'bugis', 'n'),
 ('crazy', 'available', 'bugis', 'n', 'great'),
 ('available', 'bugis', 'n', 'great', 'world'),
 ('bugis', 'n', 'great', 'world', 'la'),
 ('n', 'great', 'world', 'la', 'e'),
 ('great', 'world', 'la', 'e', 'buffet'),
 ('world', 'la', 'e', 'buffet', 'cine'),
 ('la', 'e', 'buffet', 'cine', 'got'),
 ('e', 'buffet', 'cine', 'got', 'amore'),
 ('buffet', 'cine', 'got', 'amore', 'wat')]

In [7]:
# Tally every n-gram across the entire column.

# n gram size
n = 3

ngram_counts = Counter()
for tokens in df["stop_words_removed"]:
  # Token lists shorter than n contribute no n-grams.
  if len(tokens) < n:
    continue
  ngram_counts.update(ngrams(tokens, n))

# Ten most frequent n-grams, most frequent first.
ngram_counts.most_common(10)

[(('ill', 'call', 'later'), 42),
 (('sorry', 'ill', 'call'), 38),
 (('prize', 'guaranteed', 'call'), 21),
 (('u', 'wan', '2'), 21),
 (('happy', 'new', 'year'), 18),
 (('reply', 'call', '08000930705'), 14),
 (('land', 'line', 'claim'), 14),
 (('pls', 'send', 'message'), 13),
 (('private', '2003', 'account'), 13),
 (('2003', 'account', 'statement'), 13)]