In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer 
import re

In [3]:
from nltk.stem import WordNetLemmatizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Load spam.csv into a DataFrame; the file is decoded as ISO-8859-1.
messages = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

## Data Cleaning

In [7]:
# Keep only the first two columns and name them "label" and "message".
messages = messages.iloc[:, :2].set_axis(["label", "message"], axis=1)

In [8]:
# Display the frame to check the "label" / "message" columns.
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Data Preprocessing


In [9]:
# Token normalizers used by the preprocessing cells:
# a WordNet lemmatizer, an English Snowball stemmer and a Porter stemmer.
lem = WordNetLemmatizer()
sb = SnowballStemmer('english')
ps = PorterStemmer()

In [10]:
# Build the set of English stop words used for filtering tokens.
#
# The result is bound to the name `stopwords` because the preprocessing loops
# test `word not in stopwords`. The NLTK corpus reader is imported here under a
# separate alias so that this cell stays re-runnable: without the alias, a second
# run would call `.words(...)` on the set created by the first run and fail.
from nltk.corpus import stopwords as _nltk_stopwords

stopwords = set(_nltk_stopwords.words('english'))

### Porter Stemming


In [11]:
# Build the Porter-stemmed corpus, one cleaned string per message:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop stop words and stem the remaining tokens with `ps`,
#   4. re-join the tokens with single spaces.
# The loop iterates over the column's values directly instead of looking up
# messages['message'][i] for i in range(len(messages)); that lookup is
# label-based and fails when the frame's index is not exactly 0..n-1.
corpus_porter = []
for message in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    review = " ".join(ps.stem(word) for word in tokens if word not in stopwords)
    corpus_porter.append(review)
    

### Snowball Stemming


In [12]:
# Build the Snowball-stemmed corpus, one cleaned string per message:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop stop words and stem the remaining tokens with `sb`,
#   4. re-join the tokens with single spaces.
# The loop iterates over the column's values directly instead of looking up
# messages['message'][i] for i in range(len(messages)); that lookup is
# label-based and fails when the frame's index is not exactly 0..n-1.
corpus_snow = []
for message in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    review = " ".join(sb.stem(word) for word in tokens if word not in stopwords)
    corpus_snow.append(review)
    

### Lemmatizing


In [13]:
# Build the lemmatized corpus, one cleaned string per message:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop stop words and lemmatize the remaining tokens with `lem`,
#   4. re-join the tokens with single spaces.
# The loop iterates over the column's values directly instead of looking up
# messages['message'][i] for i in range(len(messages)); that lookup is
# label-based and fails when the frame's index is not exactly 0..n-1.
corpus_lem = []
for message in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    review = " ".join(lem.lemmatize(word) for word in tokens if word not in stopwords)
    corpus_lem.append(review)

## Bag of Words (BoW)


In [14]:
# Count-based bag-of-words vectorizer with max_features=2500.
bow = CountVectorizer(
    max_features=2500,
)

# Alternative: binary (presence/absence) features instead of counts.
# bow = CountVectorizer(max_features=2500, binary=True)

In [15]:
# Vectorize each corpus with its own CountVectorizer so that every feature
# matrix keeps a fitted vocabulary matching its columns. Fitting the single
# `bow` object three times in a row would leave only the last vocabulary
# available, making the columns of X_lem and X_porter impossible to map back
# to terms.
# `bow_lem` and `bow_porter` are fresh vectorizers built from `bow`'s
# parameters; `bow` itself still ends up fitted on `corpus_snow`.
bow_lem = CountVectorizer(**bow.get_params())
bow_porter = CountVectorizer(**bow.get_params())

X_lem = bow_lem.fit_transform(corpus_lem).toarray()
X_porter = bow_porter.fit_transform(corpus_porter).toarray()
X_snow = bow.fit_transform(corpus_snow).toarray()

In [16]:
# Compare the shapes of the three bag-of-words matrices.
X_lem.shape, X_porter.shape, X_snow.shape

((5572, 2500), (5572, 2500), (5572, 2500))

In [17]:
# Inspect the term -> column-index mapping held by `bow`.
# `bow` was most recently fitted on corpus_snow, so these are Snowball-stemmed terms.
bow.vocabulary_

{'go': 804,
 'point': 1617,
 'crazi': 448,
 'avail': 141,
 'bugi': 273,
 'great': 828,
 'world': 2439,
 'la': 1114,
 'cine': 367,
 'got': 819,
 'wat': 2369,
 'ok': 1513,
 'lar': 1122,
 'joke': 1060,
 'wif': 2406,
 'oni': 1521,
 'free': 743,
 'entri': 613,
 'wkli': 2427,
 'comp': 402,
 'win': 2410,
 'fa': 656,
 'cup': 462,
 'final': 699,
 'tkts': 2213,
 'st': 2053,
 'may': 1293,
 'text': 2169,
 'receiv': 1725,
 'question': 1689,
 'std': 2064,
 'txt': 2270,
 'rate': 1706,
 'appli': 99,
 'dun': 575,
 'say': 1832,
 'earli': 579,
 'alreadi': 67,
 'nah': 1433,
 'think': 2184,
 'goe': 807,
 'usf': 2314,
 'live': 1184,
 'around': 114,
 'though': 2190,
 'freemsg': 745,
 'hey': 884,
 'darl': 481,
 'week': 2384,
 'word': 2436,
 'back': 154,
 'like': 1170,
 'fun': 764,
 'still': 2066,
 'tb': 2144,
 'xxx': 2467,
 'send': 1870,
 'rcv': 1709,
 'even': 627,
 'brother': 263,
 'speak': 2030,
 'treat': 2248,
 'per': 1574,
 'request': 1758,
 'mell': 1311,
 'oru': 1539,
 'minnaminungint': 1341,
 'nurungu':

## TF-IDF


In [18]:
# TF-IDF vectorizer with max_features=2500.
tf_idf = TfidfVectorizer(
    max_features=2500,
)

In [19]:
# Fit the TF-IDF vectorizer on the lemmatized corpus, then densify the result.
tfidf_sparse = tf_idf.fit_transform(corpus_lem)
X_tfidf = tfidf_sparse.toarray()

In [20]:
# Display the dense TF-IDF matrix.
X_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
import numpy as np


def _compact_float(value):
    """Format a float with up to three significant digits."""
    return "%.3g" % value


# Make NumPy print more of each array: 30 items at each edge before eliding,
# a very large line width, and floats rendered through `_compact_float`.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": _compact_float},
)
    

In [25]:
# Display X_tfidf again, now rendered with the NumPy print options set in the previous cell.
X_tfidf

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

## N-grams (bigram TF-IDF)



In [26]:
# TF-IDF vectorizer over bigrams only (ngram_range=(2, 2)), max_features=2500.
tf_idf_ngram = TfidfVectorizer(
    max_features=2500,
    ngram_range=(2, 2),
)

In [27]:
# Fit the bigram TF-IDF vectorizer on the lemmatized corpus, then densify the result.
tfidf_ngram_sparse = tf_idf_ngram.fit_transform(corpus_lem)
X_tfidf_ngram = tfidf_ngram_sparse.toarray()

In [29]:
# Inspect the bigram -> column-index mapping learned by the n-gram TF-IDF vectorizer.
tf_idf_ngram.vocabulary_

{'ok lar': 1474,
 'free entry': 650,
 'entry wkly': 580,
 'wkly comp': 2431,
 'cup final': 403,
 'tkts st': 2109,
 'std txt': 1938,
 'txt rate': 2210,
 'rate apply': 1703,
 'think go': 2060,
 'freemsg hey': 670,
 'like fun': 1149,
 'treat like': 2176,
 'per request': 1552,
 'request melle': 1759,
 'melle melle': 1275,
 'melle oru': 1276,
 'oru minnaminunginte': 1507,
 'minnaminunginte nurungu': 1305,
 'nurungu vettam': 1454,
 'vettam set': 2300,
 'set callertune': 1842,
 'callertune caller': 194,
 'caller press': 192,
 'press copy': 1671,
 'copy friend': 371,
 'friend callertune': 673,
 'winner valued': 2418,
 'valued network': 2297,
 'network customer': 1408,
 'claim call': 245,
 'call claim': 151,
 'claim code': 246,
 'code kl': 282,
 'kl valid': 1036,
 'valid hour': 2294,
 'entitled update': 577,
 'update latest': 2235,
 'latest colour': 1097,
 'colour mobile': 302,
 'free call': 645,
 'call mobile': 172,
 'mobile update': 1334,
 'update co': 2234,
 'co free': 270,
 'want talk': 234