In [1]:
import numpy as np
import pandas as pd

In [3]:
# load the tab-separated SMS spam collection into a DataFrame
df = pd.read_csv(
    '../Datasets/smsspamcollection.tsv',
    sep='\t',
)

In [4]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
df.tail()

Unnamed: 0,label,message,length,punct
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1
5571,ham,Rofl. Its true to its name,26,1


In [11]:
# check missing values
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [None]:
# check for empty strings
# isnull() cannot see whitespace-only text, so count messages that are blank after stripping
df['message'].str.strip().eq('').sum()

In [12]:
# count how many messages carry each label (the classes are imbalanced: far more ham than spam)
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

---

In [13]:
# SPLIT THE DATA (Train, Test)
# import train_test_split
from sklearn.model_selection import train_test_split

In [14]:
# we're not going to be using 'length' and 'punct' this time, but only the 'message'
# let's see if the 'message' text alone can predict the 'label'

X = df['message'] #--> independent variable (feature): the raw message text
y = df['label'] #--> dependent variable (target): 'ham' or 'spam'

In [15]:
# hold out 33% of the rows for testing; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [16]:
X_train.shape #--> holds ~67% of the data (test_size=0.33 leaves the rest for training)

(3733,)

In [17]:
X_test.shape #--> holds ~33% of the data (test_size=0.33)

(1839,)

---

In [18]:
# FEATURE EXTRACTION (vectorization)
# we perform FE only on TEXT
# text pre-processing and tokenizing are built into CountVectorizer (stop-word filtering is optional, via its stop_words parameter)
# CV --> builds a dictionary of features and transforms documents into feature vectors

In [18]:
# import CountVectorizer first
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
count_vect = CountVectorizer()

In [20]:
# create a Document Term Matrix (one row per message, one column per vocabulary token)

# this procedure is known as FIT TRANSFORM; there are two ways to do it:

# 1 - fit the vectorizer to the data (build a vocabulary), then transform the text into word counts
#count_vect.fit(X_train) #--> build the vocabulary from the training text
#X_train_counts = count_vect.transform(X_train) #--> transform the original text --> count vectors

# 2 - perform fit and transform together
dtm = count_vect.fit_transform(X_train) #--> this does the two steps above in one call

In [21]:
# inspect dtm
dtm

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [26]:
dtm.shape

(3733, 7082)

In [26]:
# 7082 columns --> one per unique token in the training vocabulary; the matrix is sparse, so most entries are 0 (only 49992 values are stored)

In [1]:
# transform counts to frequencies with tf-idf
# use a tf-idf vectorizer
# train the classifier
# combine the pipeline

In [22]:
# next, import TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer

In [23]:
tfidf_transformer = TfidfTransformer()

In [24]:
# pass our document term matrix into tf-idf
X_train_tfidf = tfidf_transformer.fit_transform(dtm)

In [25]:
X_train_tfidf.shape # same as dtm.shape!

(3733, 7082)

---

In [27]:
# we can even combine count vectorization and tfidf transformation by using TfidfVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
vectorizer = TfidfVectorizer()

In [30]:
# this way we get count vectorization and tfidf transformation in one line of code
# note: this reassigns X_train_tfidf, replacing the matrix produced by TfidfTransformer earlier
X_train_tfidf = vectorizer.fit_transform(X_train) #--> we need the raw X_train text here, not dtm

In [31]:
# now that we've extracted features we can import the model

In [32]:
from sklearn.svm import LinearSVC

In [33]:
classifier = LinearSVC()

In [36]:
# train the model --> X_train and y_train
classifier.fit(X_train_tfidf, y_train) # --> here we need to pass in the X_train_tfidf not the X_train

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [37]:
# predict with the model

# until now, only our training set has been vectorized --> the test set needs the same treatment
# we can't do this directly ---> classifier.predict(X_test), because X_test is still raw text

# we can either repeat the steps by hand (transform X_test with the fitted tf-idf vectorizer, then predict)
# or combine the steps above by using a pipeline

In [38]:
from sklearn.pipeline import Pipeline #--> a short way to chain tf-idf vectorization and our model in one object

In [43]:
# Pipeline() takes a list of (name, estimator) tuples that are applied in order
pipeline_classifier = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('classifier', LinearSVC()),
    ]
)

In [44]:
# fit the pipeline on the raw training text: it vectorizes with tf-idf and then trains LinearSVC
pipeline_classifier.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [45]:
# test the classifier and display the result

# use our pipeline to perform tf-idf and LinearSVC
predictions = pipeline_classifier.predict(X_test)

In [51]:
from sklearn.metrics import confusion_matrix, classification_report

In [47]:
# display the matrix: rows are the true labels, columns the predicted ones (order: ham, spam)
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [48]:
# classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [52]:
from sklearn import metrics

In [55]:
metrics.accuracy_score(y_test, predictions)

0.989668297988037

---

In [56]:
# predict on a new message

In [61]:
pipeline_classifier.predict(['This is going to be wild,man!'])

array(['ham'], dtype=object)

In [62]:
pipeline_classifier.predict(['You won a prize, call this number as fast as you can'])

array(['spam'], dtype=object)