### DataSet Overview

In [34]:
import numpy as np
import pandas as pd

In [35]:
# Load the SMS spam collection; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [36]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [37]:
# Count missing values per column (all zeros means nothing needs imputing).
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [38]:
# Class balance: number of rows for each label value.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Only the raw message text is used as the feature; the label column is the target.
X, y = df['message'], df['label']

### 2 Steps - Count Vectorization & Tfidf Transformer

#### Count Vectorization on the Data

In [41]:
# Hold out 33% of the data for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# CountVectorizer with default settings (no arguments passed).
count_vect = CountVectorizer()

In [44]:
# Two-step version (shown for reference, left commented out):
#   1. Fit the vectorizer to the training text (builds the vocabulary):
#        count_vect.fit(X_train)
#   2. Transform the raw text messages into count vectors:
#        X_train_counts = count_vect.transform(X_train)

# fit_transform performs both steps in a single call.
X_train_counts = count_vect.fit_transform(X_train)

In [45]:
X_train_counts
# The result is a sparse matrix:
# 3733 training messages (rows) by 7082 unique vocabulary terms (columns).

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [46]:
# The raw training data is one-dimensional: one text entry per message.
X_train.shape

(3733,)

In [47]:
# After vectorization: one row per message, one column per vocabulary term.
X_train_counts.shape

(3733, 7082)

#### Tfidf Transformer

In [48]:
from sklearn.feature_extraction.text import TfidfTransformer

In [49]:
# TfidfTransformer with default settings; it operates on a count matrix, not on raw text.
tfidf_transformer = TfidfTransformer()

In [50]:
# Re-weight the raw training counts into TF-IDF scores.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [51]:
X_train_tfidf.shape
# The shape is unchanged, but the values are no longer raw counts:
# each entry is now the term frequency multiplied by the inverse document frequency.

(3733, 7082)

### TfidfVectorizer - Combining Count Vectorization and Tfidf Transformer (the same 2 steps as above but combined into 1 🙃)

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
# TfidfVectorizer goes straight from raw text to TF-IDF features in one estimator.
vectorizer = TfidfVectorizer()

In [54]:
# Fit directly on the raw training text; this replaces the X_train_tfidf
# that was computed by the two-step approach.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [55]:
# Same shape as the two-step result: (messages, vocabulary terms).
X_train_tfidf.shape

(3733, 7082)

### Training A Classifier

In [56]:
from sklearn.svm import LinearSVC

In [57]:
# Linear support vector classifier with default hyperparameters.
clf = LinearSVC()

In [58]:
# Train the classifier on the TF-IDF training features and their labels.
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Only our training set has been vectorized into a full vocabulary. In order to perform an analysis on our test set, we would have to run it through the same procedures (transforming it with the vectorizer that was fitted on the training data).
This can get a bit tiresome, especially if you have a long process; therefore scikit-learn provides a Pipeline class.

### Creating the Pipeline

In [59]:
from sklearn.pipeline import Pipeline

In [60]:
# Pattern: name = Pipeline([('1st name', 1st estimator), ('2nd name', 2nd estimator)])
#
# The steps run in order: TF-IDF vectorization first, then classification,
# all behind a single Pipeline object.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [61]:
# Fit on the original (raw text) X_train and y_train from the train/test split;
# the pipeline's 'tfidf' step vectorizes the text before the 'clf' step is trained.

text_clf.fit(X_train,y_train)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

#### Using the Pipeline

In [62]:
# X_test is still the original / raw test data (not yet vectorized or classified)
# X_test

In [63]:
# The pipeline takes the raw test messages, vectorizes them with its fitted
# 'tfidf' step, and then classifies them with its 'clf' step.
predictions = text_clf.predict(X_test)

In [64]:
# Inspect the predicted labels (one per test message).
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [65]:
from sklearn.metrics import confusion_matrix,classification_report

# Rows are the true labels and columns the predicted labels, in sorted label
# order (ham, spam); the diagonal holds the correctly classified messages.
print(confusion_matrix(y_test,predictions))

[[1586    7]
 [  12  234]]


In [32]:
# Per-class precision, recall and F1-score on the held-out test set.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [33]:
from sklearn import metrics

# Overall fraction of test messages classified correctly. The test set is
# mostly ham (1593 of 1839), so read this together with the per-class metrics.
metrics.accuracy_score(y_test, predictions)

0.989668297988037

### Predict on Our Own Data

In [66]:
# predict works on a collection of documents, so the single message is wrapped in a list.
text_clf.predict(["Hi how are you doing today?"])

array(['ham'], dtype=object)

In [67]:
# A message with prize/contest wording, to check that the pipeline flags it as spam.
text_clf.predict(["Congratulations! You've been selected as a winner. TEXT WON to 44255 congratulations free entry to contest."])

array(['spam'], dtype=object)