In [23]:
import pandas as pd
import numpy as np
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [5]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [8]:
# Count the missing values in each column.
# Every count comes back as zero, so no data is missing.
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [9]:
# Number of rows (messages) in the dataset.
df.shape[0]

5572

In [13]:
# Distinct values in the label column: there are exactly two classes.
pd.unique(df['label'])

array(['ham', 'spam'], dtype=object)

In [15]:
# Number of messages in each class.
label_counts = df['label'].value_counts()
label_counts

ham     4825
spam     747
Name: label, dtype: int64

# Feature Extraction from Text


In [25]:
from sklearn.model_selection import train_test_split

In [40]:
# Features are the raw message text; the target is the ham/spam label.
X = df['message']
y = df['label']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [41]:
# Count vectorization: CountVectorizer takes care of text pre-processing and tokenization.
# Stop-word filtering is an option of CountVectorizer, but it is only applied when
# `stop_words` is set (per the scikit-learn docs it is off by default — confirm for your version).
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [38]:
# X is still raw text at this point; preview the first few messages
# rather than echoing the whole Series.
X.head()

In [35]:
# Next, pass the training messages (X_train) to count_vect to turn them into count vectors

In [43]:
# Approach 1: two separate steps.
# Fit the vectorizer to the training text (builds the vocabulary, counts the words, ...):
#     count_vect.fit(X_train)
# Then transform the original text messages into vectors:
#     X_train_counts = count_vect.transform(X_train)


# Approach 2: do both of the steps above in a single call.
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

# TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfTransformer

In [45]:
# Transformer that turns a matrix of token counts into TF-IDF features.
tfidf_transfromer = TfidfTransformer()

In [46]:
# Fit the transformer on the training counts and convert them in one call.
X_train_tfidf = tfidf_transfromer.fit_transform(X_train_counts)

In [48]:
# The TF-IDF matrix has the same dimensions as the count matrix it was built from.
X_train_tfidf.shape

(3733, 7082)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# TfidfVectorizer works on raw text directly, so no separate counting step is needed.
vectorizer = TfidfVectorizer()

In [52]:
# Fit on the raw training messages; this replaces the earlier X_train_tfidf.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train a classifier


In [56]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier on the TF-IDF training features.
# NOTE(review): no random_state is passed, so repeated fits may not be exactly
# reproducible — confirm whether that matters for this analysis.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [57]:
# So far only the training set has been vectorized; to evaluate the model
# we would have to repeat the same steps on the test set, which gets tedious.
# A Pipeline bundles the vectorizer and the classifier into a single object.

from sklearn.pipeline import Pipeline


In [60]:
# Chain TF-IDF vectorization and the linear SVM into a single estimator.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [61]:
# Fit the whole pipeline (vectorizer, then classifier) on the raw training messages
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

# Test the classifier and display results

In [62]:
# Predict a label for every message in the held-out test set
predictions = text_clf.predict(X_test)

In [63]:
# Confusion matrix for the test set: rows are the true labels, columns the predictions.
from sklearn import metrics

conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
print(conf_matrix)

[[1586    7]
 [  12  234]]


In [64]:
# Per-class precision, recall and F1 score on the test set.
report = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [65]:
# Check the overall accuracy.
# `metrics` was already imported for the confusion matrix; repeating the import is harmless.
from sklearn import metrics


In [66]:
# Fraction of test messages whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

0.989668297988037

In [67]:
# Try the fitted pipeline on a new, unseen message
text_clf.predict(["Hi how are you doing?"])

array(['ham'], dtype=object)

In [68]:
# A prize-announcement message, which the model labels as spam
text_clf.predict(["Congratulations, you have won $100"])

array(['spam'], dtype=object)