In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the tab-separated SMS collection (columns: label, message, length, punct).
df = pd.read_csv(
    "smsspamcollection.tsv",
    sep="\t",
)

In [7]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [8]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [9]:
# Number of messages (rows) in the dataset.
df.shape[0]

5572

In [10]:
# Feature: the raw message text. Target: the ham/spam label.
X, y = df["message"], df["label"]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Size of the training feature set (one message per entry).
X_train.shape

(3900,)

In [14]:
# Size of the training labels; should match X_train.shape.
y_train.shape

(3900,)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [16]:
# Chain TF-IDF feature extraction with a linear SVM so raw message text can be
# passed straight to fit/predict.
text_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)

In [17]:
# Fit the vectorizer and the classifier on the training messages only.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [18]:
from sklearn import metrics

In [19]:
# Predicted labels for the held-out messages; reused by the metric cells below.
predict = text_clf.predict(X_test)

In [20]:
# Raw confusion matrix: true labels against predicted labels on the test set.
metrics.confusion_matrix(y_true=y_test, y_pred=predict)

array([[1445,    3],
       [  10,  214]], dtype=int64)

In [21]:
# Labelled confusion matrix for readability.
# The frame gets its own name instead of rebinding `df`: overwriting `df` would
# discard the loaded dataset, and re-running any earlier cell that reads
# df['message'] / df['label'] would then fail with a KeyError.
# Labels come from the fitted pipeline so the row/column headers always follow
# the same class order the matrix is computed in.
class_labels = list(text_clf.classes_)
confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predict, labels=class_labels),
    index=class_labels,    # rows: true labels
    columns=class_labels,  # columns: predicted labels
)
confusion_df

Unnamed: 0,ham,spam
ham,1445,3
spam,10,214


In [23]:
# Per-class precision / recall / F1. The report is a preformatted string, so it
# is printed rather than displayed as a repr.
report = metrics.classification_report(y_true=y_test, y_pred=predict)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [24]:
# Overall fraction of test messages classified correctly.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predict)
print(accuracy)

0.9922248803827751


In [25]:
# Spot-check: a prize-style message typed by hand.
spam_like_sample = ['Hurray!!!You are selected as a Winner.Text WON to 4455522']
text_clf.predict(spam_like_sample)


array(['spam'], dtype=object)

In [26]:
# Spot-check: an ordinary conversational message typed by hand.
ham_like_sample = ['Hii,How are you!!']
text_clf.predict(ham_like_sample)

array(['ham'], dtype=object)