In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('smsspamcollection.tsv',sep='\t')

In [5]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [6]:
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [8]:
from sklearn.model_selection import train_test_split

X = df[['length','punct']]

y = df['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


In [10]:
X_train.shape

(4457, 2)

In [11]:
X_test.shape

(1115, 2)

In [13]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [14]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()

In [15]:
lr_model.fit(X_train,y_train)

LogisticRegression()

In [24]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [17]:
predictions = lr_model.predict(X_test)

In [18]:
predictions

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [21]:
print(confusion_matrix(y_test,predictions))

[[927  35]
 [147   6]]


In [23]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.86      0.96      0.91       962
        spam       0.15      0.04      0.06       153

    accuracy                           0.84      1115
   macro avg       0.50      0.50      0.49      1115
weighted avg       0.76      0.84      0.79      1115



In [25]:
print(accuracy_score(y_test,predictions))

0.8367713004484305


### # Considering X as message

In [27]:
X = df['message']

y = df['label']

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [31]:
count_vect = CountVectorizer()

In [32]:
x_train_counts = count_vect.fit_transform(X_train)

In [33]:
x_train_counts

<4179x7509 sparse matrix of type '<class 'numpy.int64'>'
	with 55283 stored elements in Compressed Sparse Row format>

In [34]:
X_train.shape

(4179,)

In [35]:
x_train_counts.shape

(4179, 7509)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
tfidf_transform = TfidfVectorizer()

In [44]:
X_train_tfidf = tfidf_transform.fit_transform(X_train)

In [45]:
from sklearn.svm import LinearSVC

In [46]:
clf = LinearSVC()

In [47]:
clf.fit(X_train_tfidf,y_train)

LinearSVC()

In [48]:
from sklearn.pipeline import Pipeline

In [49]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [50]:
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [51]:
predictions = text_clf.predict(X_test)

In [52]:
print(confusion_matrix(y_test,predictions))

[[1196    3]
 [  17  177]]


In [53]:
print(accuracy_score(y_test,predictions))

0.9856424982053122


In [57]:
text_clf.predict(['Hi there'])

array(['ham'], dtype=object)