In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the labelled spam/ham messages.
# The CSV location can be overridden through the SPAM_EMAIL_CSV environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original hard-coded path is used unchanged.
DATA_PATH = os.environ.get("SPAM_EMAIL_CSV", r"C:\Users\kanch\Downloads\spam_email.csv")
df = pd.read_csv(DATA_PATH)

In [6]:
# Preview the first rows to confirm the file loaded with `label` and `text` columns.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# A row without a label cannot be used for supervised training, and filling it
# with '' would create an invalid class that breaks the later integer cast of
# the labels - so such rows are dropped. Any remaining gaps (e.g. missing
# message text) are replaced with an empty string, as before.
# Rebinding `df` instead of using inplace=True keeps the step chainable.
df = df.dropna(subset=['label']).fillna('')

In [8]:
# Count missing values per column; every count should be 0 after the cleaning step.
df.isnull().sum()

label    0
text     0
dtype: int64

In [9]:
# Encode the target in place: spam -> 1, ham -> 0.
# The column keeps its object dtype here; it is cast to int after the split.
df.loc[df['label'].eq('spam'), 'label'] = 1
df.loc[df['label'].eq('ham'), 'label'] = 0

In [10]:
# Re-inspect the first rows: `label` should now hold 1 (spam) / 0 (ham).
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# (rows, columns) of the cleaned frame.
df.shape    


(5572, 2)

In [12]:
# X: raw message text (model input); y: encoded spam/ham label (target).
X, y = df['text'], df['label']

In [13]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Sanity-check the split: the two sizes should add up to the number of rows in df.
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (4457,)
Test set size: (1115,)


In [15]:
# Turn raw text into TF-IDF weighted term vectors.
# English stop words are removed and text is lower-cased before tokenising.
feature_extractor = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transform to the test messages so no information from
# the held-out set influences the features.
X_train_features = feature_extractor.fit_transform(X_train)
X_test_features = feature_extractor.transform(X_test)

In [16]:
# Both matrices come from the same fitted vectorizer, so their column counts
# (vocabulary size) must match.
X_train_features.shape, X_test_features.shape

((4457, 7472), (1115, 7472))

In [17]:
# Display the raw training messages (the rendered Series is abbreviated in the middle).
X_train

1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: text, Length: 4457, dtype: object

In [18]:
# Print the sparse TF-IDF matrix as (row, column) -> weight entries; only the
# non-zero weights are stored, and the printed listing is abbreviated.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34794 stored elements and shape (4457, 7472)>
  Coords	Values
  (0, 1371)	0.4658046386365619
  (0, 4416)	0.4528381701109944
  (0, 1706)	0.3431839629173582
  (0, 7415)	0.348722265231364
  (0, 3210)	0.348722265231364
  (0, 4520)	0.4658046386365619
  (1, 1187)	0.26161139982801973
  (1, 3140)	0.3059746053542906
  (1, 3631)	0.2804339696184593
  (1, 6296)	0.269833648032668
  (1, 1533)	0.2015782058421696
  (1, 2661)	0.3059746053542906
  (1, 5005)	0.1937920260229529
  (1, 4292)	0.2953742837684993
  (1, 419)	0.28715203556385105
  (1, 4533)	0.3059746053542906
  (1, 6440)	0.2953742837684993
  (1, 1649)	0.3059746053542906
  (1, 0)	0.2654936554684193
  (2, 3061)	0.44585171875646595
  (2, 5770)	0.3962151014046925
  (2, 1674)	0.35156722029872034
  (2, 5351)	0.5102109014477275
  (2, 2190)	0.5102109014477275
  (3, 5048)	0.4444794309161828
  :	:
  (4451, 5002)	0.36484607066812064
  (4451, 4939)	0.36484607066812064
  (4452, 3085)	0.25923599228

In [19]:
# The label column is still object-typed after the in-place encoding, so cast
# both target vectors to integers before handing them to the classifier.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [20]:
# accuracy_score is the only new name here; LogisticRegression was already
# imported in an earlier cell, so re-importing it is redundant but harmless.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [21]:
# Logistic-regression classifier with the library's default hyper-parameters.
model = LogisticRegression()

In [22]:
# Fit the classifier on the TF-IDF features of the training split.
model.fit(X_train_features, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [23]:
# Score the model on the data it was fitted on; comparing this with the test
# accuracy gives a rough indication of over-fitting.
predictions_on_training = model.predict(X_train_features)
accuracy_on_training = accuracy_score(
    y_true=y_train,
    y_pred=predictions_on_training,
)

In [24]:
# Fraction of training messages classified correctly.
print("Accuracy on training: ", accuracy_on_training)

Accuracy on training:  0.9692618353152345


In [25]:
# Score the model on the held-out split, which it never saw during fitting.
predictions_on_testing = model.predict(X_test_features)
accuracy_on_testing = accuracy_score(
    y_true=y_test,
    y_pred=predictions_on_testing,
)

In [26]:
# Fraction of held-out messages classified correctly.
print("Accuracy on testing: ", accuracy_on_testing)

Accuracy on testing:  0.9524663677130045


In [27]:
# Classify one hand-written message.
my_input = "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize."

# The vectorizer expects an iterable of documents, hence the one-element list;
# reusing the fitted vectorizer keeps the feature columns aligned with training.
input_features = feature_extractor.transform(raw_documents=[my_input])
prediction = model.predict(X=input_features)

print("Prediction (1 for spam, 0 for ham):", prediction[0])

Prediction (1 for spam, 0 for ham): 1
