In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection & preprocessing

In [5]:
# Read the raw mail dataset from its CSV file into a DataFrame.
MAIL_DATA_CSV = "/content/mail_data.csv"
raw_mail_data = pd.read_csv(MAIL_DATA_CSV)

In [6]:
# Replace missing values with empty strings so later text processing never
# receives NaN. fillna returns a new DataFrame; raw_mail_data is left untouched.
mail_data = raw_mail_data.fillna("")

In [7]:
# Preview the first rows of the cleaned DataFrame.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Check the number of rows and columns.
mail_data.shape

(5572, 2)

# Label encoding
- ham = 1
- spam = 0

In [9]:
# Encode the Category column in place: spam -> 0, ham -> 1.
is_spam = mail_data["Category"] == "spam"
is_ham = mail_data["Category"] == "ham"
mail_data.loc[is_spam, "Category"] = 0
mail_data.loc[is_ham, "Category"] = 1

In [10]:
# Confirm that the Category column now holds the numeric labels.
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


# Separating data as text and label

In [11]:
# X holds the message texts, y the encoded labels (1 = ham, 0 = spam).
X, y = mail_data["Message"], mail_data["Category"]

In [12]:
# Display the message texts and their labels side by side as a tuple.
X, y

(0       Go until jurong point, crazy.. Available only ...
 1                           Ok lar... Joking wif u oni...
 2       Free entry in 2 a wkly comp to win FA Cup fina...
 3       U dun say so early hor... U c already then say...
 4       Nah I don't think he goes to usf, he lives aro...
                               ...                        
 5567    This is the 2nd time we have tried 2 contact u...
 5568                 Will ü b going to esplanade fr home?
 5569    Pity, * was in mood for that. So...any other s...
 5570    The guy did some bitching but I acted like i'd...
 5571                           Rofl. Its true to its name
 Name: Message, Length: 5572, dtype: object,
 0       1
 1       1
 2       0
 3       1
 4       1
        ..
 5567    0
 5568    1
 5569    1
 5570    1
 5571    1
 Name: Category, Length: 5572, dtype: object)

# Train test split

In [13]:
# Hold out 20% of the messages for testing; the fixed random_state makes
# the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [14]:
# Sanity-check the split sizes: full set, training part, test part.
X.shape, X_train.shape, X_test.shape

((5572,), (4457,), (1115,))

# Feature Extraction
Transform the text data into feature vectors that can be used as input to LogisticRegression.

In [32]:
# TF-IDF vectorizer: lowercases the text, drops English stop words and keeps
# every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

In [33]:
# Fit the vectorizer on the training messages only, then reuse that fitted
# vectorizer for the test messages so no test-set information leaks into it.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [19]:
# The encoded labels are stored with object dtype; cast both splits to integers.
y_train, y_test = y_train.astype("int"), y_test.astype("int")

In [21]:
# Print the stored entries of the training feature matrix as
# (row, column) value triples; the output is long.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [34]:
# Both matrices come from the same fitted vectorizer, so their column counts must match.
X_train_features.shape, X_test_features.shape

((4457, 7431), (1115, 7431))

# Train the model

In [23]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [24]:
# Fit the classifier on the TF-IDF training features and the training labels.
model.fit(X_train_features, y_train)

# Evaluation

In [35]:
# Accuracy on the training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels are passed first and the predictions second.
train_pred = model.predict(X_train_features)
train_score = accuracy_score(y_train, train_pred)
train_score

0.9670181736594121

In [36]:
# Accuracy on the held-out test data.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels are passed first and the predictions second.
test_pred = model.predict(X_test_features)
test_score = accuracy_score(y_test, test_pred)
test_score

0.9659192825112107

# Building a Predictive System

In [39]:
# Classify a single new message with the fitted vectorizer and model.
# The list is named `input_mail` rather than `input` so the Python builtin
# input() is not shadowed for the rest of the kernel session.
# NOTE(review): the sample message appears to contain a real name and student
# ID - consider redacting it before sharing the notebook.
# NOTE(review): transform() ignores tokens that were not seen during fit; the
# training messages look English-only, so confirm that this non-English text
# produces meaningful features before trusting the prediction.
input_mail = ["Em tên là Trần Bách Giang, MSSV 20202743, là sinh viên lớp Mô hình hóa EE4430E 147172 sáng thứ 6 ạ. Thưa thầy, sáng thứ 6 tuần trước nghĩ lễ (29/12) em có nghỉ 1 buổi học của thầy và bị điểm danh vắng 1 buổi không phép. Lý do là vì em bị sốt cao chưa rõ lý do. Đến ngày thứ 7 (30/12) em mới đi khám và bị chuẩn đoán cúm A. Em có đính kèm ảnh phiếu khám bệnh và kết quả trong mail này để làm minh chứng. Em viết mail này mong thầy sửa lại buổi vắng không phép của em thành vắng có phép để không ảnh hưởng đến việc đánh giá điểm quá trình ạ."]
input_feature = feature_extraction.transform(input_mail)
# Predicted label: 1 = ham, 0 = spam.
pred = model.predict(input_feature)
print(pred)

[1]
