In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline

In [2]:
def text_classifier(vectorizer, classifier):
    """Chain a text vectorizer and a classifier into a single Pipeline.

    Args:
        vectorizer: first pipeline step, registered under the name "vectorizer".
        classifier: final pipeline step, registered under the name "classifier".

    Returns:
        A Pipeline that runs `vectorizer` and then `classifier`.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [3]:
# Tasks 1, 2: load the tab-separated SMS corpus. Column names are supplied
# explicitly, so the first line of the file is read as data, not as a header.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['class', 'text'],
)
df.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Task 3: encode the labels numerically (ham -> 0, spam -> 1).
# Rows are rewritten label by label, so running the cell again is a no-op.
for label, code in (('ham', 0), ('spam', 1)):
    df.loc[df['class'] == label, 'class'] = code
df['class'] = df['class'].astype(int)

In [5]:
# Class balance: number of messages per encoded label.
print(df.groupby('class')['text'].count())

class
0    4825
1     747
Name: text, dtype: int64


In [6]:
# Task 4: bag-of-words features for the whole corpus plus the label vector.
# `cnt_vectorizer` is kept around because the hand-written messages below are
# transformed with the same fitted vocabulary.
y = df['class'].to_numpy()
cnt_vectorizer = CountVectorizer()
X = cnt_vectorizer.fit_transform(df['text'])
print(X.shape)

(5572, 8713)


In [7]:
# Logistic-regression estimator shared by the cells below; the seed is pinned.
logreg = LogisticRegression(solver="lbfgs", random_state=2)

In [8]:
# Task 5, the way it should be done: the vectorizer is a pipeline step, so it is
# fitted inside cross-validation rather than once on the whole corpus.
# NOTE(review): no `scoring` is passed here, so this number comes from the
# estimator's default scorer and is not directly comparable to the F1 below.
pipeline_scores = cross_val_score(
    text_classifier(CountVectorizer(), logreg), df['text'], y, cv=10, n_jobs=-1
)
print(pipeline_scores.mean())
# Task 5, the way it has to be done to pass the task: F1 on the matrix `X`
# that was vectorized once over the full corpus.
precomputed_scores = cross_val_score(logreg, X, y, scoring="f1", cv=10, n_jobs=-1)
print(precomputed_scores.mean())

0.9825907143631246
0.9311269283144492


In [9]:
# Task 6: hand-written messages to classify with the fitted model.
test = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [10]:
# Fit the shared estimator on the count matrix `X` and labels `y`; the fitted
# model is what labels the hand-written `test` messages afterwards.
logreg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=2, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Reuse the vocabulary fitted on the corpus so the columns of `X_test` line up
# with the features `logreg` was trained on.
X_test = cnt_vectorizer.transform(test)
predicted_labels = logreg.predict(X_test)
print(' '.join(str(label) for label in predicted_labels))

1 1 0 0 0


In [12]:
# Tasks 7, 8: 10-fold F1 for each n-gram range, with the vectorizer fitted
# inside the pipeline, for logistic regression and multinomial naive Bayes.
l = [(2, 2), (3, 3), (1, 3)]
f1_cv_options = dict(scoring="f1", cv=10, n_jobs=-1)
for ll in l:
    logreg_pipeline = text_classifier(CountVectorizer(ngram_range=ll), logreg)
    nb_pipeline = text_classifier(CountVectorizer(ngram_range=ll), MultinomialNB())
    cv_score1 = cross_val_score(logreg_pipeline, df['text'], y, **f1_cv_options).mean()
    cv_score2 = cross_val_score(nb_pipeline, df['text'], y, **f1_cv_options).mean()
    print(f"Logreg, {ll}: {cv_score1}")
    print(f"MultinomialNB, {ll}: {cv_score2}")

Logreg, (2, 2): 0.816782323945987
MultinomialNB, (2, 2): 0.9330215115853413
Logreg, (3, 3): 0.7250161555467377
MultinomialNB, (3, 3): 0.871265305963816
Logreg, (1, 3): 0.9216311884303947
MultinomialNB, (1, 3): 0.9472991994136064


In [13]:
# Task 8, done the same way as task 5 (this is the variant needed to pass):
# each n-gram matrix is built once from the full corpus and then cross-validated,
# so the vocabulary also contains n-grams from the held-out folds.
for ll in l:
    # Keep the matrix in a loop-local name. Rebinding the notebook-wide `X` here
    # would leave it out of sync with `cnt_vectorizer` and the fitted `logreg`,
    # so re-running the earlier fit/predict cells would use the wrong features.
    X_ngram = CountVectorizer(ngram_range=ll).fit_transform(df['text'])
    cv_score1 = cross_val_score(logreg, X_ngram, y, scoring="f1", cv=10, n_jobs=-1).mean()
    cv_score2 = cross_val_score(MultinomialNB(), X_ngram, y, scoring="f1", cv=10, n_jobs=-1).mean()
    print(f"Logreg, {ll}: {cv_score1}")
    print(f"MultinomialNB, {ll}: {cv_score2}")

Logreg, (2, 2): 0.816782323945987
MultinomialNB, (2, 2): 0.6455015177985443
Logreg, (3, 3): 0.7250161555467377
MultinomialNB, (3, 3): 0.37871948524573595
Logreg, (1, 3): 0.9216311884303947
MultinomialNB, (1, 3): 0.8884859656061002


In [14]:
# Task 9, same two variants as task 5 but with TF-IDF features; neither call
# passes `scoring`, so both numbers come from the estimator's default scorer.
tfidf_pipeline = text_classifier(TfidfVectorizer(), logreg)
print(cross_val_score(tfidf_pipeline, df['text'], y, cv=10, n_jobs=-1).mean())

# NOTE: this rebinds the notebook-wide `X` to the TF-IDF matrix; from here on it
# no longer matches `cnt_vectorizer` or the model fitted on the count features.
X = TfidfVectorizer().fit_transform(df['text'])
tfidf_scores = cross_val_score(logreg, X, y, cv=10, n_jobs=-1)
print(tfidf_scores.mean())

0.9707463275918908
0.9650025390008258
