## Data prep

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

spam_data = pd.read_csv('data/spam.csv')

spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [2]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)

In [3]:
X_train

872                       I'll text you when I drop x off
831     Hi mate its RV did u hav a nice hol just a mes...
1273    network operator. The service is free. For T &...
3314    FREE MESSAGE Activate your 500 FREE Text Messa...
4929    Hi, the SEXYCHAT girls are waiting for you to ...
                              ...                        
4931                Match started.india  &lt;#&gt;  for 2
3264    44 7732584351, Do you want a New Nokia 3510i c...
1653    I was at bugis juz now wat... But now i'm walk...
2607    :-) yeah! Lol. Luckily i didn't have a starrin...
2732    How dare you stupid. I wont tell anything to y...
Name: text, Length: 4179, dtype: object

## Questions

### Question 1: What percentage of the documents in spam_data are spam?

In [4]:
def answer_one():
    return len(spam_data[spam_data["target"] == 1]) / len(spam_data) * 100

answer_one()

13.406317300789663

### Question 2: Fit the training data X_train using a Count Vectorizer with default parameters.

What is the longest token in the vocabulary?

In [5]:
def answer_two():
    from sklearn.feature_extraction.text import CountVectorizer

    vect = CountVectorizer().fit(X_train)

    return max(vect.get_feature_names(), key = len)

answer_two()

'com1win150ppmx3age16subscription'

### Question 3: Fit and transform the training data X_train using a Count Vectorizer with default parameters.

Next, fit a fit a multinomial Naive Bayes classifier model with smoothing alpha=0.1. Find the area under the curve (AUC) score using the transformed test data.



In [35]:
def answer_three():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import roc_auc_score

    vect = CountVectorizer().fit(X_train)
    X_train_vectorized = vect.transform(X_train).toarray()

    model = MultinomialNB(alpha = 0.1)
    model.fit(X_train_vectorized, y_train)

    predictions = model.predict(vect.transform(X_test).toarray())

    return roc_auc_score(y_test, predictions)

answer_three()

0.9720812182741116

### Question 4: Fit and transform the training data X_train using a Tfidf Vectorizer with default parameters.

What 20 features have the smallest tf-idf and what 20 have the largest tf-idf?

Put these features in a two series where each series is sorted by tf-idf value and then alphabetically by feature name. The index of the series should be the feature name, and the data should be the tf-idf.

The series of 20 features with smallest tf-idfs should be sorted smallest tfidf first, the list of 20 features with largest tf-idfs should be sorted largest first.

In [38]:
def answer_four():
    from sklearn.feature_extraction.text import TfidfVectorizer

    vect = TfidfVectorizer()
    X_train_vectorized = vect.fit_transform(X_train)

    feature_names = np.array(vect.get_feature_names())
    sorted_coef = X_train_vectorized.max(0).toarray()[0]
    sorted_coef_index = sorted_coef.argsort()

    #s = pd.Series(sorted_coef, index = feature_names).sort_values()
    df = pd.DataFrame([feature_names, sorted_coef]).T
    df = df.sort_values(by = [1, 0])
    df.set_index(0, inplace = True)

    return df.iloc[:20, 0], df.iloc[:-21:-1, 0]

answer_four()

(0
 aaniye          0.074475
 athletic        0.074475
 chef            0.074475
 companion       0.074475
 courageous      0.074475
 dependable      0.074475
 determined      0.074475
 exterminator    0.074475
 healer          0.074475
 listener        0.074475
 organizer       0.074475
 pest            0.074475
 psychiatrist    0.074475
 psychologist    0.074475
 pudunga         0.074475
 stylist         0.074475
 sympathetic     0.074475
 venaam          0.074475
 afternoons       0.09125
 approaching      0.09125
 Name: 1, dtype: object,
 0
 yup               1.0
 where             1.0
 too               1.0
 thanx             1.0
 thank             1.0
 okie              1.0
 ok                1.0
 nite              1.0
 lei               1.0
 home              1.0
 havent            1.0
 er                1.0
 done              1.0
 beerage           1.0
 anytime           1.0
 anything          1.0
 645               1.0
 146tf150p         1.0
 tick         0.980166
 blank      

### Question 5: Fit and transform the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 3.

Then fit a multinomial Naive Bayes classifier model with smoothing alpha=0.1 and compute the area under the curve (AUC) score using the transformed test data.

In [8]:
def answer_five():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import roc_auc_score

    vect = TfidfVectorizer(min_df = 3)
    X_train_vectorized = vect.fit_transform(X_train)

    model = MultinomialNB(alpha = 0.1)
    model.fit(X_train_vectorized, y_train)

    predictions = model.predict(vect.transform(X_test))
    
    return roc_auc_score(y_test, predictions)

answer_five()

0.9416243654822335

### What is the average length of documents (number of characters) for not spam and spam documents?

In [9]:
def answer_six():
    spam_data["len"] = spam_data["text"].apply(lambda x: len(x))
    text_lenghts = spam_data.groupby("target").agg({"len": np.average})
    return text_lenghts.iloc[0, 0], text_lenghts.iloc[1, 0]

answer_six()

(71.02362694300518, 138.8661311914324)

### Extra helper

In [10]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

### Question 7: Fit and transform the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 5.

Using this document-term matrix and an additional feature, the length of document (number of characters), fit a Support Vector Classification model with regularization C=10000. Then compute the area under the curve (AUC) score using the transformed test data.



In [11]:
def answer_seven():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.svm import SVC
    from sklearn.metrics import roc_auc_score

    vect = TfidfVectorizer(min_df = 5)
    X_train_vectorized = vect.fit_transform(X_train)
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len(x)))

    model = SVC(C = 10000)
    model.fit(X_train_vectorized, y_train)

    X_test_vectorized = vect.transform(X_test)
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len(x)))
    predictions = model.predict(X_test_vectorized)

    return roc_auc_score(y_test, predictions)

answer_seven()

0.9661689557407943

### Question 8: What is the average number of digits per document for not spam and spam documents?

In [12]:
def answer_eight():
    spam_data["digits"] = spam_data["text"].apply(lambda x: len("".join(d for d in x if d.isdigit())))
    digits = spam_data.groupby("target").agg({"digits": np.average})
    return digits.iloc[0, 0], digits.iloc[1, 0]

answer_eight()

(0.2992746113989637, 15.759036144578314)

### Question 9: Fit and transform the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 5 and using word n-grams from n=1 to n=3 (unigrams, bigrams, and trigrams).

Using this document-term matrix and the following additional features:

the length of document (number of characters)
number of digits per document
fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

In [39]:
def answer_nine():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    vect = TfidfVectorizer(min_df = 5, ngram_range = (1, 3))
    X_train_vectorized = vect.fit_transform(X_train)
    # Adding length of texts
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len(x)))
    # Adding number of digits
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len("".join(d for d in x if d.isdigit()))))

    model = LogisticRegression(C = 100)
    model.fit(X_train_vectorized, y_train)

    X_test_vectorized = vect.transform(X_test)
    # Adding length of texts
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len(x)))
    # Adding number of digits
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len("".join(d for d in x if d.isdigit()))))

    predictions = model.predict(X_test_vectorized)
    return roc_auc_score(y_test, predictions)

answer_nine()

0.9759031798040846

### Question 10: What is the average number of non-word characters (anything other than a letter, digit or underscore) per document for not spam and spam documents?

Hint: Use \w and \W character classes

In [14]:
def answer_ten():
    import re
    spam_data["non_word"] = spam_data["text"].apply(lambda x: len(re.findall(r"\W", x)))
    non_word = spam_data.groupby("target").agg({"non_word": np.average})

    return non_word.iloc[0, 0], non_word.iloc[1, 0]

answer_ten()

(17.29181347150259, 29.041499330655956)

### Question 11: Fit and transform the training data X_train using a Count Vectorizer ignoring terms that have a document frequency strictly lower than 5 and using character n-grams from n=2 to n=5.

To tell Count Vectorizer to use character n-grams pass in analyzer='char_wb' which creates character n-grams only from text inside word boundaries. This should make the model more robust to spelling mistakes.

Using this document-term matrix and the following additional features:

the length of document (number of characters)
number of digits per document
number of non-word characters (anything other than a letter, digit or underscore.)
fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

Also find the 10 smallest and 10 largest coefficients from the model and return them along with the AUC score in a tuple.

The list of 10 smallest coefficients should be sorted smallest first, the list of 10 largest coefficients should be sorted largest first.

The three features that were added to the document term matrix should have the following names should they appear in the list of coefficients: ['length_of_doc', 'digit_count', 'non_word_char_count']

In [34]:
def answer_eleven():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    import re

    ######## X_train vectorization
    vect = CountVectorizer(min_df = 5, ngram_range = (2, 5), analyzer = "char_wb")
    X_train_vectorized = vect.fit_transform(X_train).toarray()
    # Adding text length
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len(x)))
    # Adding number of digits
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len("".join(d for d in x if d.isdigit()))))
    # Adding number of non-word characters
    X_train_vectorized = add_feature(X_train_vectorized, X_train.apply(lambda x: len(re.findall(r"\W", x))))

    ######## Model
    model = LogisticRegression(C = 100)
    model.fit(X_train_vectorized, y_train)

    ######## X_test vectorization
    X_test_vectorized = vect.transform(X_test).toarray()
    # Adding text length
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len(x)))
    # Adding number of digits
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len("".join(d for d in x if d.isdigit()))))
    # Adding number of non-word characters
    X_test_vectorized = add_feature(X_test_vectorized, X_test.apply(lambda x: len(re.findall(r"\W", x))))

    ######## Predicitions
    predictions = model.predict(X_test_vectorized)

    ######## Smallest/largest coefficients
    # get the feature names as numpy array
    feature_names = np.array(vect.get_feature_names())

    # Sort the coefficients from the model
    sorted_coef_index = model.coef_[0][:-2].argsort()
    smallest = feature_names[sorted_coef_index[:10]]
    largest = feature_names[sorted_coef_index[:-11:-1]]

    return roc_auc_score(y_test, predictions), smallest, largest

answer_eleven()

(0.9809793219360643,
 array(['..', '. ', ' i', ' go', '? ', ' y', 'pe', 'ok', 'go', 'ca'],
       dtype='<U5'),
 array(['ww', 'co', 'ne', 'ia', 'ar', 'xt', 'mob', 'uk', ' ch', 'eply '],
       dtype='<U5'))