In [22]:
import pandas as pd
import matplotlib.pyplot as plt
#NLP library 
import re
import unicodedata
import nltk
from wordcloud import WordCloud


%matplotlib inline
import matplotlib.pyplot as plt


from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.tokenize.toktok import ToktokTokenizer
from pprint import pprint
#My libraries 
import acquire as a 
from env import get_connection

#Beautiful Soup and Webscrape
from requests import get
from bs4 import BeautifulSoup


#Modeling 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



import warnings

# .a Load the spam dataset

In [2]:
url = get_connection('spam_db')

In [3]:
query = '''
        SELECT *
        FROM spam
        '''

In [12]:
df = pd.read_sql(query, url)
df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
    

In [14]:
df['clean_text'] = df.text.apply(clean).apply(' '.join)


In [15]:
df.drop(columns='id').head()


Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [18]:
X = df.clean_text
y = df.label
X_train_val, X_test, y_train_val, y_test = \
train_test_split(X, y, 
                 test_size=0.7, 
                 random_state=142,
                 stratify=df.label)
X_train, X_validate, y_train, y_validate =\
train_test_split(X_train_val, y_train_val, 
                 test_size=0.7, 
                 random_state=142)

In [19]:
X_train.head()

3635    2 babe feel let 4get itboth try cheer upnot fi...
125                                            good stuff
1160                             sure neighbor didnt pick
3496    oh grand bit party doesnt mention cover charge...
2338                                      alright see bit
Name: clean_text, dtype: object

In [20]:
y_train.value_counts(normalize=True)


label
ham     0.878244
spam    0.121756
Name: proportion, dtype: float64

In [23]:
# Whatever transformations we apply to X_train need to be applied to X_test
for i in range(2,11):
    cv = CountVectorizer()
    X_bow = cv.fit_transform(X_train)
    tree = DecisionTreeClassifier(max_depth=i)
    tree.fit(X_bow, y_train)
    print(f'training accuracy score at max_depth={i}\
 is: {tree.score(X_bow, y_train)}')

    X_val_bow = cv.transform(X_validate)
    print(f'validation accuracy score at max_depth={i}\
 is: {tree.score(X_val_bow, y_validate)}')
    print()

training accuracy score at max_depth=2 is: 0.936127744510978
validation accuracy score at max_depth=2 is: 0.9025641025641026

training accuracy score at max_depth=3 is: 0.9520958083832335
validation accuracy score at max_depth=3 is: 0.9042735042735043

training accuracy score at max_depth=4 is: 0.9660678642714571
validation accuracy score at max_depth=4 is: 0.9196581196581196

training accuracy score at max_depth=5 is: 0.9740518962075848
validation accuracy score at max_depth=5 is: 0.9205128205128205

training accuracy score at max_depth=6 is: 0.9780439121756487
validation accuracy score at max_depth=6 is: 0.9188034188034188

training accuracy score at max_depth=7 is: 0.9840319361277445
validation accuracy score at max_depth=7 is: 0.9230769230769231

training accuracy score at max_depth=8 is: 0.9880239520958084
validation accuracy score at max_depth=8 is: 0.9162393162393162

training accuracy score at max_depth=9 is: 0.9920159680638723
validation accuracy score at max_depth=9 is: 0.927

In [25]:
X_test_bow = cv.transform(X_test)
print(f'test data accuracy score at max_depth={i} is: {tree.score(X_test_bow, y_test)}')


test data accuracy score at max_depth=10 is: 0.9207895411432966


In [26]:
# Whatever transformations we apply to X_train need to be applied to X_test
for i in range(2,11):
    tfidf = TfidfVectorizer()
    X_bow = tfidf.fit_transform(X_train)
    tree = DecisionTreeClassifier(max_depth=i)
    tree.fit(X_bow, y_train)
    print(f'training accuracy score at max_depth={i}\
 is: {tree.score(X_bow, y_train)}')

    X_val_bow = tfidf.transform(X_validate)
    print(f'validation accuracy score at max_depth={i}\
 is: {tree.score(X_val_bow, y_validate)}')
    print()

training accuracy score at max_depth=2 is: 0.9461077844311377
validation accuracy score at max_depth=2 is: 0.9085470085470085

training accuracy score at max_depth=3 is: 0.9620758483033932
validation accuracy score at max_depth=3 is: 0.9162393162393162

training accuracy score at max_depth=4 is: 0.9740518962075848
validation accuracy score at max_depth=4 is: 0.9222222222222223

training accuracy score at max_depth=5 is: 0.9820359281437125
validation accuracy score at max_depth=5 is: 0.9256410256410257

training accuracy score at max_depth=6 is: 0.9880239520958084
validation accuracy score at max_depth=6 is: 0.9282051282051282

training accuracy score at max_depth=7 is: 0.9900199600798403
validation accuracy score at max_depth=7 is: 0.9333333333333333

training accuracy score at max_depth=8 is: 0.9920159680638723
validation accuracy score at max_depth=8 is: 0.9299145299145299

training accuracy score at max_depth=9 is: 0.9940119760479041
validation accuracy score at max_depth=9 is: 0.92

In [27]:
X_test_bow = tfidf.transform(X_test)
print(f'test data accuracy score at max_depth={i} is: {tree.score(X_test_b ow, y_test)}')

test data accuracy score at max_depth=10 is: 0.9323250448602922


In [28]:
pd.Series(
    dict(
    zip(tfidf.get_feature_names_out(), 
    tree.feature_importances_))).sort_values(ascending=False).head()

call           0.388579
txt            0.208289
mobile         0.096080
unsubscribe    0.071998
rply           0.037521
dtype: float64