In [118]:
from requests import get
from bs4 import BeautifulSoup
from os import path

def get_article_text(url='https://codeup.com/codeups-data-science-career-accelerator-is-here/',
                     cache_path='article.txt'):
    """Return the text of a Codeup blog article, caching it on disk.

    If ``cache_path`` already exists its contents are returned and no
    network request is made. Otherwise the page at ``url`` is downloaded,
    the text of its ``<div class="mk-single-content">`` element is
    extracted, written to ``cache_path`` and returned.

    Parameters
    ----------
    url : str
        Page to download when no cached copy exists.
    cache_path : str
        File used to cache the extracted article text.

    Returns
    -------
    str
        The article text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``mk-single-content`` div.
    """
    # if we already have the data, read it locally
    # (explicit UTF-8: the article contains non-ASCII characters, and the
    # platform default encoding is not UTF-8 everywhere; a cache written by
    # the older code on such a platform should be deleted and re-fetched)
    if path.exists(cache_path):
        with open(cache_path, encoding='utf-8') as f:
            return f.read()

    # otherwise go fetch the data
    headers = {'User-Agent': 'Codeup Ada Data Science'}
    response = get(url, headers=headers, timeout=30)
    # fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()

    # name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('div', class_='mk-single-content')
    if article is None:
        raise ValueError(
            "no <div class='mk-single-content'> found at {}".format(url))

    # Extract the text BEFORE opening the cache file: opening with 'w'
    # creates/truncates it, and a failure afterwards would leave an empty
    # cache that every later call silently returns.
    text = article.text

    # save it for next time
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(text)

    return text

In [119]:
# Fetch the article (or read the cached copy from article.txt) and display its raw text
get_article_text()

'\nThe rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Stude

In [120]:
import re
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
% matplotlib inline

seed=123

# df = pd.read_csv('spam.csv')
colnames = ['label','SMS']
df = pd.read_csv('spam.csv', skiprows=1, names=colnames, 
                 encoding='latin-1', usecols=[0,1])

In [121]:
# Class balance of the labels; dropna=False also counts rows with a missing label
df.label.value_counts(dropna=False)

ham     4825
spam     747
Name: label, dtype: int64

In [122]:
# Keep a seeded random sample of 747 rows and rebind `df` to it.
# NOTE(review): 747 equals the spam count shown above, but the sample is drawn
# from ALL rows, so it does not balance the classes — confirm the intent.
df = df.sample(747, random_state=seed)

In [123]:
# Preview the first rows of the sampled frame
df.head()

Unnamed: 0,label,SMS
3237,ham,Good. No swimsuit allowed :)
843,spam,Urgent! call 09066350750 from your landline. Y...
3521,ham,Im sorry bout last nite it wasnåÕt ur fault it...
2123,spam,+123 Congratulations - in this week's competit...
738,ham,Wish i were with you now!


In [124]:
# WordNet-based lemmatizer instance
lemmatizer = WordNetLemmatizer()
# Build the English stop-word set from the aliased corpus reader. The name
# `stopwords` is rebound to the set here (as before), so reading from
# `stopwords.words(...)` itself would raise AttributeError when the cell is re-run.
stopwords = set(nltk_stopwords.words('english'))

In [129]:
def word_extraction(msg):
    """Split a message into its lowercase alphabetic words.

    The message is lowercased and then cut at every run of characters
    outside a-z (digits, punctuation, whitespace, accented letters, ...).
    Returns the remaining pieces as a list of strings, in order; an input
    with no a-z letters yields an empty list.
    """
    lowered = msg.lower()
    # splitting leaves '' at either end when the text starts or ends with a
    # separator, so those empty pieces are filtered out
    return [piece for piece in re.split('[^a-z]+', lowered) if piece]

In [130]:
# Processing text messages: store each SMS's extracted word list in a new 'word_list' column
df['word_list'] = df['SMS'].apply(word_extraction)

In [131]:
# Preview the frame again to check the new word_list column
df.head()

Unnamed: 0,label,SMS,word_list
3237,ham,Good. No swimsuit allowed :),"[good, no, swimsuit, allowed]"
843,spam,Urgent! call 09066350750 from your landline. Y...,"[urgent, call, from, your, landline, your, com..."
3521,ham,Im sorry bout last nite it wasnåÕt ur fault it...,"[im, sorry, bout, last, nite, it, wasn, t, ur,..."
2123,spam,+123 Congratulations - in this week's competit...,"[congratulations, in, this, week, s, competiti..."
738,ham,Wish i were with you now!,"[wish, i, were, with, you, now]"


In [132]:
def tokenize(word_lists):
    """Return the sorted vocabulary of a collection of word lists.

    `word_lists` is an iterable of iterables of words. Every distinct word
    is returned exactly once, as a list in ascending sorted order.
    """
    return sorted({word for tokens in word_lists for word in tokens})

In [133]:
# Sorted list of the distinct words across every row's word_list
tokenize(df.word_list)

from sklearn.feature_extraction.text import CountVectorizer
# create the vectorizer object (unigrams only, keep every term)
vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1))

# fit and transform on the original docs: the raw SMS texts.
# (The previous input name, `orig_docs`, is not defined anywhere in this
# notebook, so the cell raised NameError on a fresh kernel.)
features = vectorizer.fit_transform(df['SMS'])

# convert to a dense matrix
features = features.todense()

# look at the vector for each of the first 10 documents
print(features[:10])

['a',
 'aaooooright',
 'aathi',
 'abiola',
 'able',
 'about',
 'absolutely',
 'abt',
 'abta',
 'ac',
 'acc',
 'accept',
 'access',
 'accidentally',
 'account',
 'achan',
 'ache',
 'acknowledgement',
 'acnt',
 'across',
 'actin',
 'activate',
 'activities',
 'actually',
 'ad',
 'added',
 'address',
 'admirer',
 'admit',
 'adp',
 'adult',
 'advance',
 'advice',
 'advise',
 'ae',
 'affairs',
 'afraid',
 'aft',
 'after',
 'afternoon',
 'afternoons',
 'aftr',
 'ag',
 'again',
 'agalla',
 'age',
 'ago',
 'ah',
 'aha',
 'ahead',
 'ahold',
 'aight',
 'air',
 'airport',
 'aiyah',
 'alaipayuthe',
 'alertfrom',
 'algorithms',
 'alive',
 'all',
 'allah',
 'allowed',
 'almost',
 'alone',
 'along',
 'already',
 'alright',
 'alrite',
 'also',
 'always',
 'am',
 'amazing',
 'ambrith',
 'amk',
 'amla',
 'amma',
 'among',
 'amount',
 'amp',
 'amt',
 'amused',
 'an',
 'and',
 'anderson',
 'angry',
 'animation',
 'announced',
 'announcement',
 'another',
 'ans',
 'answer',
 'answers',
 'antha',
 'anti',
 

In [35]:
# train test split: hold out 10% of the messages (raw SMS text and labels) for testing
# NOTE(review): random_state is hard-coded to 1 here rather than using the `seed` variable — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(df['SMS'], df['label'], test_size = 0.1, random_state = 1)

In [36]:
# training vectorizer
vectorizer = TfidfVectorizer()
# Read the raw training texts from df via y_train's index (the split gives
# X_train and y_train the same rows in the same order) instead of from
# X_train itself. X_train is rebound to the TF-IDF matrix below, so fitting
# on X_train would fail when this cell is run a second time.
X_train = vectorizer.fit_transform(df.loc[y_train.index, 'SMS'])

In [37]:
# training the classifier
# Use the SVC class directly: the name `svm` is rebound to the fitted
# classifier here (pred() relies on that), so `svm.SVC(...)` would raise
# AttributeError when this cell is re-run.
svm = SVC(C=1000)
svm.fit(X_train, y_train)

# testing against testing set
# Raw test texts are looked up through y_test's index so that this cell stays
# re-runnable after X_test has been rebound to the TF-IDF matrix.
X_test = vectorizer.transform(df.loc[y_test.index, 'SMS'])
y_pred = svm.predict(X_test)
print(confusion_matrix(y_test, y_pred))



[[490   0]
 [ 10  58]]


In [38]:
# test against new messages 
def pred(msg):
    """Predict the label of a single message string.

    Uses the notebook-level `vectorizer` and `svm` objects: the message is
    wrapped in a one-element list, transformed by the vectorizer, and the
    classifier's prediction for that single row is returned.
    """
    msg_features = vectorizer.transform([msg])
    return svm.predict(msg_features)[0]

In [39]:
# Try the classifier on a short message that is not taken from the dataset
pred('Whats up??')

'ham'