<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1">Load Data</a></span><ul class="toc-item"><li><span><a href="#Train-Test-Split" data-toc-modified-id="Train-Test-Split-1.1">Train-Test Split</a></span></li></ul></li><li><span><a href="#Clean-Data" data-toc-modified-id="Clean-Data-2">Clean Data</a></span></li><li><span><a href="#Explore" data-toc-modified-id="Explore-3">Explore</a></span></li><li><span><a href="#Simple-Model" data-toc-modified-id="Simple-Model-4">Simple Model</a></span><ul class="toc-item"><li><span><a href="#Validation-(w/-Cross-Validation)" data-toc-modified-id="Validation-(w/-Cross-Validation)-4.1">Validation (w/ Cross-Validation)</a></span></li></ul></li><li><span><a href="#Evaluate-with-Test" data-toc-modified-id="Evaluate-with-Test-5">Evaluate with Test</a></span></li></ul></div>

In [1]:
from typing import Optional
import requests
from zipfile import ZipFile
from io import BytesIO

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
def load_data(
    url: str,
    filename: str,
    zip_filename: Optional[str] = None,
    **kwargs,
) -> pd.DataFrame:
    '''Download a zip archive from `url` and read one CSV member into a DataFrame.

    The archive is kept in memory only; nothing is written to disk.

    Parameters
    ----------
    url : str
        Location of the zip archive to download.
    filename : str
        Name of the member inside the archive to parse.
    zip_filename : Optional[str]
        Not used by this implementation (the archive is never saved to disk);
        kept in the signature so callers that pass it keep working.
    **kwargs
        Forwarded unchanged to `pd.read_csv` (e.g. `sep`, `names`, `header`).

    Returns
    -------
    pd.DataFrame
        The parsed contents of `filename`.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile
        If the downloaded payload is not a zip archive.
    KeyError
        If `filename` is not a member of the archive.
    '''
    response = requests.get(url)
    # Fail here on an HTTP error instead of handing an error page to ZipFile
    response.raise_for_status()

    # Context managers close the archive and the member once parsing is done
    with ZipFile(BytesIO(response.content)) as archive:
        with archive.open(filename) as member:
            # Header handling and every other parsing option come from kwargs
            return pd.read_csv(member, **kwargs)

def load_spam_data() -> pd.DataFrame:
    '''Fetch the SMS spam detection dataset as a DataFrame.

    Dataset page: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
    The archive member is read as tab-separated text and its two columns
    are named `label` and `text`.
    '''
    return load_data(
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip',
        filename='SMSSpamCollection',
        sep='\t',
        names=['label', 'text'],
    )

In [3]:
# Pull the SMS corpus, then preview the first rows and the class balance
df = load_spam_data()
display(df.head(), df['label'].value_counts())

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


ham     4825
spam     747
Name: label, dtype: int64

## Train-Test Split

In [4]:
def get_train_test_data(
    X,
    y,
    test_size=0.2,
    random_state=27,
    stratify=None,
) -> tuple:
    '''Split features and labels into train and test partitions.

    Thin wrapper around `train_test_split`; the defaults reproduce the
    split this notebook has always used.

    Parameters
    ----------
    X
        Features to split.
    y
        Labels aligned with `X`.
    test_size
        Passed to `train_test_split` as `test_size`; defaults to 0.2.
    random_state
        Passed to `train_test_split` as `random_state`; fixed by default so
        the split is reproducible across runs.
    stratify
        Passed to `train_test_split` as `stratify`. Pass the labels (e.g. `y`)
        to keep class proportions equal in both partitions; `None` (default)
        leaves the split unstratified.

    Returns
    -------
    tuple
        `(X_train, X_test, y_train, y_test)`.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = get_train_test_data(
    X=df['text'],
    y=df['label'],
)

# Sanity check: the label proportions in train & test should look "similar".
# One loop handles both splits so the two reports cannot drift apart.
for split_index, (split_name, y_split) in enumerate(
    [('Train', y_train), ('Test', y_test)]
):
    if split_index > 0:
        print()  # Blank line between the two reports
    n_rows = y_split.shape[0]
    print(f'{split_name} (n={n_rows:,}):')
    for label, count in zip(*np.unique(y_split, return_counts=True)):
        print(f'{label}\t{count/n_rows:.2%}') # Normalize Counts

Train (n=4,457):
ham	86.90%
spam	13.10%

Test (n=1,115):
ham	85.38%
spam	14.62%


# Clean Data

> We'll "skip" this step for now and treat the data as clean enough, but real cleaning is still a #TODO

In [6]:
def get_clean_data(data):
    '''Placeholder cleaning step: hands `data` back untouched.
    '''
    # TODO: Replace this no-op with an actual cleaning process
    cleaned = data
    return cleaned

# Explore

# Simple Model

## Validation (w/ Cross-Validation)

# Evaluate with Test