# Data 
The data set is a collection of tweets from BBC Health, stored one tweet per line in `bbchealth.txt`.

In [25]:
# Load the required packages
import numpy as np
import pandas as pd
import re
import csv
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, svm
from sklearn.model_selection import (
    train_test_split, learning_curve, StratifiedShuffleSplit, GridSearchCV,
    cross_val_score)

# Improve the readability of figures:
# - use seaborn's 'notebook' plotting context with font elements scaled by 1.4
sns.set_context('notebook', font_scale=1.4)
# - ask the inline backend for 'retina' (high-resolution) figure output
%config InlineBackend.figure_format = 'retina'
# - render matplotlib figures inline in the notebook
%matplotlib inline

In [26]:
# Every line of the file is one raw record: "<tweet id>|<timestamp>|<text>".
# The records are read tab-separated with no header row, so each record
# lands whole in a single column labelled 0.
df = pd.read_csv('bbchealth.txt', sep='\t', header=None)
df.head()

Unnamed: 0,0
0,585978391360221184|Thu Apr 09 01:31:50 +0000 2...
1,585947808772960257|Wed Apr 08 23:30:18 +0000 2...
2,585947807816650752|Wed Apr 08 23:30:18 +0000 2...
3,585866060991078401|Wed Apr 08 18:05:28 +0000 2...
4,585794106170839041|Wed Apr 08 13:19:33 +0000 2...


In [27]:
# Summarise the frame: number of rows, columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3929 entries, 0 to 3928
Data columns (total 1 columns):
0    3929 non-null object
dtypes: object(1)
memory usage: 30.8+ KB


We can see that we have 3929 tweets in the collection. 

However, each tweet is preceded by metadata (a tweet ID and a timestamp, separated by `|`) and ends with a link to a BBC Health article, so we have to preprocess the text before we can use it.

In [28]:
# A representative raw record, spelled out field by field:
# "<tweet id>|<timestamp>|<tweet text> <link>"
example = (
    "583659491310219264"
    "|Thu Apr 02 15:57:21 +0000 2015"
    "|Unsafe food 'growing global threat' http://bbc.in/1BREQDJ"
)

In [29]:
# Load NLTK's English stop-word list; preprocess_text() drops these terms.
# NOTE(review): presumably needs the NLTK "stopwords" corpus installed locally
# (e.g. via nltk.download('stopwords')) -- confirm for fresh environments.

stop_words = nltk.corpus.stopwords.words('english')

In [30]:
# Porter stemmer used by preprocess_text() to reduce each word to its stem
# (e.g. "growing" -> "grow" in the sample output further down)
porter = nltk.PorterStemmer()


In [31]:
def preprocess_text(messy_string, stemmer=None, stopwords=None):
    """Normalise one raw tweet record into a string of stemmed tokens.

    A raw record looks like ``<tweet id>|<timestamp>|<text> <link>``. The
    cleaning steps, in order:

    1. Replace the leading ``<tweet id>|<timestamp>|`` prefix with the
       placeholder token ``metadata``.
    2. Replace URLs (``http...`` links and bare ``name.tld...`` addresses)
       with the placeholder token ``httpaddr``.
    3. Replace every character that is neither a word character nor
       whitespace with a space.
    4. Lower-case the text and split it on whitespace.
    5. Drop stop words and stem the remaining terms.

    Parameters
    ----------
    messy_string : str
        One raw line of the tweet file.
    stemmer : object with a ``stem(term)`` method, optional
        Stemmer applied to every surviving term. Defaults to the
        notebook-level ``porter`` stemmer.
    stopwords : iterable of str, optional
        Terms to discard. They are compared against lower-cased terms, so
        they should be lower-case. Defaults to the notebook-level
        ``stop_words`` list.

    Returns
    -------
    str
        The surviving stemmed terms joined by single spaces.

    Raises
    ------
    AssertionError
        If ``messy_string`` is not a ``str``.
    """
    assert isinstance(messy_string, str), (
        'expected str, got %s' % type(messy_string).__name__)
    if stemmer is None:
        stemmer = porter
    # Build the lookup set once per call instead of once per term.
    stop_set = set(stop_words if stopwords is None else stopwords)

    # Anchored at the start and limited to the first two '|'-separated
    # fields: a greedy '.+\|' would also swallow tweet text up to the last
    # '|' whenever the message itself contains that character.
    cleaned = re.sub(r'^\s*\d+\|[^|]+\|', 'metadata ', messy_string)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',
                     cleaned)
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)

    # split() without arguments ignores leading, trailing and repeated
    # whitespace, so no separate collapse/strip pass is needed.
    return ' '.join(
        stemmer.stem(term)
        for term in cleaned.lower().split()
        if term not in stop_set
    )

In [32]:
# Sanity check: run the cleaning pipeline on the sample record
preprocess_text(example)

'metadata unsaf food grow global threat httpaddr'

In [33]:
# Column 0 holds the raw "<id>|<timestamp>|<text>" records, one per tweet
raw_text = df[0]
# Clean every record element-wise; the result keeps raw_text's index
processed = raw_text.map(preprocess_text)

## Tokenization
We tokenize the preprocessed tweets to break the corpus into a vocabulary of unique terms (unigrams and bigrams), each weighted by TF-IDF.

In [37]:
# ngram_range=(1, 2): build features from single terms and adjacent term pairs
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Learn the vocabulary from the cleaned tweets and compute the TF-IDF matrix
# (one row per tweet, one column per n-gram)
X_ngrams = vectorizer.fit_transform(processed)

In [38]:
# (number of tweets, number of n-gram features)
X_ngrams.shape

(3929, 17515)

The vectorizer extracted 17515 distinct unigrams and bigrams from the corpus. Each one defines a feature, i.e. one column of `X_ngrams`.