# NLP Example

### Read in text

In [None]:
import nltk

# Fetch the NLTK "stopwords" corpus; the English stop-word list is read from
# it further down via nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
import pandas as pd
import string

# Shared NLP helpers used by the text-cleaning step: a Porter stemmer and the
# English stop-word list.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words("english")

# Load the tab-separated SMS data set and give its two columns readable names.
# NOTE(review): read_csv treats the first line of the file as a header here.
# If the TSV has no header row, that first message is consumed as column
# names and lost -- confirm against the data file.
data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
)
data.columns = ['label', 'body_text']

### Create feature for text message length

In [None]:
def _non_space_length(message):
    """Return the number of characters in *message* that are not spaces."""
    return len(message) - message.count(" ")


# Message length (spaces excluded) as a numeric feature.
data['body_len'] = data['body_text'].apply(_non_space_length)

data.head()

### Create feature for % of text that is punctuation

In [None]:
def count_punctuation(text: str) -> float:
    """Return the percentage of non-space characters that are punctuation.

    Punctuation means any character in ``string.punctuation``. The ratio is
    rounded to three decimals before being scaled to a percentage.

    Args:
        text: The message to analyse.

    Returns:
        A percentage in the range 0-100. Returns 0.0 when *text* is empty or
        contains only spaces, since there is nothing to measure.
    """
    non_space_chars = len(text) - text.count(" ")
    # Guard against empty / all-space messages, which would otherwise raise
    # ZeroDivisionError and abort the whole column computation.
    if non_space_chars == 0:
        return 0.0
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return round(punctuation_count / non_space_chars, 3) * 100

# Share of each message's non-space characters that are punctuation, stored
# as a percentage feature.
data['punct%'] = data['body_text'].apply(count_punctuation)

data.head()

### Evaluate created features

In [None]:
from matplotlib import pyplot
import numpy as np
%matplotlib inline

In [None]:
# 40 evenly spaced bin edges spanning 0 through 200.
bins = np.linspace(0, 200, num=40)

# Histogram of the message-length feature over those bins.
pyplot.hist(data['body_len'], bins=bins)
pyplot.title("Body Length Transform")
pyplot.show()

### Transforms

In [None]:
# Root-power transforms of the punctuation feature: plot punct% raised to
# 1/1, 1/2, ... 1/5 so the resulting distributions can be compared.
# (These are plain x**(1/i) transforms, not the full Box-Cox formula.)
# The loop variable is kept as `i` so any code that reads it after the loop
# continues to work.
for i in [1,2,3,4,5]:
    pyplot.hist(data['punct%']**(1/i), 40)
    # Label each plot with the feature and exponent actually shown, so the
    # five histograms can be told apart.
    pyplot.title("Punctuation % Transform: x^(1/{})".format(i))
    pyplot.show()


In [None]:
# Replace punct% with its fifth root. The root is written out explicitly
# instead of being read from a leftover loop variable, so this cell produces
# the same result regardless of which cells ran before it.
# NOTE: running this cell more than once applies the root repeatedly.
punct_root = 5
data["punct%"] = data['punct%'] ** (1 / punct_root)

## Analyze Texts

In [None]:
import re

def clean_text(text):
    """Normalise a message into a list of stemmed, non-stop-word tokens.

    Steps: drop every character found in ``string.punctuation``, lower-case
    the rest, split on runs of non-word characters, discard stop words and
    empty tokens, and stem what remains with the module-level stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        A list of stemmed tokens (possibly empty).
    """
    no_punct = "".join(
        char.lower() for char in text if char not in string.punctuation
    )
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at the edges when the text starts or ends with a
    # separator; skip those so the empty string never becomes a feature.
    return [
        ps.stem(token)
        for token in tokens
        if token and token not in stopwords
    ]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the messages with TF-IDF, using clean_text as the analyzer so the
# vocabulary is built from its tokens.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data["body_text"])

# Reuse data's index for the dense TF-IDF frame so the column-wise concat
# lines rows up by position even if `data` does not have a default index.
tfidf_frame = pd.DataFrame(X_tfidf.toarray(), index=data.index)
X_features = pd.concat([data['body_len'], data['punct%'], tfidf_frame], axis=1)

# The concat mixes string names ('body_len', 'punct%') with integer TF-IDF
# column labels; recent scikit-learn versions refuse to fit on frames whose
# column names are not all strings, so normalise them here.
X_features.columns = X_features.columns.astype(str)
X_features.head()

# Final Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# 5-fold cross-validation of a random forest on the engineered features,
# scored by accuracy. Both the forest and the cross-validation run are
# configured with n_jobs=-1.
k_fold = KFold(n_splits=5)
rf = RandomForestClassifier(n_jobs=-1)

cross_val_score(
    rf,
    X_features,
    data['label'],
    scoring='accuracy',
    cv=k_fold,
    n_jobs=-1,
)