# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the pre-split feature and label files back from disk, in the order
# X_train, X_test, y_train, y_test
split_paths = [
    'data/X_train.csv',
    'data/X_test.csv',
    'data/y_train.csv',
    'data/y_test.csv',
]
X_train, X_test, y_train, y_test = (pd.read_csv(path) for path in split_paths)

# Peek at the first few rows of the training features
X_train.head()

Unnamed: 0,clean_text
0,"['selected', 'stay', '1', '250', 'top', 'briti..."
1,"['love', 'daddy', 'make', 'scream', 'pleasure'..."
2,"['dare', 'stupid', 'wont', 'tell', 'anything',..."
3,"['ok', 'ill', 'send', 'ltdecimalgt', 'ok']"
4,"['ha', 'ha', 'cool', 'cool', 'chikku', 'chikku..."


### Create TF-IDF Vectors

In [2]:
# Learn the vocabulary and IDF weights from the training messages only, and
# vectorize them in the same pass. fit_transform is equivalent to calling fit
# and then transform, but avoids tokenizing the training text a second time.
tfidf_vect = TfidfVectorizer()
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])

# Re-use the already-fitted vectorizer on the test messages so the test matrix
# has the same columns as the training matrix and nothing is learned from the
# test set.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# Inspect the vocabulary the fitted vectorizer learned: a dict that maps each
# term to an integer index
learned_vocabulary = tfidf_vect.vocabulary_
learned_vocabulary

{'selected': 6360,
 'stay': 6838,
 '250': 412,
 'top': 7384,
 'british': 1599,
 'hotels': 3700,
 'nothing': 5135,
 'holiday': 3650,
 'worth': 8024,
 '350': 496,
 'claim': 1935,
 'call': 1694,
 'london': 4442,
 '02072069400': 10,
 'bx': 1671,
 '526': 608,
 'sw73ss': 7030,
 'love': 4482,
 'daddy': 2266,
 'make': 4586,
 'scream': 6318,
 'pleasure': 5580,
 'going': 3329,
 'slap': 6579,
 'ass': 1179,
 'dick': 2444,
 'dare': 2287,
 'stupid': 6936,
 'wont': 7990,
 'tell': 7148,
 'anything': 1083,
 'hear': 3543,
 'talk': 7087,
 'ok': 5232,
 'ill': 3821,
 'send': 6372,
 'ltdecimalgt': 4511,
 'ha': 3456,
 'cool': 2120,
 'chikku': 1889,
 'chikkudb': 1891,
 'huh': 3747,
 'means': 4671,
 'computational': 2056,
 'science': 6305,
 'like': 4360,
 'dat': 2298,
 'one': 5258,
 'push': 5850,
 'wifehow': 7922,
 'knew': 4200,
 'time': 7300,
 'murder': 4936,
 'exactly': 2838,
 'got': 3361,
 'new': 5041,
 'year': 8143,
 'cos': 2136,
 'yetunde': 8170,
 'said': 6235,
 'wanted': 7793,
 'surprise': 7020,
 'didnt'

In [4]:
# Look at how a single vectorized test message is stored (the first row)
first_test_row = X_test_vect[0]
first_test_row

<1x8241 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [5]:
# Expand the first vectorized test message from its sparse form into a dense array
first_test_row = X_test_vect[0]
first_test_row.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on the TF-IDF vectors
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature subsets),
# so pin the seed to make the fitted model, and the metrics reported from it,
# reproducible across kernel restarts.
RANDOM_STATE = 42

rf = RandomForestClassifier(random_state=RANDOM_STATE)
# .values.ravel() flattens the label frame into the 1-D array that fit() expects
# NOTE(review): assumes y_train holds a single label column - confirm
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Predict a label for every vectorized message in the held-out test set
y_pred = rf_model.predict(X=X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score every metric against the same 1-D 'label' column so that precision,
# recall and accuracy are all computed from identical ground truth.
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score is the fraction of predictions that match the true labels
accuracy = accuracy_score(y_true, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.818 / Accuracy: 0.976
