# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read each split from disk: the two feature frames first, then the two label frames
X_train, X_test = (
    pd.read_csv('../../../data/X_train.csv'),
    pd.read_csv('../../../data/X_test.csv'),
)
y_train, y_test = (
    pd.read_csv('../../../data/y_train.csv'),
    pd.read_csv('../../../data/y_test.csv'),
)

# Preview the first rows of the training features
# NOTE(review): the 'Unnamed: 0' column in the preview is presumably the index
# that was written out when the CSV was saved -- confirm against the export step
X_train.head()

Unnamed: 0,clean_text
0,"['ill', '3', 'mins', 'look']"
1,"['love', 'want', 'flood', 'pretty', 'pussy', '..."
2,"['dont', 'think', 'dont', 'need', 'going', 'la..."
3,"['networking', 'job']"
4,"['walked', 'moms', 'right', 'stagwood', 'pass'..."


### Create TF-IDF Vectors

In [2]:
# Build the TF-IDF vectorizer and learn its vocabulary / IDF weights from the
# training messages only; fit() returns the fitted vectorizer, so the
# construction and fitting steps can be chained
tfidf_vect = TfidfVectorizer().fit(X_train['clean_text'])

# Apply that single fitted vectorizer to both splits (training first, then
# test) so the two matrices share the same columns
X_train_vect, X_test_vect = (
    tfidf_vect.transform(split['clean_text']) for split in (X_train, X_test)
)

In [3]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned term to an integer; per the
# scikit-learn docs these integers are the terms' feature (column) indices.
# NOTE(review): this displays the entire mapping, which is long -- consider
# showing only a small sample of it.
tfidf_vect.vocabulary_

{'ill': 3794,
 'mins': 4748,
 'look': 4416,
 'love': 4450,
 'want': 7717,
 'flood': 3035,
 'pretty': 5695,
 'pussy': 5800,
 'cum': 2233,
 'dont': 2554,
 'think': 7170,
 'need': 4978,
 'going': 3316,
 'late': 4243,
 'school': 6233,
 'night': 5027,
 'especially': 2776,
 'one': 5221,
 'class': 1947,
 'missed': 4760,
 'last': 4240,
 'wednesday': 7772,
 'probably': 5726,
 'failed': 2887,
 'test': 7099,
 'friday': 3130,
 'networking': 4998,
 'job': 4017,
 'walked': 7702,
 'moms': 4813,
 'right': 6071,
 'stagwood': 6730,
 'pass': 5386,
 'winterstone': 7874,
 'left': 4284,
 'victors': 7621,
 'hill': 3591,
 'address': 888,
 'ltgt': 4483,
 'wot': 7955,
 'hi': 3581,
 'jon': 4036,
 'pete': 5459,
 'ive': 3957,
 'bin': 1438,
 'spain': 6651,
 'recently': 5921,
 'hav': 3503,
 'sum': 6903,
 'dinero': 2469,
 'bill': 1431,
 'said': 6173,
 'ur': 7539,
 'åôrents': 8154,
 'mayb': 4633,
 'interested': 3883,
 '12000pes': 277,
 'around': 1149,
 '48': 555,
 'tb': 7038,
 'james': 3975,
 'ringtone': 6078,
 'club'

In [4]:
# How are these vectors stored?
# Indexing with [0] selects the first test message; the result is still a
# one-row sparse matrix, so only its non-zero TF-IDF weights are stored.
X_test_vect[0]

<1x8165 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
# toarray() expands the sparse row into a dense NumPy array, zeros included.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest is a stochastic learner, so pin the seed: without it the
# fitted model (and every metric computed from it) changes on each
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# fit() returns the estimator itself, so rf_model and rf refer to the same
# fitted object. ravel() flattens the label frame's values into the 1-D array
# that fit() expects (assumes y_train holds a single label column -- confirm).
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
# X_test_vect was produced by the vectorizer fitted on the training set, so
# its columns line up with the features the forest was trained on.
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Pull the label column out once so that all three metrics compare the same
# 1-D series against the predictions (rather than passing the whole frame to
# some metrics and a single column to others).
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score returns the fraction of correctly classified samples, which
# is what the manual (matches / total) computation produced.
accuracy = accuracy_score(y_true, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.784 / Accuracy: 0.969
