# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['go', 'until', 'jurong', 'point', 'crazy', 'a..."
1,"['ok', 'lar', 'joking', 'wif', 'oni']"
2,"['free', 'entry', 'in', 'wkly', 'comp', 'to', ..."
3,"['dun', 'say', 'so', 'early', 'hor', 'already'..."
4,"['nah', 'don', 'think', 'he', 'goes', 'to', 'u..."


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'go': 2397,
 'until': 6345,
 'jurong': 3109,
 'point': 4476,
 'crazy': 1292,
 'available': 412,
 'only': 4151,
 'in': 2866,
 'bugis': 794,
 'great': 2467,
 'world': 6751,
 'la': 3218,
 'buffet': 792,
 'cine': 1049,
 'there': 6001,
 'got': 2434,
 'amore': 198,
 'wat': 6544,
 'ok': 4121,
 'lar': 3248,
 'joking': 3078,
 'wif': 6658,
 'oni': 4147,
 'free': 2224,
 'entry': 1854,
 'wkly': 6713,
 'comp': 1149,
 'to': 6109,
 'win': 6671,
 'fa': 1978,
 'cup': 1346,
 'final': 2089,
 'tkts': 6097,
 'st': 5612,
 'may': 3614,
 'text': 5960,
 'receive': 4816,
 'question': 4723,
 'std': 5651,
 'txt': 6264,
 'rate': 4771,
 'apply': 278,
 'over': 4226,
 'dun': 1718,
 'say': 5113,
 'so': 5464,
 'early': 1736,
 'hor': 2718,
 'already': 172,
 'then': 5997,
 'nah': 3888,
 'don': 1634,
 'think': 6017,
 'he': 2596,
 'goes': 2403,
 'usf': 6384,
 'lives': 3388,
 'around': 326,
 'here': 2633,
 'though': 6036,
 'freemsg': 2229,
 'hey': 2643,
 'darling': 1397,
 'it': 2984,
 'been': 546,
 'week': 6586,
 'now': 40

In [16]:
# How are these vectors stored?
count = 0 
for i in X_test_vect : 
    print( i )
    count = count + 1 
    
print(count)

  (0, 35)	0.39924327232492607
  (0, 1460)	0.3854117091705019
  (0, 1531)	0.3367746085764107
  (0, 2531)	0.39924327232492607
  (0, 2866)	0.1409520756925724
  (0, 3832)	0.23413944612706034
  (0, 4701)	0.2973024083877418
  (0, 5065)	0.3130966659516033
  (0, 5349)	0.3585057534149899
  (0, 5464)	0.1720956393680722
  (0, 371)	0.13313933337491193
  (0, 604)	0.3015830254758794
  (0, 859)	0.23454539209064654
  (0, 874)	0.2764089683384564
  (0, 1380)	0.28303061150002445
  (0, 1405)	0.2407867334800146
  (0, 2224)	0.3026066212556698
  (0, 2292)	0.2764089683384564
  (0, 2382)	0.2764089683384564
  (0, 2397)	0.14711156988948149
  (0, 2973)	0.10843695153447663
  (0, 3755)	0.1749638784664759
  (0, 3832)	0.17686580448818992
  (0, 5193)	0.15648565958770433
  (0, 5682)	0.17143503809337887
  (0, 6025)	0.14051862565267126
  (0, 6109)	0.31806625055822013
  (0, 6287)	0.194013813748427
  (0, 6601)	0.25123491120103336
  (0, 6871)	0.08127115529169548
  (0, 15)	0.13228890523207038
  (0, 104)	0.14868434831062324
 

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

# y_train are labels in the format of pandas dataframe column 
# but scikit-learn needs it in array format therefore y_train.values.ravel() 

In [7]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict( X_test_vect ) 

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.828 / Accuracy: 0.978


In [9]:
print( len(y_test))
print( len(y_pred))

1115
1115
