# Creating the model

This notebook contains code to create the model used for the API.

## Data preparation

In [1]:
import glob
import pandas as pd

truthful_files = glob.glob('/Users/jediv/repos/papis-deploy-python-model-apis/model/op_spam/*/truthful*/*/*txt')
deceptive_files = glob.glob('/Users/jediv/repos/papis-deploy-python-model-apis/model/op_spam/*/deceptive*/*/*txt')

def read_file(path):
    with open(path) as f:
        return f.read()
text = map(read_file, truthful_files + deceptive_files)

labels = [True] * len(truthful_files) + [False] * len(deceptive_files)

data = pd.DataFrame(data= list(zip(text, labels)), columns=['text','label'])
data.head()

Unnamed: 0,text,label
0,My $200 Gucci sunglasses were stolen out of my...,True
1,This was a gorgeous hotel from the outside and...,True
2,The hotel is very impressive upon entering and...,True
3,Going to the Internet Retailer 2010 at the las...,True
4,"I checked into this hotel, Rm 1760 on 11/13/20...",True


## Training the model

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(n_estimators=1000))
                     ])

from sklearn.model_selection import GridSearchCV
parameters = {"clf__n_estimators": [1000],
              "clf__max_depth": [2, 4, 10, None],
              "tfidf__stop_words": [None, 'english']}
model = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='accuracy')

model.fit(data.text.values, data.label.values)

print(model.best_params_)
print(model.best_score_)

{'tfidf__stop_words': None, 'clf__max_depth': None, 'clf__n_estimators': 1000}
0.845625


In [3]:
text_clf.set_params(**model.best_params_)
text_clf.fit(data.text.values, data.label.values)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

## Storing the model

In [4]:
import joblib
joblib.dump(text_clf, 'model.pkl', compress=5)

['model.pkl']

In [5]:
import time
start = time.time()
for _ in range(10):
    model.predict(["I will NEVER stay in this hotel again!"])
end = time.time()
print(end - start)

5.71122598648
