In [1]:
# imports

import pandas as pd
import matplotlib.pyplot as plt

from scripts.predictor import Predictor, ShingleScore
from scripts.text_preprocessor import TextPreprocessor

In [2]:
# Utils for plotting graph

def plot(p, draw):
    p.figure()
    draw()
    p.show()

In [3]:
# Create a list of predictors

predictors = [Predictor(ignore_warnings=True)]
for shingle_len in range(1, 6):
    predictors.append(Predictor(shingle_len_filter=shingle_len, 
                                ignore_warnings=True))

In [4]:
# Load database

db = pd.read_csv('tmp/mapred-filtered-100.csv')
db.sample(n=10)

Unnamed: 0,score,k,frequency,shingle
816,2,4,21,increases milk production yeah
330,1,4,124,bored thought little nutty
2266,5,3,229,thought id try
1911,4,5,37,brands chamomile tea bags dont
2209,5,3,567,goes long way
405,1,5,124,58 cup total eats morning
1913,4,5,37,chamomile herbal tea tea bags
512,2,1,3632,food
942,2,5,19,couple days zero effect stuffy
808,2,4,21,3 weeks aside doesnt


In [5]:
# Add shingle scores

for idx, r in db.iterrows():
    shingle_score = ShingleScore(shingle=r['shingle'],
                                 score=r['score'],
                                 frequency=r['frequency'])
    for pdtr in predictors:
        pdtr.addShingleScore(shingle_score)

In [6]:
# Load test data

td = pd.read_csv('tmp/processed-30.csv')
td.sample(n=10)

Unnamed: 0,Score,Text
147815,5,read dark chocolate high cocoa content healthy...
150153,5,buying toasted southern pecan years recently s...
83278,5,im not youd big starbucks fan far stores conce...
119957,5,received tree days ordered itit arrived perfec...
126306,5,onion power fine like baby powder great flavor...
26966,5,just fostered dog days ago loves toyit definit...
93163,1,95 ounce jar jelly works 147 ounce today gold ...
10936,4,switch ground really enjoyed bean cafe altura ...
142062,5,wasnt 3lb ordered scrodcod smaller size fisher...
121137,5,good stuffs stuffs say hawaiibr br hard grocer...


In [7]:
# Preprocess the test data

tp = TextPreprocessor()
td['Processed Text'] = td['Text'].apply(lambda t: tp.process(t))
td.sample(n=10)

Unnamed: 0,Score,Text,Processed Text
27220,5,using stevita saw product added fiber im looki...,using stevita saw product added fiber im looki...
166120,5,bought food adopted cat strayed brand purchase...,bought food adopted cat strayed brand purchase...
79178,5,using product years live kitchenit adds wonder...,using product years live kitchenit adds wonder...
64323,4,did googling kitten nutrition food feeding kit...,did googling kitten nutrition food feeding kit...
390,5,use frequently hard locally great baked goodsi...,use frequently hard locally great baked goodsi...
17294,5,gone vet got dog absolutely loves foul smell d...,gone vet got dog absolutely loves foul smell d...
42911,1,returned item contains palm oilbad body bad pl...,returned item contains palm oilbad body bad pl...
160769,1,contains soy wish mentioned details soy negati...,contains soy wish mentioned details soy negati...
51735,5,product turned great baby finally eats baby ce...,product turned great baby finally eats baby ce...
22615,5,good love fact just dry roosted added definite...,good love fact just dry roosted added definite...


In [8]:
# Test the predictors

for idx, pdtr in enumerate(predictors):
    print('Testing Predictor {}...'.format(idx))
    td['Predictor {} Score'.format(idx)] = td['Processed Text'].apply(lambda t: pdtr.predict(t))

Testing Predictor 0...
Testing Predictor 1...
Testing Predictor 2...
Testing Predictor 3...
Testing Predictor 4...
Testing Predictor 5...


In [12]:
td.sample(n=5)

Unnamed: 0,Score,Text,Processed Text,Predictor 0 Score,Predictor 1 Score,Predictor 2 Score,Predictor 3 Score,Predictor 4 Score,Predictor 5 Score
82745,5,ate chips jet blue flighti immediately came ho...,ate chips jet blue flighti immediately came ho...,4.002801,4.002801,3.0,3.0,3.0,3.0
94225,5,cocktails luxe treatment luxardo maraschino ch...,cocktails luxe treatment luxardo maraschino ch...,4.193809,4.098555,4.289062,3.0,3.0,3.0
150700,5,ordered yesterday arrived today delivered nice...,ordered yesterday arrived today delivered nice...,4.007223,4.007223,3.0,3.0,3.0,3.0
132208,5,dehydrated drinking bottle water not helpingi ...,dehydrated drinking bottle water not helpingi ...,3.850361,3.850361,3.0,3.0,3.0,3.0
113308,5,bit needed order id hard white peppercorns loc...,bit needed order id hard white peppercorns loc...,4.094961,4.099431,4.090492,3.0,3.0,3.0
