In [1]:
# imports

import pandas as pd
import matplotlib.pyplot as plt

from scripts.predictor import Predictor, ShingleScore
from scripts.text_preprocessor import TextPreprocessor

In [2]:
# Utils for plotting graph

def plot(p, draw):
    p.figure()
    draw()
    p.show()

In [3]:
# Some constants
input_file = 'tmp/mapred-filtered-100.csv'

In [4]:
# Create a list of predictors

predictors = [Predictor(ignore_warnings=True)]
for shingle_len in range(1, 6):
    predictors.append(Predictor(shingle_len_filter=shingle_len, 
                                ignore_warnings=True))

In [5]:
# Load database

db = pd.read_csv(input_file)
db.sample(n=10)

Unnamed: 0,score,k,frequency,shingle
865,2,4,19,dont think ill buying
661,2,2,182,hot water
1984,4,5,34,hours sleep wake hour sleep
2476,5,5,110,dietary allowance rda amounts selected
2228,5,3,322,doesnt taste like
121,1,2,547,product not
964,2,5,19,ive 10 yearsobviously prove tea
2015,5,1,36682,use
1486,3,5,23,petite cuisine web site tested
1684,4,2,392,hot chocolate


In [6]:
# Add shingle scores

for idx, r in db.iterrows():
    shingle_score = ShingleScore(shingle=str(r['shingle']),
                                 score=r['score'],
                                 frequency=r['frequency'])
    for pdtr in predictors:
        pdtr.addShingleScore(shingle_score)

In [7]:
# Load test data

td = pd.read_csv('tmp/processed-30.csv')
td.sample(n=10)

Unnamed: 0,Score,Text
150400,4,left snide comment review packaging reviews in...
24931,5,wolfgang pucks hawaiian hazelnut best kcup haz...
43189,5,read genitals female primates turn bright colo...
111573,5,love lime chili almonds blue diamond great sna...
51435,3,not bad coconut water doesnt come close nirvan...
127517,5,love love love carrs ginger lemon cremesbr br ...
150212,5,perfect ladyfingers making tiramisunot soft ha...
127330,3,magical box lunch merger opening present antic...
45027,5,ive using sauce daily 15 years not wondered so...
154745,5,tried types hot chocolate purchasing keurig fa...


In [8]:
# Preprocess the test data

tp = TextPreprocessor()
td['Processed Text'] = td['Text'].apply(lambda t: tp.process(t))
td.sample(n=10)

Unnamed: 0,Score,Text,Processed Text
84937,5,chocolate lover basket nice mixture chocolatey...,chocolate lover basket nice mixture chocolatey...
136155,5,understand complaints science diet believe avo...,understand complaints science diet believe avo...
7788,5,love cereali sisters hookedi love fact good gr...,love cereali sisters hookedi love fact good gr...
109924,5,great formula great price used use vermont org...,great formula great price used use vermont org...
86241,3,experts say kcup method making coffee wonderfu...,experts say kcup method making coffee wonderfu...
123270,4,probably 1 ramen korea enjoy aware nutrition f...,probably 1 ramen korea enjoy aware nutrition f...
29291,5,excellent product fast delivery excellent pric...,excellent product fast delivery excellent pric...
149646,5,spent weeks looking locally good cinnamon oil ...,spent weeks looking locally good cinnamon oil ...
127680,5,great snackhas good flaver spicyi add little h...,great snackhas good flaver spicyi add little h...
133233,1,tried based positive reviews disappointed just...,tried based positive reviews disappointed just...


In [9]:
# Test the predictors

for idx, pdtr in enumerate(predictors):
    print('Testing Predictor {}...'.format(idx))
    td['Predictor {} Score'.format(idx)] = td['Processed Text'].apply(lambda t: pdtr.predict(t))

Testing Predictor 0...
Testing Predictor 1...
Testing Predictor 2...
Testing Predictor 3...
Testing Predictor 4...
Testing Predictor 5...


In [10]:
td.sample(n=10)

Unnamed: 0,Score,Text,Processed Text,Predictor 0 Score,Predictor 1 Score,Predictor 2 Score,Predictor 3 Score,Predictor 4 Score,Predictor 5 Score
5794,5,lullaby lemon sleep shot 25 ounce 12 pack misc...,lullaby lemon sleep shot 25 ounce 12 pack misc...,4.06358,4.139697,3.987463,3.0,3.0,3.0
114456,5,cat loved itthough dont know sealed smell box ...,cat loved itthough dont know sealed smell box ...,3.747273,3.864292,3.630253,3.0,3.0,3.0
118035,5,chips arrived perfect condition having doubleb...,chips arrived perfect condition having doubleb...,4.392473,4.392473,3.0,3.0,3.0,3.0
33505,4,espresso pods produce good taste nice crema ba...,espresso pods produce good taste nice crema ba...,4.267826,4.116856,4.418795,3.0,3.0,3.0
54531,5,product sat pantry couple weeks decided use hu...,product sat pantry couple weeks decided use hu...,3.759701,3.902181,3.617222,3.0,3.0,3.0
123515,5,purchasing hrefhttpwwwamazoncomgpproductb0001k...,purchasing hrefhttpwwwamazoncomgpproductb0001k...,3.882407,4.14249,3.622323,3.0,3.0,3.0
111919,4,good texture discerning tot approves 4yo enjoy...,good texture discerning tot approves 4yo enjoy...,4.201138,4.201138,3.0,3.0,3.0,3.0
131425,5,nutritionists said youre going eat cereal brea...,nutritionists said youre going eat cereal brea...,4.185885,4.069329,4.302441,3.0,3.0,3.0
114632,3,righti probably liked chips ahoy cookies child...,righti probably liked chips ahoy cookies child...,2.731313,4.0886,1.374026,3.0,3.0,3.0
77375,3,formula works better son horrible spit regular...,formula works better son horrible spit regular...,4.091323,4.091323,3.0,3.0,3.0,3.0


In [11]:
tdd = td.drop(['Text', 'Processed Text'], axis=1)
tdd.sample(n=10)

Unnamed: 0,Score,Predictor 0 Score,Predictor 1 Score,Predictor 2 Score,Predictor 3 Score,Predictor 4 Score,Predictor 5 Score
49984,2,4.177157,4.027105,4.32721,3.0,3.0,3.0
69186,5,4.341981,4.233226,4.450736,3.0,3.0,3.0
134223,4,4.28603,4.28603,3.0,3.0,3.0,3.0
154963,5,4.412546,4.046512,4.778579,3.0,3.0,3.0
134835,5,4.660402,4.320803,5.0,3.0,3.0,3.0
73991,2,3.424373,3.848745,3.0,3.0,3.0,3.0
103647,5,3.96907,3.96907,3.0,3.0,3.0,3.0
146154,5,3.910141,4.141061,4.239966,3.349398,3.0,3.0
26894,5,3.998626,3.998626,3.0,3.0,3.0,3.0
120830,5,4.159343,4.171344,4.147342,3.0,3.0,3.0
