In [1]:
import numpy as np
from collections import defaultdict
import nltk 
import pandas as pd
import os 
import json
import io
import gzip

The code shown below is from DSC3 Datathon

In [2]:
import random
import ast

References:
- random sampler fast: http://metadatascience.com/2014/02/27/random-sampling-from-very-large-files/
- decode the dict: https://stackoverflow.com/questions/49184578/how-to-convert-bytes-type-to-dictionary
- stops: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [3]:
#have a random sample so it's more representative of the population
random.seed(42)
def random_sampler(filename, k):
    sample = []
    with open(filename, 'rb') as f:
        f.seek(0, 2)
        filesize = f.tell()
        random_set = sorted(random.sample(range(filesize), k))

        for i in range(k):
            f.seek(random_set[i])
            # Skip current line (because we might be in the middle of a line) 
            f.readline()
            # Append the next line to the sample set 
            val = ast.literal_eval(f.readline().rstrip().decode('utf-8'))
            sample.append(val)

    return sample

In [4]:
places2 = random_sampler("places.clean.json", 2000)


In [5]:
%%time
#parsing in the data, takes time
places = random_sampler("places.clean.json", 200000)
users = random_sampler("users.clean.json", 200000)
reviews = random_sampler("reviews.clean.json", 200000)

Wall time: 3min 43s


In [6]:
#making the dict into dataframes
placesdf = pd.DataFrame(places)
usersdf = pd.DataFrame(users)
reviewsdf = pd.DataFrame(reviews)

In [7]:
full = placesdf.merge(reviewsdf,how = "outer")

In [8]:
len(full)

389201

In [120]:
#get rid of rows without address
df = full[full["address"].notna()]
df = df[df["rating"].notna()]
df = df[df["reviewText"].notna()]
#df2 = df[df["price"].notna()]#

In [121]:
len(df)

9527

In [122]:
df.head()

Unnamed: 0,address,closed,gPlusPlaceId,gps,hours,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime
9,"[Naunynstraße 60, 10997 Berlin, Germany]",False,103592273547758141036,"[52.502139, 13.420115]",,Trinkteufel,030 6147128,,[Pub],107854905670749394534,5.0,Punky and Funky. Be careful if you are normal.,"Sep 3, 2012",Alex Parasense,1346682000.0
23,"[6835 State Road 54, New Port Richey, FL 34653]",False,112761994004427009119,"[28.217362, -82.700674]","[[Monday, [['8:00 am--6:00 pm']]], [Tuesday, [...",New Port Richey Florist,(727) 849-2222,,,118059946268191865657,5.0,Joanne opened the shop during Memorial Day wee...,,Jennifer Shorter,
40,"[San Martín 3400, 3400 Corrientes, Argentina]",False,118235232758603031011,"[-27.471143, -58.82315]",,General Belgrano Bridge,03783 42-7200,,[Bridge],105611792260212147994,2.0,La dirección proporcionada por Google Maps no ...,"Dec 16, 2013",julio taborda,1387228000.0
42,"[San Martín 3400, 3400 Corrientes, Argentina]",False,118235232758603031011,"[-27.471143, -58.82315]",,General Belgrano Bridge,03783 42-7200,,[Bridge],105611792260212147994,2.0,La dirección proporcionada por Google Maps no ...,"Dec 16, 2013",julio taborda,1387228000.0
89,"[3212 W 23rd St #2, Panama City, FL 32405]",False,100322374440868401385,"[30.190242, -85.703636]","[[Monday, [['Closed']]], [Tuesday, [['11:00 am...",Salon du Soleil,(850) 215-6699,,[Beauty Salon],102696293482965494050,5.0,Salon du Soleil has been my spa of choice for ...,"Apr 17, 2013",Monica Rennspies,1366229000.0


In [123]:
df.to_csv("final_dataset_untranslated.csv")

Now we have a df with the rating and the values, we're going to translate into english, but there's no need to translate the already english ones, so I'm removing the reviews with english stop words and only translating the ones without english stop words.

In [124]:
import nltk
from nltk.corpus import stopwords
common_words = ["Good", "good", "ok", "best", "awesome", "Awesome", "service"]
stops = set(stopwords.words('english')).union(set(common_words))


In [125]:
#scoring english based on how many english stop words there are
df["english score"] = df["reviewText"].apply(lambda x: sum([int(term.lower() in stops) for term in x.split()]))

In [126]:
from langdetect import detect

In [127]:
from collections.abc import Iterable

In [128]:
isinstance(np.nan, Iterable)
str(np.nan)

'nan'

In [129]:
def remove_from_list(x):
    """function to remove lists from the dataframe, making the address into
    a string instead"""
    if isinstance(x, Iterable):
        return " ".join(x)
    else:
        return str(x)

In [130]:
def remove_from_list_geo(x):
    """a modified remove_from_list to modify geo data 
    to get long and lat into a tuple"""
    if isinstance(x, Iterable):
        return tuple(x)
    else:
        return str(x)

In [131]:
"".join(["hi this is a list"])

'hi this is a list'

In [132]:
#df["address"] = 
df["address"] = df["address"].apply(remove_from_list)

In [133]:
df["gps"] = df["gps"].apply(remove_from_list_geo)

In [134]:
#df["categories"] = 
df["categories"]= df["categories"].apply(remove_from_list)

In [135]:
clean = df.drop(["hours"], axis =1)

In [136]:
new_df= clean.drop_duplicates()

In [137]:
new_df.head(2)

Unnamed: 0,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime,english score
9,Naunynstraße 60 10997 Berlin Germany,False,103592273547758141036,"(52.502139, 13.420115)",Trinkteufel,030 6147128,,Pub,107854905670749394534,5.0,Punky and Funky. Be careful if you are normal.,"Sep 3, 2012",Alex Parasense,1346682000.0,5
23,"6835 State Road 54 New Port Richey, FL 34653",False,112761994004427009119,"(28.217362, -82.700674)",New Port Richey Florist,(727) 849-2222,,,118059946268191865657,5.0,Joanne opened the shop during Memorial Day wee...,,Jennifer Shorter,,15


In [165]:
len(new_df)

9041

In [138]:
potentially_nonenglish = new_df[new_df["english score"] <= 1]

In [139]:
potentially_nonenglish.head(10)

Unnamed: 0,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime,english score
40,San Martín 3400 3400 Corrientes Argentina,False,118235232758603031011,"(-27.471143, -58.82315)",General Belgrano Bridge,03783 42-7200,,Bridge,105611792260212147994,2.0,La dirección proporcionada por Google Maps no ...,"Dec 16, 2013",julio taborda,1387228000.0,1
123,Mueang Khon Kaen District Khon Kaen Thailand,False,101466126288249455760,"(16.413289, 102.838298)",บึงแก่นนคร,,,Tourist Attraction,102595007700026792843,3.0,สามดาวพอนะ เพราะบางที เเม่งขี้เดา พากู หลงเส้น...,"Oct 3, 2013",Nitchkarn Seebunruang,1380786000.0,0
126,"Đường Không Tên Giang Điền, Trảng Bom Đồng Nai...",False,101927452218854067749,"(10.912831, 106.986051)",Khu Du lịch Sinh thái thác Giang Điền,061 3923 930,,Du Lịch - Du Lịch Sinh Thái,110208324157728215085,5.0,Tot qua,"Jul 30, 2013",Tram Phan,1375188000.0,0
173,"W Tropicana Ave Las Vegas, NV 89147",False,101762032948767959677,"(36.100717, -115.302022)",In-N-Out Burger,(800) 786-1000,$$$,Hamburger Restaurant Fast Food Restaurant Beve...,114543247800188081386,5.0,Awesome stuff!,"Jul 27, 2013",Samantha Walker,1374959000.0,1
307,"Rua Camboriú, 647 SC 88301-450, Brazil",False,112310896318623723556,"(-26.913699, -48.655571)",Beth Bistro e Restaurante,(47) 3444-5516,,,114907615512867408757,5.0,"Ambiente muito agradável, ótimo atendimento e ...",,André A.,,0
324,H.L.M 4 Dakar Senegal,False,111510704075044006045,"(14.706486, -17.442727)",Ecole HLM4 C/D,,,School,112735649947939669150,5.0,J adore,"Sep 3, 2013",Ndeye Gnima Mandiang,1378247000.0,0
397,"Via Lavoratori Autobianchi, 1 20033 Desio MB I...",False,111532397928835458938,"(45.62842, 9.211563)",Eurotaverna,0362 300046,,Restaurant,115741219409509291006,5.0,È molto bello io ci vado sempre,"Apr 14, 2013",Camilla Casorati,1365963000.0,0
560,"Al Rawdah, Al Faisaliyah Jeddah 23442 Saudi Ar...",False,116378342621164987320,"(21.567553, 39.180532)",شيزان,02 663 5180,,Indian Restaurant,106494967655449429160,5.0,فيه شباب ولا بس عوائل,"Sep 21, 2013",alwaleed bawghash,1379770000.0,0
608,"Japan 〒163-0248 Tokyo, Shinjuku, Nishishinjuku...",False,107255972992272619664,"(35.691296, 139.692623)",平和祈念展示資料館,03-5323-8709,,,102881488089763968454,4.0,戦渦の、そして戦争直後に、日本人が辿ってきた過酷な日々と筆舌しがたい労苦を今に語り継ぐ資料館...,,Hiroaki Kaneko,,0
624,"15 Đường Số 20 Linh Chiểu, Thủ Đức Hồ Chí Minh...",False,107631372203364761720,"(10.856377, 106.76408)",Quán Cà Phê Hoa Hồng,,,Giải Trí - Café,111475335699271889477,2.0,Quán uống dở ẹc toàn khói thuốc.,"Nov 17, 2012",minh khue pham,1353177000.0,0


In [73]:
len(potentially_nonenglish)


2465

In [74]:
from nltk.corpus import words
word_list = words.words()

In [76]:
to_translate = potentially_nonenglish["reviewText"]

Using Google's API to translate text.

In [34]:
import google.cloud.translate 

https://stackoverflow.com/questions/45501082/set-google-application-credentials-in-python-project-to-use-google-api

In [35]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="Datathon-ad4a6dac1782.json"

In [42]:
# Imports the Google Cloud client library
from google.cloud import translate

# Instantiates a client
translate_client = translate.Client()

# The text to translate
text = u'很多料加在同一碗粥裡，賣的很貴，但味道根本不搭。'
# The target language
target = 'en'

# Translates some text into Russian
#translation = translate_client.translate(
 #   text,
  #  target_language=target)

#print(u'Text: {}'.format(text))
#print(u'Translation: {}'.format(translation['translatedText']))

In [43]:
def translate(text):
    translation = translate_client.translate(
        text,
        target_language='en')
    return u'Translation: {}'.format(translation['translatedText'])

In [85]:
to_translate_small = to_translate.iloc[:100]
to_translated_rest = to_translate.iloc[100:]

In [79]:
translated = []

In [86]:
translated_2 = []

In [82]:
import time

for foreign in to_translated_rest:
    translated_2.append(translate(foreign))
    time.sleep(0.002)
    

In [84]:
translated

['Translation: The address provided by Google Maps is not accurate. The nearest location would be Avenida Costanera Gral. San Martin intersection with Pedro Ferre Avenue',
 'Translation: The address provided by Google Maps is not accurate. The nearest location would be Avenida Costanera Gral. San Martin intersection with Pedro Ferre Avenue',
 'Translation: Three stars enough, because sometimes Sam Ngao Guang Pa Koo Long, the route of the bus',
 'Translation: Tot through',
 'Translation: Awesome stuff!',
 'Translation: Very nice atmosphere, great service and great food.',
 'Translation: I just love it',
 'Translation: It is very beautiful I always go there',
 'Translation: There are young people, but not families',
 'Translation: It is a museum that tells us about the hard days that Japanese people have followed and the hardships that it is difficult to write down after the war and immediately after the war. This library is roughly organized in three parts. Life in the battlefield of th

In [104]:
all_translated = pd.Series(translated + translated_2)

In [110]:
#all_translated.to_csv("translated.csv")
all_reviews = all_translated[1:]

In [152]:
reviews3 = all_reviews.reset_index().drop("index", axis = 1)

In [153]:
reviews3.head()

Unnamed: 0,0
0,Translation: The address provided by Google Ma...
1,"Translation: Three stars enough, because somet..."
2,Translation: Tot through
3,Translation: Awesome stuff!
4,"Translation: Very nice atmosphere, great servi..."


In [143]:
noneng = potentially_nonenglish.reset_index() #, all_reviews], ignore_index = True)
noneng.head(5)

Unnamed: 0,index,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime,english score
0,40,San Martín 3400 3400 Corrientes Argentina,False,118235232758603031011,"(-27.471143, -58.82315)",General Belgrano Bridge,03783 42-7200,,Bridge,105611792260212147994,2.0,La dirección proporcionada por Google Maps no ...,"Dec 16, 2013",julio taborda,1387228000.0,1
1,123,Mueang Khon Kaen District Khon Kaen Thailand,False,101466126288249455760,"(16.413289, 102.838298)",บึงแก่นนคร,,,Tourist Attraction,102595007700026792843,3.0,สามดาวพอนะ เพราะบางที เเม่งขี้เดา พากู หลงเส้น...,"Oct 3, 2013",Nitchkarn Seebunruang,1380786000.0,0
2,126,"Đường Không Tên Giang Điền, Trảng Bom Đồng Nai...",False,101927452218854067749,"(10.912831, 106.986051)",Khu Du lịch Sinh thái thác Giang Điền,061 3923 930,,Du Lịch - Du Lịch Sinh Thái,110208324157728215085,5.0,Tot qua,"Jul 30, 2013",Tram Phan,1375188000.0,0
3,173,"W Tropicana Ave Las Vegas, NV 89147",False,101762032948767959677,"(36.100717, -115.302022)",In-N-Out Burger,(800) 786-1000,$$$,Hamburger Restaurant Fast Food Restaurant Beve...,114543247800188081386,5.0,Awesome stuff!,"Jul 27, 2013",Samantha Walker,1374959000.0,1
4,307,"Rua Camboriú, 647 SC 88301-450, Brazil",False,112310896318623723556,"(-26.913699, -48.655571)",Beth Bistro e Restaurante,(47) 3444-5516,,,114907615512867408757,5.0,"Ambiente muito agradável, ótimo atendimento e ...",,André A.,,0


In [154]:
noneng["translation"] = reviews3

In [169]:
translating = noneng.set_index("index")
len(translating)

2465

In [170]:
english = new_df[new_df["english score"]>1] #untranslated
translating #df2 translated

Unnamed: 0_level_0,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime,english score,translation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
40,San Martín 3400 3400 Corrientes Argentina,False,118235232758603031011,"(-27.471143, -58.82315)",General Belgrano Bridge,03783 42-7200,,Bridge,105611792260212147994,2.0,La dirección proporcionada por Google Maps no ...,"Dec 16, 2013",julio taborda,1.387228e+09,1,Translation: The address provided by Google Ma...
123,Mueang Khon Kaen District Khon Kaen Thailand,False,101466126288249455760,"(16.413289, 102.838298)",บึงแก่นนคร,,,Tourist Attraction,102595007700026792843,3.0,สามดาวพอนะ เพราะบางที เเม่งขี้เดา พากู หลงเส้น...,"Oct 3, 2013",Nitchkarn Seebunruang,1.380786e+09,0,"Translation: Three stars enough, because somet..."
126,"Đường Không Tên Giang Điền, Trảng Bom Đồng Nai...",False,101927452218854067749,"(10.912831, 106.986051)",Khu Du lịch Sinh thái thác Giang Điền,061 3923 930,,Du Lịch - Du Lịch Sinh Thái,110208324157728215085,5.0,Tot qua,"Jul 30, 2013",Tram Phan,1.375188e+09,0,Translation: Tot through
173,"W Tropicana Ave Las Vegas, NV 89147",False,101762032948767959677,"(36.100717, -115.302022)",In-N-Out Burger,(800) 786-1000,$$$,Hamburger Restaurant Fast Food Restaurant Beve...,114543247800188081386,5.0,Awesome stuff!,"Jul 27, 2013",Samantha Walker,1.374959e+09,1,Translation: Awesome stuff!
307,"Rua Camboriú, 647 SC 88301-450, Brazil",False,112310896318623723556,"(-26.913699, -48.655571)",Beth Bistro e Restaurante,(47) 3444-5516,,,114907615512867408757,5.0,"Ambiente muito agradável, ótimo atendimento e ...",,André A.,,0,"Translation: Very nice atmosphere, great servi..."
324,H.L.M 4 Dakar Senegal,False,111510704075044006045,"(14.706486, -17.442727)",Ecole HLM4 C/D,,,School,112735649947939669150,5.0,J adore,"Sep 3, 2013",Ndeye Gnima Mandiang,1.378247e+09,0,Translation: I just love it
397,"Via Lavoratori Autobianchi, 1 20033 Desio MB I...",False,111532397928835458938,"(45.62842, 9.211563)",Eurotaverna,0362 300046,,Restaurant,115741219409509291006,5.0,È molto bello io ci vado sempre,"Apr 14, 2013",Camilla Casorati,1.365963e+09,0,Translation: It is very beautiful I always go ...
560,"Al Rawdah, Al Faisaliyah Jeddah 23442 Saudi Ar...",False,116378342621164987320,"(21.567553, 39.180532)",شيزان,02 663 5180,,Indian Restaurant,106494967655449429160,5.0,فيه شباب ولا بس عوائل,"Sep 21, 2013",alwaleed bawghash,1.379770e+09,0,"Translation: There are young people, but not f..."
608,"Japan 〒163-0248 Tokyo, Shinjuku, Nishishinjuku...",False,107255972992272619664,"(35.691296, 139.692623)",平和祈念展示資料館,03-5323-8709,,,102881488089763968454,4.0,戦渦の、そして戦争直後に、日本人が辿ってきた過酷な日々と筆舌しがたい労苦を今に語り継ぐ資料館...,,Hiroaki Kaneko,,0,Translation: It is a museum that tells us abou...
624,"15 Đường Số 20 Linh Chiểu, Thủ Đức Hồ Chí Minh...",False,107631372203364761720,"(10.856377, 106.76408)",Quán Cà Phê Hoa Hồng,,,Giải Trí - Café,111475335699271889477,2.0,Quán uống dở ẹc toàn khói thuốc.,"Nov 17, 2012",minh khue pham,1.353177e+09,0,Translation: Stop drinking all the smoke.


In [171]:
final = pd.concat([english, translating], sort = False)

In [173]:
final.to_csv("translated_final_df.csv")

In [178]:
final

Unnamed: 0,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime,english score,translation
9,Naunynstraße 60 10997 Berlin Germany,False,103592273547758141036,"(52.502139, 13.420115)",Trinkteufel,030 6147128,,Pub,107854905670749394534,5.0,Punky and Funky. Be careful if you are normal.,"Sep 3, 2012",Alex Parasense,1.346682e+09,5,
23,"6835 State Road 54 New Port Richey, FL 34653",False,112761994004427009119,"(28.217362, -82.700674)",New Port Richey Florist,(727) 849-2222,,,118059946268191865657,5.0,Joanne opened the shop during Memorial Day wee...,,Jennifer Shorter,,15,
89,"3212 W 23rd St #2 Panama City, FL 32405",False,100322374440868401385,"(30.190242, -85.703636)",Salon du Soleil,(850) 215-6699,,Beauty Salon,102696293482965494050,5.0,Salon du Soleil has been my spa of choice for ...,"Apr 17, 2013",Monica Rennspies,1.366229e+09,79,
95,"1632 Upper James St Hamilton, ON L9B 1K4 Canada",False,110147958419380126442,"(43.201834, -79.89394)",Pura Brazilian Jiu-Jitsu Ltd,(905) 383-4255,,Martial Arts School,105380954374902265985,5.0,I started training on a whim with PJ when he h...,"Jul 31, 2012",David Ditta,1.343772e+09,56,
115,"319 High St Portsmouth, VA 23704",False,112239162167490629056,"(36.835208, -76.300037)",Cafe Europa,(757) 399-6652,$$,Restaurant or Cafe Mediterranean Restaurant,104416104280560735445,5.0,Evey time we go its wonderful,"Oct 22, 2012",john williams,1.350927e+09,2,
118,"800 Taunton Rd E Oshawa, ON L1H 7K4 Canada",False,113850456948212447900,"(43.926525, -78.909483)",International Pool & Spa Centers,(905) 434-7727,,Hot Tub Store Hot Tub Repair Service Swimming ...,107003859996616973790,2.0,After purchasing my Hot tub at the Oshawa loca...,"Dec 6, 2012",Shawn Berger,1.354824e+09,53,
211,"3470 Washington Dr Eagan, MN 55122",False,108732042045623308799,"(44.830553, -93.171667)",Margaret M Benson CPA,(651) 405-8337,,Bookkeeping Service Certified Public Accountan...,114509022952995388132,5.0,Peggy is absolutely outstanding. She went abov...,"Mar 6, 2013",Brad Schmitz,1.362596e+09,26,
218,"24 Shelter Cove Ln Hilton Head Island, SC 29928",False,115194802977785589258,"(32.184166, -80.722132)",The Mall at Shelter Cove,(843) 686-3090,,Shopping Mall,114346273978858932532,1.0,There is pretty much nothing in this whole mal...,"Aug 12, 2011",C. Kohler,1.313169e+09,16,
258,"105 U.S. 301 Tampa, FL 33619",False,101892898706322307023,"(27.947971, -82.354417)",Tire Mania Auto Repair,(813) 246-4050,,Air Conditioning Repair... Oil Change Service ...,103396539593572438038,5.0,"Over-all, my experience with Tire Mania Auto S...","May 18, 2013",Marvin Ortiz,1.368937e+09,25,
273,"777 Waterside Dr Norfolk, VA 23510",False,110761635090805824593,"(36.843694, -76.289392)",Sheraton Norfolk Waterside Hotel,(757) 622-6664,,Hotel Banquet Hall Meeting Planning Service,105636967478404940652,2.0,I highly recommend that you do not stay at the...,"Sep 8, 2012",Trina Parker,1.347115e+09,139,


In [174]:
final["rating"].value_counts()

5.0    4792
4.0    1816
3.0    1032
2.0     805
1.0     596
Name: rating, dtype: int64

In [185]:
pd.isnull("adfas")

False

In [195]:
def final_rating(row):
    if pd.isnull(row["translation"]):
        return row["reviewText"]
    return row["translation"][12:]

In [None]:
final["reviewText"]

In [196]:
final["reviews"] = final.apply(final_rating, axis = 1)

In [198]:
final = final.drop(["translation", "reviewText"], axis = 1)

In [200]:
final.head()

Unnamed: 0,address,closed,gPlusPlaceId,gps,name,phone,price,categories,gPlusUserId,rating,reviewTime,reviewerName,unixReviewTime,english score,reviews
9,Naunynstraße 60 10997 Berlin Germany,False,103592273547758141036,"(52.502139, 13.420115)",Trinkteufel,030 6147128,,Pub,107854905670749394534,5.0,"Sep 3, 2012",Alex Parasense,1346682000.0,5,Punky and Funky. Be careful if you are normal.
23,"6835 State Road 54 New Port Richey, FL 34653",False,112761994004427009119,"(28.217362, -82.700674)",New Port Richey Florist,(727) 849-2222,,,118059946268191865657,5.0,,Jennifer Shorter,,15,Joanne opened the shop during Memorial Day wee...
89,"3212 W 23rd St #2 Panama City, FL 32405",False,100322374440868401385,"(30.190242, -85.703636)",Salon du Soleil,(850) 215-6699,,Beauty Salon,102696293482965494050,5.0,"Apr 17, 2013",Monica Rennspies,1366229000.0,79,Salon du Soleil has been my spa of choice for ...
95,"1632 Upper James St Hamilton, ON L9B 1K4 Canada",False,110147958419380126442,"(43.201834, -79.89394)",Pura Brazilian Jiu-Jitsu Ltd,(905) 383-4255,,Martial Arts School,105380954374902265985,5.0,"Jul 31, 2012",David Ditta,1343772000.0,56,I started training on a whim with PJ when he h...
115,"319 High St Portsmouth, VA 23704",False,112239162167490629056,"(36.835208, -76.300037)",Cafe Europa,(757) 399-6652,$$,Restaurant or Cafe Mediterranean Restaurant,104416104280560735445,5.0,"Oct 22, 2012",john williams,1350927000.0,2,Evey time we go its wonderful


Working with nltk and sentiment analysis:

In [201]:
import nltk

In [257]:
stops = set(stopwords.words('english'))


In [261]:
all_terms = []
sentences = []

for row in final["reviews"]:
    sentences.append(row)
    all_terms += list([w.lower().strip(".") for w in row.split() if w not in stops])

Need to stem to get common terms:
http://www.nltk.org/howto/stem.html

In [264]:
from nltk.stem.porter import *

In [265]:
stem = PorterStemmer()

In [266]:
normed = [stem.stem(term) for term in all_terms]

In [262]:
from collections import Counter

In [274]:
common_terms = Counter(normed).most_common(500)
comm = [word[0] for word in common_terms]

In [278]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [374]:
vectorizer = TfidfVectorizer(max_features = 100)

In [375]:
X = vectorizer.fit_transform(sentences)

In [376]:
X[7232:]

<1809x100 sparse matrix of type '<class 'numpy.float64'>'
	with 10111 stored elements in Compressed Sparse Row format>

In [377]:
final = final.reset_index().drop("index", axis = 1)

In [378]:
int(len(final)/10)*8

7232

In [379]:
lens = int(len(final)/10)
train_y = final["rating"].iloc[:lens*8]
test_y = final["rating"].iloc[lens:(lens*8 + lens)]
valid_y = final["rating"].iloc[lens*8 + lens:]

train_x = vectorizer.fit_transform(final["reviews"].iloc[:lens*8])
test_x= vectorizer.fit_transform(final["reviews"].iloc[lens:(lens*8 + lens)])
valid_x = vectorizer.fit_transform(final["reviews"].iloc[lens*8 + lens:])




In [380]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [381]:
#X,Y = make_classification()
clf = RandomForestClassifier(random_state = 42)
clf.fit(train_x, train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [382]:
predictions = clf.predict(train_x)

In [383]:
results = pd.DataFrame({"actual": Y,"predictions": predictions})

In [384]:
sum(results["actual"] == results["predictions"])

6859

In [385]:
testing = clf.predict(test_x)

In [386]:
results = pd.DataFrame({"actual": test_y,"predictions": testing})

In [387]:
len(results)

7232

In [388]:
sum(results["actual"] == results["predictions"])

4857

# Training the Model:

In [389]:
from sklearn.model_selection import train_test_split


In [390]:
X_train, X_test, y_train, y_test = train_test_split(
    vectorizer.fit_transform(final["reviews"]), final["rating"], test_size = 0.2, random_state = 42)


In [394]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier()

parameter_search = dict()
for i in range(100, 1000, 50):
    vectorizer = TfidfVectorizer(max_features = i)
    X = vectorizer.fit_transform(sentences)
    scores = cross_val_score(clf, vectorizer.fit_transform(final["reviews"]),final["rating"], cv = 3)
    parameter_search[i] = scores





In [395]:
parameter_search

{100: array([0.51127321, 0.51277796, 0.42330677]),
 150: array([0.51027851, 0.53136409, 0.44754316]),
 200: array([0.53017241, 0.51742449, 0.47144754]),
 250: array([0.53481432, 0.5303684 , 0.45683931]),
 300: array([0.52884615, 0.54098905, 0.44455511]),
 350: array([0.53580902, 0.53667441, 0.46547145]),
 400: array([0.53348806, 0.54729505, 0.45385126]),
 450: array([0.54244032, 0.53833389, 0.46646746]),
 500: array([0.53415119, 0.53601062, 0.45119522]),
 550: array([0.54509284, 0.53269167, 0.46513944]),
 600: array([0.53779841, 0.55061401, 0.50265604]),
 650: array([0.53149867, 0.54331231, 0.48804781]),
 700: array([0.54476127, 0.544308  , 0.50564409]),
 750: array([0.5454244 , 0.53169598, 0.47543161]),
 800: array([0.5464191 , 0.54132094, 0.45683931]),
 850: array([0.53846154, 0.55658812, 0.47642762]),
 900: array([0.54177719, 0.53269167, 0.45318725]),
 950: array([0.54177719, 0.54862264, 0.46281541])}

In [393]:
fives = final[final["rating"] == 5.0]
fours = final[final["rating"] == 4.0]
threes = final[final["rating"] == 3.0]
two = final[final["rating"] == 2.0]
one = final[final["rating"] == 1.0]

array([0.52260198, 0.53693495, 0.50828729, 0.51270718, 0.50607735,
       0.50774336, 0.51219512, 0.50997783, 0.46452328, 0.48669623])

In [None]:
common_terms = Counter(normed).most_common(500)
comm = [word[0] for word in common_terms]

In [None]:
fives.