# ***Language "processing"***

### ***Dataset :*** <u>***review_1819.csv***</u>

***Import necessary Python modules***

In [1]:
# Dataframes
import pandas as pd
import numpy as np

# Language detection
import fasttext as ft
from pycountry import pycountry
from pycountry import languages

### ***1. Load Dataset***

In [3]:
# Load review_1819.csv into dataframe
# NOTE(review): the displayed frame has an 'Unnamed: 0' column — presumably an
# index that was written out when the CSV was saved; confirm whether it is needed.

df_lang = pd.read_csv('../data/review_1819.csv')

In [4]:
# Peek at the first row to check the columns that were loaded
df_lang.head(1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,2018


### ***2. Set up model***

In [24]:
# Language identification with fastText

# Pretrained fastText model used to identify the language of each review
fasttext_model = ft.load_model('../data/lid.176.bin')

# Minimum probability (95%) a predicted label must reach to be returned
accuracy = 0.95

# One prediction per review text, collected in row order.
# Newlines in the text are replaced with spaces before predicting.
language_list = [
    fasttext_model.predict(review.replace("\n", " "), k=-1, threshold=accuracy)
    for review in df_lang['text']
]



In [6]:
# Display the dataframe (pandas truncates the rendered output)
df_lang

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,2018
1,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4.0,1,0,1,I was really between 3 and 4 stars for this on...,2018-07-17 03:30:07,2018
2,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5.0,0,0,0,My boyfriend and I tried this deli for the fir...,2018-08-23 21:39:38,2018
3,E9AB7V4z8xrt2uPF7T55FQ,iYY5Ii1LGpZCpXFkHlMefw,Zx7n8mdt8OzLRXVzolXNhQ,5.0,0,0,0,Amazing biscuits and (fill in the blank). Grea...,2018-04-27 23:03:21,2018
4,A4n4YaE-owOVgTQcrVqHUw,S7bjj-L07JuRr-tpX1UZLw,I6L0Zxi5Ww0zEWSAVgngeQ,4.0,0,0,0,The cafe was extremely cute. We came at 8am an...,2018-07-07 20:50:12,2018
...,...,...,...,...,...,...,...,...,...,...
1813641,34M6AEbY84174OBerbm96Q,i48cHEyRBl5g9_npYIG7dA,ReVpjIDupK_VMPn7ZxPvOQ,4.0,2,0,1,This place never fails the food is absolutely ...,2019-08-21 20:49:13,2019
1813642,_b_SOIkWHsNPRGW4RM3X5w,L_fdAnrH5Pxi4PqF08_ToA,ez4kMLP6OJEIaMbMrrGRdA,5.0,1,1,1,My sister and I went on the ghost tour with Th...,2018-12-10 16:31:32,2018
1813643,cACxcUY_AIsQKkpDRXuqnw,MCzlzlOw7IGbRAKVjJBPtg,fcGexL5VH5G2Xw0tRj9uOQ,3.0,1,1,0,This is a good pizza option - they deliver thr...,2018-03-13 13:54:48,2018
1813644,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30,2019


In [25]:
# Set language list as new column in dataframe

# Each entry of language_list is split into a 'language' and a 'probability' column.
language_df = pd.DataFrame(language_list, columns=['language', 'probability'])
df_lang['language'] = language_df['language'].astype(str)

# Reduce the stringified prediction to the bare language code: drop the literal
# text 'label', then strip every character that is not a letter or a space.
# `regex` is passed explicitly because the default of Series.str.replace changed
# from True to False in pandas 2.0; without it the character-class pattern
# would be matched literally and nothing would be stripped.
df_lang['language'] = (
    df_lang['language']
    .str.replace('label', "", regex=False)
    .str.replace(r"[^a-zA-Z ]+", "", regex=True)
    .str.strip()
)

  df_lang['language'] = df_lang['language'].str.replace('label',"").str.replace(r"[^a-zA-Z ]+","").str.strip();


In [27]:
# Spot-check 20 random rows and their detected language.
# A fixed random_state makes the sample reproducible across kernel restarts.
df_lang.sample(20, random_state=42)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,language
1528615,JvHh2KiXZLR-18YPmG94RQ,vl2PwXe-62y-JLw6SpEseQ,3iUCCf1FWmjlFbGYvBgf9w,1.0,1,0,0,Ahahahaha this place is a joke if you get ther...,2018-12-31 19:27:40,2018,en
448792,MMTag_sWIGZKJ7Fpn1H5oQ,Qxrr4dZk5aBFcSzuVqfiSw,oIr50siZDtDYCE_p5K8nYA,5.0,1,0,1,The next time I'm in Philly I'm getting a hote...,2019-08-19 17:17:09,2019,en
634457,mk8cugW-koOfNX95udofPw,z7TYhEoI_XtpzxxdqZb2zQ,6NMo2AB5IPHeIJcTBjtejg,5.0,0,0,0,This is a great place. I highly recommend for ...,2019-10-31 14:14:55,2019,en
36237,qSz2DSgw_dHUJQ_4rhKKrg,vUSujt3eKEnED5_PyBh_xA,3OGzmGqWwsyGLkhnxrA9Pw,5.0,0,0,0,We ordered a pizza from here because I receive...,2019-01-24 23:53:52,2019,en
312045,My78jRd8dzcE76fYJyyxyQ,xGlFzk7P0iwhafS3ygPU2Q,rhPJ_wHCc0aah2EmbAVmPQ,5.0,0,0,1,I bought my first suit here when I was 12 year...,2018-06-27 21:02:14,2018,en
603361,vSCBtWVOF3UFJ12K-42XRA,qGCfXlsjWuwZoGwiogihSw,TuQKGufA3lWZovSq-abpRQ,5.0,0,0,0,Our go-to for great Mexican food. It's hard t...,2019-01-19 00:02:03,2019,en
264373,Wnz1mhSKe-IDkKg8CobhSw,lzx4AFxrByuVxgyZwRMYYg,Sv1MEZP-mMfp8SmE0hwYEA,5.0,1,0,0,"4* food, 5* service\n\nIn fact, I swear one of...",2019-11-23 21:15:43,2019,en
1216570,7ZY85KjYztHPgLAU5nZWEw,GDpshOJvnZKS8amtD2LamQ,FGqkXBqo-3olcRdsw4iWMQ,2.0,2,0,2,I took my first trial class here last Friday 4...,2019-04-14 14:15:42,2019,en
676708,zBFGLiDapfMatzX1gZLduw,6tJrZqiH6nkS5ptg4hsQrQ,O69OCM8Em9ZoJoV_GVhw-Q,4.0,1,0,1,The ladies and the gentleman here work really ...,2019-03-21 16:44:46,2019,en
280942,m_ta10ZlzgkbJT5nHOO_lg,-vOWDX1ZYtTEZF7_tU_MPg,lRPOodYgeRScvDDDO8_Qjg,5.0,0,0,0,Amazing food! I had the gluten free pizza and ...,2019-07-30 12:31:01,2019,en


In [28]:
# Inspect the full review text of one row, selected by its integer position
# NOTE(review): the position 1624660 is hard-coded — presumably a row picked by hand; confirm why.
df_lang.iloc[1624660]['text']

'Same horrible food as Cancun. Same owner?  Same "cooks?" Don\'t waste your money like I did.'

In [30]:
# Keep only the reviews whose detected language code is exactly 'en'
df_lang = df_lang.loc[df_lang['language'].eq('en')]

In [31]:
# Display the dataframe after the language filter (pandas truncates the rendered output)
df_lang

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,language
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,2018,en
1,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4.0,1,0,1,I was really between 3 and 4 stars for this on...,2018-07-17 03:30:07,2018,en
2,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5.0,0,0,0,My boyfriend and I tried this deli for the fir...,2018-08-23 21:39:38,2018,en
4,A4n4YaE-owOVgTQcrVqHUw,S7bjj-L07JuRr-tpX1UZLw,I6L0Zxi5Ww0zEWSAVgngeQ,4.0,0,0,0,The cafe was extremely cute. We came at 8am an...,2018-07-07 20:50:12,2018,en
5,4KpIldEM-tdnrJLqYzRfZQ,Z5j9Xw_G0c7M2b1-iS67wg,HTqXI5S2XcSlh_ylx9sE6g,5.0,1,1,1,I've only had the cannolis here but they are a...,2018-03-23 14:35:33,2018,en
...,...,...,...,...,...,...,...,...,...,...,...
1813641,34M6AEbY84174OBerbm96Q,i48cHEyRBl5g9_npYIG7dA,ReVpjIDupK_VMPn7ZxPvOQ,4.0,2,0,1,This place never fails the food is absolutely ...,2019-08-21 20:49:13,2019,en
1813642,_b_SOIkWHsNPRGW4RM3X5w,L_fdAnrH5Pxi4PqF08_ToA,ez4kMLP6OJEIaMbMrrGRdA,5.0,1,1,1,My sister and I went on the ghost tour with Th...,2018-12-10 16:31:32,2018,en
1813643,cACxcUY_AIsQKkpDRXuqnw,MCzlzlOw7IGbRAKVjJBPtg,fcGexL5VH5G2Xw0tRj9uOQ,3.0,1,1,0,This is a good pizza option - they deliver thr...,2018-03-13 13:54:48,2018,en
1813644,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30,2019,en
