In [1]:
import sys
import json
import csv
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from collections import Counter
from sklearn.base import TransformerMixin

In [2]:
businessjson = 'dataset/business.json'
checkinjson = 'dataset/checkin.json'

# Load Business data
business_json_lines = [json.loads( l.strip() ) for l in open(businessjson).readlines() ]
df = json_normalize(business_json_lines)


In [3]:
# Get records from Yelp set for only restaurants
df = df[df['categories'].astype(str).str.lower().str.contains('restaurant') == True]
df = df.reset_index(drop=True)

#Remove Columns of least significance
df = df.drop(['address','attributes.AcceptsInsurance','attributes.AgesAllowed','attributes.BYOB','attributes.ByAppointmentOnly','attributes.Corkage','attributes.DietaryRestrictions.dairy-free','attributes.DietaryRestrictions.gluten-free','attributes.DietaryRestrictions.halal','attributes.DietaryRestrictions.kosher','attributes.DietaryRestrictions.soy-free','attributes.DietaryRestrictions.vegan','attributes.DietaryRestrictions.vegetarian','attributes.HairSpecializesIn.africanamerican','attributes.HairSpecializesIn.asian','attributes.HairSpecializesIn.coloring','attributes.HairSpecializesIn.curly','attributes.HairSpecializesIn.extensions','attributes.HairSpecializesIn.kids','attributes.HairSpecializesIn.perms','attributes.HairSpecializesIn.straightperms','attributes.Open24Hours','attributes.RestaurantsCounterService','business_id','categories','hours.Friday','hours.Monday','hours.Saturday','hours.Sunday','hours.Thursday','hours.Tuesday','hours.Wednesday','latitude','longitude','name','neighborhood','state'],axis=1)

# Classify the records
df['target'] = [1 if df['stars'][x] >= 3.5 and df['review_count'][x] >= 30 else 0 for x in range(len(df))]
df.to_csv('business_csv_filter.csv', encoding='utf-8', index=False)
print df['target'].value_counts()

0    37140
1    14485
Name: target, dtype: int64


In [17]:
# for filling the missing values
from sklearn.base import TransformerMixin

class SeriesImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  

        """
    def fit(self, X, y=None):
        if   X.dtype == np.dtype('O'): self.fill = X.value_counts().index[0]
        else                            : self.fill = X.mean()
        return self

    def transform(self, X, y=None):
       return X.fillna(self.fill)


In [18]:
print Counter(df['attributes.Alcohol'])
a  = SeriesImputer()   # Initialize the imputer
a.fit(df['attributes.Alcohol'])              # Fit the imputer
df['attributes.Alcohol'] = a.transform(df['attributes.Alcohol'])
print Counter(df['attributes.Alcohol'])

Counter({u'none': 29706, u'full_bar': 15853, u'beer_and_wine': 6066})


In [19]:
print Counter(df['attributes.Ambience.casual'])
print df.groupby(['attributes.Ambience.casual','target']).size()


Counter({False: 21694, True: 18166, nan: 11765})
attributes.Ambience.casual  target
False                       0         17820
                            1          3874
True                        0          7912
                            1         10254
dtype: int64


In [18]:
# Seggregting data to positive and negative buckets.
#print df.loc[0]
pos_bucket = df.loc[df['target'] == 1].reset_index(drop=True)
neg_bucket = df.loc[df['target'] == 0].reset_index(drop=True)


37140
0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
37110    0
37111    0
37112    0
37113    0
37114    0
37115    0
37116    0
37117    0
37118    0
37119    0
37120    0
37121    0
37122    0
37123    0
37124    0
37125    0
37126    0
37127    0
37128    0
37129    0
37130    0
37131    0
37132    0
37133    0
37134    0
37135    0
37136    0
37137    0
37138    0
37139    0
Name: target, Length: 37140, dtype: int64


In [25]:
print Counter(pos_bucket['attributes.Alcohol'])
print pos_bucket.groupby(['attributes.Alcohol','target']).size()

pos_bucket.loc[pos_bucket['attributes.Alcohol'].isnull(),'attributes.Alcohol'] = 'full_bar'

print len(pos_bucket.loc[pos_bucket['attributes.Alcohol'].isnull(),'attributes.Alcohol'])

print pos_bucket['attributes.Alcohol']
print Counter(pos_bucket['attributes.Alcohol']),"$"
print pos_bucket.groupby(['attributes.Alcohol','target']).size(),"^"


Counter({u'full_bar': 6728, u'none': 5096, u'beer_and_wine': 2661})
attributes.Alcohol  target
beer_and_wine       1         2661
full_bar            1         6728
none                1         5096
dtype: int64
0
0             full_bar
1             full_bar
2        beer_and_wine
3                 none
4             full_bar
5             full_bar
6        beer_and_wine
7        beer_and_wine
8             full_bar
9                 none
10                none
11            full_bar
12            full_bar
13            full_bar
14                none
15            full_bar
16       beer_and_wine
17            full_bar
18                none
19            full_bar
20                none
21                none
22            full_bar
23            full_bar
24                none
25            full_bar
26       beer_and_wine
27            full_bar
28            full_bar
29            full_bar
             ...      
14455         full_bar
14456    beer_and_wine
14457             none
144