# Assignment 2

**Keyu Long, Man Jiang**

In [1]:
# Intel Speedup
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [15]:
# utils
import os
import gzip
from collections import defaultdict
from tqdm import tqdm as progress_bar
import re

In [3]:
# foundation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# data preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
# models
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
# evaluations

# Loading Data

### First clean the data

In [57]:
%%time 
# this cell organize user's position and decide the valid user we wish to use
user_position = defaultdict(tuple)
valid_user = set()
with gzip.open('users.clean.json.gz') as f:
    for l in progress_bar(f):
        temp = eval(l)
        if temp['currentPlace'] is not None and len(temp['currentPlace']) == 2:
            valid_user.add(temp['gPlusUserId'])
            
            position_a = temp['currentPlace'][1][1]
            position_b = temp['currentPlace'][1][2]
            user_position[temp['gPlusUserId']] = (position_a, position_b)

3747937it [02:15, 27709.15it/s]

CPU times: total: 2min 15s
Wall time: 2min 15s





In [58]:
%%time 
# this cell organize place's gps, place's name, and decide the valid place we wish to use
place_gps = defaultdict(tuple)
place_name = defaultdict(str)
valid_place = set()
with gzip.open('places.clean.json.gz') as f:
    for l in progress_bar(f):
        temp = eval(l)
        if temp['closed'] == False and temp['gps'] is not None:
            if temp['address'][-1] == 'United States' or bool(re.match('.*[A-Za-z]{2}\s\d{5}.*', temp['address'][-1])): 
                # only filter out the united states data
                valid_place.add(temp['gPlusPlaceId'])
            
                place_gps[temp['gPlusPlaceId']] = tuple(temp['gps'])
                place_name[temp['gPlusPlaceId']] = temp['name']

3114353it [02:21, 21984.74it/s]

CPU times: total: 2min 21s
Wall time: 2min 21s





In [59]:
len(valid_place)

1271442

In [60]:
# # get all the places names
# all_names = []
# for place in place_name:
#     all_names.append(place_name[place])
# # set up a tfidf vectorizer
# vectorizer = TfidfVectorizer()
# vectorizer.fit(all_names)
# del(all_names)

In [61]:
len(valid_user), len(valid_place)

(737639, 1271442)

In [62]:
%%time
# this cell record the reviews information (in the valid pairs)
X_full = []
y_full = []
with gzip.open('reviews.clean.json.gz') as f:
    for l in progress_bar(f):
        temp = eval(l)
        user = temp['gPlusUserId']
        place = temp['gPlusPlaceId']
        rating = temp['rating']
        review = temp['reviewText']
        if user in valid_user and place in valid_place:
            if review is not None:
                X_full.append((user, place, review))
                y_full.append(rating)

11453845it [04:58, 38334.30it/s]

CPU times: total: 4min 58s
Wall time: 4min 58s





In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.1, random_state=42)

In [64]:
# get the train data average ratings for user & place
ratings_per_user = defaultdict(list)
ratings_per_place = defaultdict(list)
overall_ratings = []

for (user, place, review), rating in zip(X_train, y_train):
    ratings_per_user[user].append(rating)
    ratings_per_place[user].append(rating)
    overall_ratings.append(rating)
    
average_rating = np.mean(overall_ratings)

In [14]:
vectorizer.transform([place_name['100062881646354125752']])

<1x832321 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [65]:
def feature(user, place, review, ratings_per_user=ratings_per_user, ratings_per_place=ratings_per_place):
    """
    This method that returns the features.
    
    input:
        user_id
        place_id
    output:
        user_position_a, 
        user_position_b, 
        average_user_ratings
        place_position_a, 
        place_position_b, 
        average_place_ratings,
        place_name
    """
    output = [review]
    
    # get the user positions
    user_position_a, user_position_b = user_position[user]
    output.append(user_position_a)
    output.append(user_position_b)
    
    # get the average user ratings
    if user in ratings_per_user:
        output.append(np.mean(ratings_per_user[user]))
    else:
        output.append(average_rating)
        
    # get the place positions
    place_position_a, place_position_b = place_gps[place]
    output.append(place_position_a)
    output.append(place_position_b)
    
    # get the average place ratings
    if place in ratings_per_place:
        output.append(np.mean(ratings_per_place[place]))
    else:
        output.append(average_rating)
        
#     # append the place_name
#     name = place_name[place]
#     output.append(name)
    
    return output

In [66]:
%%time
X_train_final = []
for user, place, review in progress_bar(X_train):
    X_train_final.append(feature(user, place, review))

100%|██████████████████████████████████████████████████████████████████████| 768825/768825 [00:07<00:00, 101889.79it/s]

CPU times: total: 7.66 s
Wall time: 7.68 s





In [67]:
%%time
X_test_final = []
for user, place, review in progress_bar(X_test):
    X_test_final.append(feature(user, place, review))

100%|████████████████████████████████████████████████████████████████████████| 85426/85426 [00:00<00:00, 137267.23it/s]

CPU times: total: 641 ms
Wall time: 643 ms





In [68]:
# organize the data into pds and append the column name
column_names = ['review_text','user_pa', 'user_pb', 'user_avg', 'place_pa', 'place_pb', 'place_avg']
X_train_final_df = pd.DataFrame(X_train_final, columns=column_names)
y_train = np.array(y_train)
X_test_final_df = pd.DataFrame(X_test_final, columns=column_names)
y_test = np.array(y_test)

In [69]:
X_train_final_df

Unnamed: 0,review_text,user_pa,user_pb,user_avg,place_pa,place_pb,place_avg
0,Commun. Sans plus.,-210538750,552286830,3.500000,37.878316,-122.269447,4.139453
1,Decent food.,476062100,-1223320710,3.000000,47.624097,-122.312474,4.139453
2,Food and prices are amazing however the place ...,404167020,-868752870,4.600000,40.419059,-86.888125,4.139453
3,Teddy's serves up great big burgers topped wit...,360331160,-867827770,3.947368,21.271764,-157.821682,4.139453
4,My wife and I are very happy with this store. ...,449799650,-932638360,4.333333,44.937585,-93.290963,4.139453
...,...,...,...,...,...,...,...
768820,I was here when it opened! The Goliath burrit...,302671530,-977430610,4.226667,30.245371,-97.757424,4.139453
768821,Good movies at a lower price.,423726400,-711096530,3.731707,42.396705,-71.122862,4.139453
768822,Very friendly service. The recipient of the f...,296516340,-823248260,5.000000,27.967926,-82.767256,4.139453
768823,I love this place for breakfast. I had the Fre...,279494360,-824651440,4.789474,40.745383,-74.002402,4.139453


In [70]:
# organize a pipeline
preproc = ColumnTransformer(
    transformers = [
        ('std', StandardScaler(), ['user_pa', 'user_pb', 'user_avg', 'place_pa', 'place_pb', 'place_avg']),
        ('tfidf', TfidfVectorizer(decode_error='ignore'), 'review_text')
    ]
)
pl = Pipeline([
    ('preprocessor', preproc),
    ('log_clf', LogisticRegression(n_jobs=-1))
])

In [71]:
pl.fit(X_train_final_df, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('std', StandardScaler(),
                                                  ['user_pa', 'user_pb',
                                                   'user_avg', 'place_pa',
                                                   'place_pb', 'place_avg']),
                                                 ('tfidf',
                                                  TfidfVectorizer(decode_error='ignore'),
                                                  'review_text')])),
                ('log_clf', LogisticRegression(n_jobs=1))])

In [72]:
pl.score(X_train_final_df, y_train)

0.7036594803758983

In [73]:
pl.score(X_test_final_df, y_test)

0.625711141806944

In [18]:
len(X_train)

2137548

In [65]:
vectorizer.transform([place_name['100062881646354125752']])

<1x832321 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [100]:
data[8]

{'name': '天山湯治郷',
 'price': None,
 'address': ['208 Yumotochaya',
  'Hakone, Ashigarashimo District, Kanagawa 250-0312',
  'Japan'],
 'hours': None,
 'phone': '0460-86-4126',
 'closed': False,
 'gPlusPlaceId': '100062881646354125752',
 'gps': [35.224978, 139.088382]}

In [None]:
%%a

In [82]:
len(data)

737639

<function defaultdict.keys>

In [86]:
list(data.keys)

TypeError: 'builtin_function_or_method' object is not iterable

In [64]:
data = []
with gzip.open('users.clean.json.gz') as f:
    i = 0
    for l in f:
        temp = eval(l)
        if temp['currentPlace'] is not None:
            data.append(temp)
        i += 1
        if i >= 100:
            break

In [73]:
data[3]['currentPlace'][1][1]

418954660

In [33]:
data[6]

{'userName': 'Bharti Phand',
 'jobs': None,
 'currentPlace': None,
 'previousPlaces': None,
 'education': [[[], [], [], [], [], 6], []],
 'gPlusUserId': '100000036174088924566'}

In [57]:
data[29]

{'userName': 'William Corcuera',
 'jobs': None,
 'currentPlace': None,
 'previousPlaces': None,
 'education': [[[], [], [], [], [], 6], []],
 'gPlusUserId': '100000122158721897485'}

In [44]:
data[16]

{'userName': 'Aniello Prezioso',
 'jobs': [['Stato Maggiore Difesa',
   'LGT',
   [[1, 1, 1979], [1, 1, 2012], 1],
   '',
   '']],
 'currentPlace': ['roma', [[], 418954660, 124823240, 1]],
 'previousPlaces': [['salerno', [[], 406779570, 147659120, 1]]],
 'education': [[[], [], [], [], [], 6],
  [['giacinto vicinanza a salerno', 'elementari', [[], [1, 1, 1965]], '', ''],
   ['giacinto vicinanza', '', [[], [1, 1, 1970]], '', '']]],
 'gPlusUserId': '100000067656171981860'}

In [27]:
data[0]['jobs']

[['Tổng công ty IDICO',
  'Chuyên viên Kỹ thuật XD',
  [[1, 1, 1998], [1, 1, 2013], 1],
  '',
  '']]