In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def get_df(fn, limit=None):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Parameters
    ----------
    fn : str or path-like
        Path of the file to read. The file is decoded as UTF-8.
    limit : int, optional
        Maximum number of records to load. A falsy value (``None`` or ``0``)
        loads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # relying on it makes the load fail or mis-decode text on some systems.
    with open(fn, encoding='utf-8') as f:
        for line in f:
            if limit and len(records) >= limit:
                break
            # Blank lines carry no record; skip them instead of letting
            # json.loads raise on an empty document.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

In [None]:
# Load up to 1,000,000 records of review.json and display the resulting frame.
# NOTE(review): DATA_PATH_PREFIX is not defined in any cell visible here —
# confirm it is set before this cell runs on a fresh kernel.
df_review = get_df(DATA_PATH_PREFIX + 'review.json', 1000000)
df_review

In [None]:
# Length of each review text, in characters.
df_review['review_len'] = df_review['text'].str.len()

In [None]:
# Number of exclamation marks per review ('!' has no special regex meaning,
# so the pattern is matched literally).
df_review['exclamation'] = df_review['text'].str.count('!')

In [None]:
def count_upper(text):
    """Return how many characters of ``text`` are uppercase."""
    return sum(1 for character in text if character.isupper())

# Count of uppercase characters in each review text.
df_review['upper'] = df_review['text'].apply(count_upper)

In [None]:
# Share of each review's characters that are uppercase.
df_review['rel_upper'] = df_review['upper'] / df_review['review_len']

In [None]:
# Display df_review again, now with the review_len, exclamation, upper and
# rel_upper columns added in the cells above.
df_review

In [None]:
# Pairwise plots of star rating and review length.
# Earlier plain-scatter alternative, left disabled:
# plt.scatter(x=df_review.stars, y=df_review.review_len)
sns.pairplot(data=df_review[['stars', 'review_len']])

In [None]:
# Per-star averages of the numeric columns. numeric_only=True is required:
# since pandas 2.0 the default no longer drops string columns (ids, text,
# date) and the aggregation raises on them.
df_review.groupby('stars').mean(numeric_only=True)

In [None]:
# Correlation heatmap of the numeric columns. Non-numeric columns are dropped
# explicitly because DataFrame.corr() no longer ignores them by default in
# pandas >= 2.0 and fails on string data.
sns.heatmap(df_review.select_dtypes(include='number').corr())

In [None]:
# Descriptive statistics for df_review.
df_review.describe()

In [None]:
# Distinct values present in the reviews' stars column.
df_review.stars.unique()

In [None]:
# Column dtypes, non-null counts and memory usage of df_review.
df_review.info()

In [None]:
# Load all of business.json (no record limit) and display the frame.
df_business = get_df(DATA_PATH_PREFIX + 'business.json')
df_business

In [None]:
# Descriptive statistics for df_business.
df_business.describe()

In [None]:
# Distinct values present in the businesses' stars column.
df_business.stars.unique()

In [None]:
# Column dtypes, non-null counts and memory usage of df_business.
df_business.info()

In [None]:
# Distinct values present in the businesses' state column.
df_business.state.unique()

In [None]:
# Flatten the comma-separated categories strings into one list of category
# names (whitespace-trimmed); rows with an empty/missing value are skipped.
cat = [
    name.strip()
    for categories in df_business.categories
    if categories
    for name in categories.split(',')
]

In [None]:
# De-duplicate the collected category names.
cat_set = set(cat)

In [None]:
# Number of distinct category names.
len(cat_set)

In [None]:
# Show the distinct category names.
cat_set

In [None]:
# First entry of each business's comma-separated categories string; rows with
# an empty/missing value are skipped. The entry is taken as-is (not trimmed).
cat_first = [
    categories.split(',')[0]
    for categories in df_business.categories
    if categories
]

In [None]:
# De-duplicate the first-listed category names.
cat_first_set = set(cat_first)

In [None]:
# Number of distinct first-listed category names.
len(cat_first_set)

In [None]:
# Number of businesses (non-null business_id values) per is_open value.
df_business.groupby('is_open').business_id.count()

In [None]:
# Load all of tip.json (no record limit) and display the frame.
df_tip = get_df(DATA_PATH_PREFIX + 'tip.json')
df_tip

In [None]:
# Descriptive statistics for df_tip.
df_tip.describe()

In [None]:
# Tips whose compliment_count is greater than zero.
df_tip.query('compliment_count > 0')

In [None]:
# Tip dates from newest to oldest. Sorting the Series (instead of building a
# Python list with sorted()) keeps the cell output truncated rather than
# rendering every date in the frame.
df_tip.date.sort_values(ascending=False)

In [None]:
# Load only the first 1000 records of user.json and display the frame.
df_user = get_df(DATA_PATH_PREFIX + 'user.json', 1000)
df_user

In [None]:
# Column names available in df_user.
df_user.columns

In [None]:
# Load all of checkin.json (no record limit) and display the frame.
df_checkin = get_df(DATA_PATH_PREFIX + 'checkin.json')
df_checkin

In [None]:
# Inspect a single raw value: first row, second column of df_checkin.
df_checkin.iloc[0,1]

In [None]:
# Distribution of star ratings across the loaded reviews.
sns.histplot(df_review.stars)

In [None]:
# Distribution of the per-review uppercase-character share.
sns.histplot(df_review.rel_upper)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Feature / target selection. The commented-out lines are alternative feature
# sets built from the engineered columns; the active choice is the raw review
# text, with the star rating as the target.
# X = np.array(df_review.review_len).reshape(-1, 1)
# X = df_review[['review_len', 'rel_upper', 'exclamation']]
# X = df_review[['rel_upper', 'exclamation']]
X = df_review.text
y = df_review.stars

In [None]:
# Hold out a test set. The seed is fixed so that the split — and every metric
# computed from it below — is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then apply that same
# fitted vectorizer to the test texts.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [None]:
# Model choice: multinomial naive Bayes is the active estimator; the
# commented-out lines are other estimators left disabled.
# logreg = LogisticRegression()
# linreg = LinearRegression()
# tree = DecisionTreeClassifier()
multinb = MultinomialNB()


In [None]:
# Fit the active model on the TF-IDF training matrix; the commented-out lines
# are the fits for the disabled estimators.
# logreg.fit(X_train, y_train)
# linreg.fit(X_train, y_train)
# tree.fit(X_train, y_train)
multinb.fit(X_train_vect, y_train)

In [None]:
# Predict star ratings for the TF-IDF test matrix with the active model; the
# commented-out lines are the predictions for the disabled estimators.
# y_pred = logreg.predict(X_test)
# y_pred = linreg.predict(X_test)
# y_pred = np.round(y_pred)
# y_pred = tree.predict(X_test)
y_pred = multinb.predict(X_test_vect)

In [None]:
# Distinct star values that actually occur among the predictions.
set(y_pred)

In [None]:
# Accuracy of the predictions against the true test-set stars.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix as a heatmap. The label list is the sorted union of true and
# predicted values (the same set confusion_matrix uses by default); passing it
# explicitly lets the ticks show the real star values instead of positional
# indices 0..n-1. Rows are the true stars, columns the predicted stars.
star_labels = sorted(set(y_test) | set(y_pred))
ax = sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=star_labels),
    xticklabels=star_labels,
    yticklabels=star_labels,
)
ax.set(xlabel='Predicted stars', ylabel='True stars');

In [None]:
from matplotlib.colors import LogNorm

# Same confusion matrix on a logarithmic color scale, so that small
# off-diagonal counts stay distinguishable next to the large diagonal ones.
# The label list is the sorted union of true and predicted values (the same
# set confusion_matrix uses by default); passing it explicitly lets the ticks
# show the real star values instead of positional indices 0..n-1.
# Rows are the true stars, columns the predicted stars.
star_labels = sorted(set(y_test) | set(y_pred))
ax = sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=star_labels),
    norm=LogNorm(),
    xticklabels=star_labels,
    yticklabels=star_labels,
)
ax.set(xlabel='Predicted stars', ylabel='True stars');