In [93]:
import numpy as np
from matplotlib import pylab as plt
%autosave 0
%matplotlib inline

Autosave disabled


In [94]:
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf 
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [95]:
import pandas as pd
import gzip
import json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('./Video_Games_5.json.gz')



In [96]:
# feature selection

X = df['reviewText']
y = df['overall']

features = X.tolist()[:20000]
scores = y.tolist()[:20000]

final_labels = []


for i in range(len(features)):
    if scores[i] >= 3.5:
        final_labels.append(1)
    else:
        final_labels.append(0)

Vectorize all of the reviews using bag of words
- Keras vectorization implementation: https://keras.io/api/layers/preprocessing_layers/core_preprocessing_layers/text_vectorization/
- Overview of algorithm by Google Cloud Platform: https://www.youtube.com/watch?v=UFtXy0KRxVI

In [97]:
# Shuffle the original features array for building the vocabulary
vocab_set = np.array(X.tolist()[20000:40000])
np.random.shuffle(vocab_set)

# creates an object for each item inside of a given tensor, then adds them all into a list
text_dataset = tf.data.Dataset.from_tensor_slices(vocab_set)

max_features = 20000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.
embedding_dims = 2

# # Create the layer.  
vectorize_layer = TextVectorization(
max_tokens=max_features,
output_mode='count')


# # Now that the vocab layer has been created, call `adapt` on the text-only  
# # dataset to create the vocabulary. You don't have to batch, but for large  
# # datasets this means we're not keeping spare copies of the dataset.  
vectorize_layer.adapt(text_dataset.batch(64))
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

In [98]:
input_data = np.array([])
stop = 0
for item in text_dataset:
    if stop == max_features:
        break
    input_data = np.append(input_data, np.array(item))
    stop += 1

features = model.predict(input_data)
features.shape


(20000, 20000)

Features obtained from the predict function above will be used with the classifiers

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

final_labels = np.array(final_labels)

print(features.shape)
print(final_labels.shape)

classifier = RandomForestClassifier(n_estimators=10)


X_train, X_test, y_train, y_test = train_test_split(features, final_labels, test_size=0.33, random_state=42)

classifier.fit(X_train, y_train)

y_pred = classifier.predict_proba(X_test)
#calculate f1 scores

# if probability that 
y_pred=y_pred[:,1]>=0.3
y_pred_int = y_pred.astype(np.int)
score = f1_score(y_test, y_pred_int)
score


(20000, 20000)
(20000,)


0.9065606361829026