In [17]:
import numpy as np
from matplotlib import pylab as plt
%autosave 0
%matplotlib inline

Autosave disabled


In [18]:
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf 
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [19]:
import pandas as pd
import gzip
import json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [20]:
# feature selection
def select_features(df, max_features):
    X = df['reviewText']
    y = df['overall']

    features = np.array(X.tolist()[:max_features])
    scores = np.array(y.tolist()[:max_features])
    
    # # shuffle the datasets in the same order
    # p = np.random.permutation(max_features)
    # features = features[p]
    # scores = scores[p]

    final_labels = []
    
    # generate labels
    for i in range(len(features)):
        if scores[i] >= 3.5:
            final_labels.append(1)
        else:
            final_labels.append(0)
    final_labels = np.array(final_labels)
    return (features, final_labels)


In [21]:
def create_bow_model(features, max_features):
    # creates an object for each item inside of a given tensor then adds them all into a list
    text_dataset = tf.data.Dataset.from_tensor_slices(features)

    max_features = max_features  # Maximum vocab size.
    max_len = 4  # Sequence length to pad the outputs to.
    embedding_dims = 2

    # # Create the vectorization layer (layer for generating the bag of words) 
    vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='count')


    # # Now that the vocab layer has been created, call `adapt` on the text-only  
    # # dataset to create the vocabulary.  
    vectorize_layer.adapt(text_dataset.batch(64))
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    return model

In [22]:
#generate bag of words from selected features
def generate_bow(features, max_features, model):
    input_data = features
    input_data.shape = (max_features, 1)
    features_bow = model.predict(input_data)
    return features_bow

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

In [24]:
def run_random_forest(features, labels):
    classifier = RandomForestClassifier(n_estimators=10)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)

    classifier.fit(X_train, y_train)

    y_pred = classifier.predict_proba(X_test)
    #calculate f1 scores
    y_pred=y_pred[:,1]>=0.3
    y_pred_int = y_pred.astype(np.int)
    score = f1_score(y_test, y_pred_int)

    return score

In [25]:
def run_decision_tree(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)

    nc = DecisionTreeClassifier(random_state = 2)
    nc.fit(X_train,y_train)

    y_p = nc.predict(X_test)
    return np.mean(y_p)


In [26]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D

In [27]:
def build_nn_model():
    model = Sequential()
    #he taught us in class about relu it is more commonly used
    #sigmoid is used for the output layer
    #dense is basically adding layers to yout model for better input and output
    model.add(Dense(12, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))

    #binary_cross used for binary classificaiton problems
    # adam = for gradient descent 
    #metric because this is a classificaiton problem 
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [28]:
def run_ffnn(features, labels, model):
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)

    model.fit(X_train, y_train , epochs = 10 , batch_size = 10)
    loss, accuracy = model.evaluate(X_test, y_test)
    #the lower the loss the closer our predicitons are to the  labels 
    print(loss, accuracy)

In [29]:
df = getDF('Video_Games_5.json.gz')
max_features = 10000
features, labels = select_features(df, max_features)

# generate a vocabulary (bag of words) for a given set of text features 
bow_model = create_bow_model(features, max_features)
features_bow = generate_bow(features, max_features, bow_model)

In [30]:
random_forest_score = run_random_forest(features_bow, labels)
print(random_forest_score)

0.917952084017066


In [31]:
decision_tree_score = run_decision_tree(features_bow, labels)
print(decision_tree_score)

0.8521212121212122


In [32]:
nn_model = build_nn_model()
run_ffnn(features_bow, labels, nn_model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6846424341201782 0.8690909147262573
