In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
from tensorflow.keras import layers

In [None]:
def create_end_to_end_model(weights_path = 'xtreme_lite_weights_text.h5', vocabulary=None):
  """Build a string-in / probability-out Keras model from saved weights.

  An inner model (Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense sigmoid)
  is created, its weights are loaded from `weights_path`, and it is wrapped
  behind a `TextVectorization` layer so the returned model accepts raw strings.

  Args:
    weights_path: Path passed to `model.load_weights` for the inner model.
    vocabulary: Optional vocabulary handed to the vectorization layer via
      `set_vocabulary`. When left as None the layer is used exactly as
      constructed, i.e. it is never adapted inside this function.
      NOTE(review): with no vocabulary the layer presumably maps every token
      to the out-of-vocabulary index, so predictions would not depend on the
      wording of the input -- confirm, and pass the training vocabulary.

  Returns:
    A compiled `tf.keras.Model` taking string inputs of shape (batch, 1) and
    returning the inner model's sigmoid output.

  Raises:
    FileNotFoundError: if `weights_path` does not exist (neither as a file nor
      as a TensorFlow checkpoint prefix with an ".index" file). This is a
      subclass of OSError, which Keras raised for the same condition.
  """
  max_features = 20000
  embedding_dim = 128
  sequence_length = 1000

  # Fail fast, before any graph is built, with a message that names the path.
  weights_file = os.fsdecode(weights_path)
  if not (os.path.exists(weights_file) or os.path.exists(weights_file + ".index")):
    raise FileNotFoundError(
        "Weights file not found: %r (working directory: %r)"
        % (weights_file, os.getcwd()))

  def create_model():
    """Create and compile the inner model that consumes vocab indices."""
    # An integer input for vocab indices.
    inputs = tf.keras.Input(shape=(None,), dtype="int64")

    # Next, we add a layer to map those vocab indices into a space of dimensionality
    # 'embedding_dim'.
    x = layers.Embedding(max_features, embedding_dim)(inputs)

    # Conv1D + global max pooling
    x = layers.Conv1D(128, 7, padding="same", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)

    # # # We add a vanilla hidden layer:
    # x = layers.Dense(128, activation="relu")(x)
    # x = layers.Dropout(0.5)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

  def custom_standardization(input_data):
    """Lowercase the text and strip every character in string.punctuation."""
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(
        lowercase, "[%s]" % re.escape(string.punctuation), "")

  model = create_model()
  model.load_weights(weights_path)

  vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,)
  if vocabulary is not None:
    # Give the layer the token -> index mapping the weights were trained with.
    vectorize_layer.set_vocabulary(vocabulary)

  inputs = tf.keras.Input(shape=(1,), dtype="string")
  # Turn strings into vocab indices
  indices = vectorize_layer(inputs)
  # Turn vocab indices into predictions
  outputs = model(indices)

  # Our end to end model
  end_to_end_model = tf.keras.Model(inputs, outputs)
  end_to_end_model.compile(
      loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
  )
  return end_to_end_model

In [None]:
# Build the end-to-end model from the default weights file, then score two
# hand-written blurbs as a quick smoke test.
final_model = create_end_to_end_model()
example_blurbs = np.array([
    'Limited edition Faderade tanks for the kickstarter community only!',
    'Every one is a winner!',
])
final_model.predict(example_blurbs)

OSError: ignored

In [None]:
# Colab-only: mount Google Drive so the dataset can be read from it.
from google.colab import drive
drive.mount("/content/drive")

path = 'drive/My Drive/Lambda/kickstarter/kickstarter.csv'
df = pd.read_csv(path, index_col=0)

# Binary target: 0 for 'failed', 1 for every other state; the text column
# 'state' is then dropped.
df['binary_state'] = [0 if state == 'failed' else 1 for state in df['state']]
df = df.drop(columns=['state'])

# Show the blurb stored under index label 1, then preview the frame.
print(df.blurb[[1]])
df.head(2)

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

In [None]:
from sklearn.metrics import classification_report
# Score the first 60% of the blurbs (in frame order) with the end-to-end model.
arr = np.array(df['blurb'])
n_scored = int(len(arr) * 0.6)
y_pred = final_model.predict(arr[:n_scored])

In [None]:
# Mean of the predicted probabilities.
y_pred.mean()

In [None]:
# Largest predicted probability.
y_pred.max()

In [None]:
# Smallest predicted probability.
y_pred.min()

In [None]:
# Keep the prediction range around for the min-max rescaling that follows.
MIN, MAX = y_pred.min(), y_pred.max()

In [None]:
# NOTE(review): this cell held the unfinished statement `c = `, which is a
# SyntaxError and stops "Restart & Run All". The intended right-hand side is
# not recoverable from the notebook and `c` is not used in any visible cell,
# so the assignment is disabled until it is completed or the cell is deleted.
# c = ...

In [None]:
# Min-max rescale the predictions using the cached MIN/MAX (maps onto [0, 1]
# when MAX > MIN), then print the max, min and mean of the rescaled values.
b = (y_pred - MIN) / (MAX - MIN)
for stat in (np.max, np.min, np.mean):
  print(stat(b))

In [None]:
# Median of the raw predicted probabilities (display only).
np.median(y_pred)

In [None]:
# Median of the min-max rescaled predictions `b` (display only).
np.median(b)

In [None]:
# Median prediction, used as the decision threshold when building `final`.
med = np.median(y_pred)

In [None]:
# Displays the median of y_pred again; same expression as an earlier cell.
np.median(y_pred)

In [None]:
# Hard labels: 1 where the prediction is strictly above the median, else 0.
# y_pred is expected to hold one value per row, so flattening yields one
# Python int per sample.
final = np.where(y_pred > med, 1, 0).ravel().tolist()

In [None]:
# Distinct labels present in `final` (sanity check that both classes occur).
{*final}

In [None]:
# True labels for the same leading 60% of rows that were scored into y_pred.
n_labelled = int(len(arr) * 0.6)
sample = df['binary_state'].iloc[:n_labelled].to_numpy(copy=True)

In [None]:
# Per-class precision / recall / F1 as a dictionary.
classification_report(y_true=sample, y_pred=final, output_dict=True)

In [None]:
# Fraction of median-thresholded labels that match the true labels.
accuracy_score(y_true=sample, y_pred=final)

In [None]:
# Recompute the min-max rescaled predictions directly from y_pred.
lo, hi = np.min(y_pred), np.max(y_pred)
b = (y_pred - lo) / (hi - lo)

In [None]:
# First five blurbs.
df['blurb'].head(5)

In [None]:
# First five binary labels, for comparison with the blurbs.
df['binary_state'].head(5)

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Token counts -> TF-IDF weights -> nearest-neighbour index over all blurbs.
pipeline_steps = [
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nn', NearestNeighbors()),
]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(df.blurb, df.binary_state)

In [None]:
# Push the blurb stored under label 0 through the fitted count and TF-IDF
# steps by hand, then ask the neighbour index for its closest rows.
fitted_steps = pipe.named_steps
x_t = fitted_steps['cv'].transform([df.blurb[0]])
x_t_t = fitted_steps['tfidf'].transform(x_t)
neighbors = fitted_steps['nn'].kneighbors(x_t_t, return_distance=False)[0]

In [None]:
# kneighbors() returns row positions in the matrix the pipeline was fitted on
# (df.blurb in frame order), so select by position with .iloc. Plain
# df.blurb[neighbors] would treat the numbers as index labels and could pick
# the wrong rows, or raise KeyError, if the index is not exactly 0..n-1.
for row in df.blurb.iloc[neighbors]:
  print(row)

In [None]:
# The query blurb used for the neighbour lookup, shown for comparison with
# the printed neighbours.
df.blurb[0]