# Firebase Testing with Python

This is the code that we hope to host on PythonAnywhere to communicate with the Firebase servers about various things, namely fetching CSVs to be used for analysis.

Please run the imports only once; there are issues with re-initializing the Firebase app.

In [1]:
import firebase_admin
from firebase_admin import credentials, firestore
from firebase_admin import storage

# Initialise the default Firebase app only when none is registered yet, so
# re-running this cell does not try to initialise it a second time.
if not firebase_admin._apps:
    cred = credentials.Certificate('mlforall-admin-sdk.json')
    firebase_admin.initialize_app(
        cred,
        options={'storageBucket': 'mlforall-14bf7.appspot.com'},
    )
# Handle to the storage bucket configured above; used by the blob helpers below.
bucket = storage.bucket()

# 'bucket' is an object defined in the google-cloud-storage Python library.
# See https://googlecloudplatform.github.io/google-cloud-python/latest/storage/buckets.html
# for more details.

## Reading CSV From Storage

Some stats stuff that we'll need

In [2]:
import pandas as pd
import numpy as np
import pickle
from io import StringIO

Modified from GCS Documentation, How to Download Files https://cloud.google.com/storage/docs/downloading-objects#code-samples

In [3]:
def get_blob(bucket, source_blob_name):
    """Download a CSV object from ``bucket`` and parse it into a DataFrame.

    Parameters
    ----------
    bucket
        Storage bucket object exposing ``blob(name)``.
    source_blob_name : str
        Path of the CSV object inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, decoded as UTF-8 and parsed by ``pd.read_csv``.
    """
    raw_bytes = bucket.blob(source_blob_name).download_as_string()
    # read_csv wants a text buffer, so decode the payload before wrapping it.
    return pd.read_csv(StringIO(raw_bytes.decode('utf-8')))

My Tests

In [4]:
def make_path(uid, project_title, file_name):
    """Return the storage path ``<uid>/<project_title>/<file_name>``.

    All three arguments must be strings; anything else raises ``TypeError``.
    """
    return "/".join((uid, project_title, file_name))

In [7]:
# Location of the CSV under test; make_path joins these as <uid>/<project_title>/<file_name>.
uid = "UDjMojFqWHOdW0fCIJPMNPScQ9p1"
project_title = "Spotify"
file_name = "simple_top50.csv"
# Alternative project (swap these in to test against the Pokemon dataset):
# project_title = "Pokemon"
# file_name = "Pokemon.csv"

In [8]:
# Download the configured CSV from the bucket into a DataFrame and preview the first rows.
df = get_blob(bucket, make_path(uid, project_title, file_name))
df.head()

Unnamed: 0,id,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
0,1,SeÒorita,Shawn Mendes,foreign pop,117,55,76,-6,8,75,191,4,3,79
1,2,China,Anuel AA,reggaeton,105,81,79,-4,8,61,302,8,9,92
2,3,boyfriend (with Social House),Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
3,4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
4,5,Goodbyes (Feat. Young Thug),Post Malone,rap,150,65,58,-4,11,18,175,45,7,94


## Slider Stats

We'll need to be able to update the `variables` field of our project document in Firestore so that we can build some nice sliders from each variable's range and quartiles.

In [9]:
from firebase_admin import firestore # already imported in the first cell; repeated here for emphasis

# Firestore client; with no arguments this presumably uses the default Firebase app — confirm.
db = firestore.client()

In [11]:
# proj_id = u'Ur5zFvWTPh6bnnAVjcEf' # unique ID for Pokemon
# proj_id = 'e4PkqmpN7UdbV0ms1pDH' # "delete later" project
proj_id = 'JvaRJzIQ133wbIY3Wa1P' # unique ID for Spotify

# Fetch the project's document snapshot from the "projects" collection.
project_ref = db.collection("projects").document(proj_id)
doc = project_ref.get()

# Report what was found before converting the snapshot.
message = "Document data: {}".format(doc.to_dict()) if doc.exists else "No such document!"
print(message)

# From here on `doc` is the plain dict form of the snapshot, not the snapshot itself.
doc = doc.to_dict()

Document data: {'title': 'Spotify', 'content': 'This will be my first classification project about Spotify. Project titles will serve both as directory names and titles.', 'models': ['log_reg', 'knn'], 'authorLastName': 'Huang', 'createdAt': DatetimeWithNanoseconds(2020, 5, 15, 22, 5, 35, 75000, tzinfo=<UTC>), 'authorFirstName': 'Len', 'variables': [{'lo': 85.0, 'hi': 190.0, 'name': 'Beats.Per.Minute', 'q1': 96.0, 'q2': 104.5, 'q3': 137.5}, {'hi': 88.0, 'name': 'Energy', 'q1': 55.25, 'q2': 66.5, 'q3': 74.75, 'lo': 32.0}, {'name': 'Popularity', 'q1': 86.0, 'q2': 88.0, 'q3': 90.75, 'lo': 70.0, 'hi': 95.0}], 'csvName': 'simple_top50.csv', 'authorID': 'UDjMojFqWHOdW0fCIJPMNPScQ9p1'}


In [12]:
def getInformation(df, input_variable):
    """Summarise one numeric column as the range/quartile record used for sliders.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``input_variable``.
    input_variable : str
        Name of a numeric column in ``df``.

    Returns
    -------
    dict
        ``{"name", "lo", "hi", "q1", "q2", "q3"}`` where ``lo``/``hi`` are the
        column minimum/maximum and ``q1``/``q2``/``q3`` are the 25%/50%/75%
        rows reported by ``Series.describe()``.

    Raises
    ------
    KeyError
        If the column does not exist, or is not numeric (its ``describe()``
        output then has no ``min``/``max``/percentile rows).
    """
    # Describe only the requested column, and read the statistics by label:
    # integer positions on a label-indexed Series are deprecated in pandas and
    # silently depend on the row order of describe().
    stats = df[input_variable].describe()
    return {
        "name": input_variable,
        "lo": stats["min"],
        "hi": stats["max"],
        "q1": stats["25%"],
        "q2": stats["50%"],
        "q3": stats["75%"],
    }

def getVariables(df, input_list):
    """Return one ``getInformation`` record per column name in ``input_list``.

    The records are returned in the same order as ``input_list``.
    """
    return [getInformation(df, column) for column in input_list]

In [13]:
# Disabled example of writing the computed stats back to the project document:
# project_ref.update({"variables" : getVariables(df, ["HP","Attack","Defense","Speed"])})
# Preview the slider record for a single column.
getVariables(df, ["Popularity"])

[{'name': 'Popularity',
  'lo': 70.0,
  'hi': 95.0,
  'q1': 86.0,
  'q2': 88.0,
  'q3': 90.75}]

In [14]:
# Cross-check: the min / 25% / 50% / 75% / max rows are the values getInformation reports.
df.describe()["Popularity"]

count    50.000000
mean     87.500000
std       4.491489
min      70.000000
25%      86.000000
50%      88.000000
75%      90.750000
max      95.000000
Name: Popularity, dtype: float64

## Uploading Pickles to Storage

First let's make some models

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [8]:
def build_logistic_regression(X, y):
    """Fit a default-configured ``LogisticRegression`` on ``X``/``y`` and return it."""
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LogisticRegression().fit(X, y)

def build_and_pickle(df, target_parameter, df_variables, pickle_name, debug=False):
    """Train a min-max-scaled logistic regression and return it as pickle bytes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is NOT modified by this function.
    target_parameter : str
        Column of ``df`` holding the class labels.
    df_variables : list of str
        Feature column names. Names not present in ``df`` are ignored, and the
        features are used in ``df``'s column order (not the order given here).
    pickle_name : str
        Currently unused; kept so existing callers keep working.
    debug : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    bytes
        ``pickle.dumps`` of a fitted pipeline (``MinMaxScaler`` followed by
        ``LogisticRegression``). The scaler is part of the pickled object so
        that ``predict`` can be called with raw, unscaled feature values.

    Raises
    ------
    KeyError
        If ``target_parameter`` is not a column of ``df``.
    """
    # Imported here so this cell does not depend on an edit to the imports cell.
    from sklearn.pipeline import make_pipeline

    y = df[target_parameter]

    # Select the features into a new array instead of dropping columns in
    # place: the caller's frame stays intact and the cell can be re-run.
    feature_columns = [col for col in df.columns if col in df_variables]
    X = df[feature_columns].to_numpy()

    # Same deterministic split as before; only the training part is used.
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, random_state=0)

    # Fit scaler and classifier as one estimator so the scaling learned on the
    # training data travels with the model when it is pickled.
    model = make_pipeline(MinMaxScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    return pickle.dumps(model)

def update_firestore():
    """Placeholder for the Firestore update step (TODO: not implemented yet).

    Raises
    ------
    NotImplementedError
        Always, until the function is written.
    """
    raise NotImplementedError("update_firestore is not implemented yet")

Some testing info

In [9]:
# target = 'Genre'
# variables = ['Beats.Per.Minute', 'Energy', 'Danceability','Loudness..dB..', 'Liveness', 'Valence.', 'Length.']

# NOTE(review): 'Type 1' and HP/Attack/Defense/Speed look like Pokemon columns, so `df`
# presumably has to be loaded from the Pokemon project (not Spotify) before this cell — confirm.
target = 'Type 1'
variables = ['HP', 'Attack', 'Defense','Speed']

# Name of the pickled model; also the last component of its storage path.
pickle_name = "log_reg"
path = make_path(uid, project_title, pickle_name)
# The fourth argument is the pickle name, so pass pickle_name rather than project_title.
pickle_bytes = build_and_pickle(df, target, variables, pickle_name, debug=True)

## Pickle Functions

In [10]:
#from joblib import dump, load

def p_send_blob(bucket, pickle_bytes, pickle_path):
    """Upload already-pickled bytes to ``pickle_path`` inside ``bucket``.

    Returns ``None``; the only effect is the upload.
    """
    bucket.blob(pickle_path).upload_from_string(pickle_bytes)

def p_get_blob(bucket, pickle_path):
    """Download the object at ``pickle_path`` from ``bucket`` and unpickle it.

    Returns whatever Python object the stored pickle encodes.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this on
    # objects this project uploaded itself, never on user-supplied files.
    return pickle.loads(bucket.blob(pickle_path).download_as_string())

Send our data to the storage

In [11]:
# Upload the pickled model bytes to `path` in the storage bucket.
p_send_blob(bucket, pickle_bytes, path)

Get the data back

In [12]:
def floatCast(num):
    """Convert ``num`` to ``float``; kept as a named helper for ``map``-style use."""
    return float(num)

def load_and_predict(model, prediction_variables):
    """Predict the label for a single sample.

    Parameters
    ----------
    model
        Object exposing ``predict(X)`` that accepts a list of samples.
    prediction_variables : iterable
        Feature values for one sample; each is cast to ``float``.

    Returns
    -------
    The first (and only) element of ``model.predict``'s result.
    """
    sample = [floatCast(value) for value in prediction_variables]
    # predict() expects a batch, so wrap the single sample in a list.
    predictions = model.predict([sample])
    return predictions[0]

In [13]:
# Round-trip check: download and unpickle the model stored at `path`.
model = p_get_blob(bucket,path)
# One sample of feature values; presumably listed in the same order as `variables` — confirm.
# X_predict = [120,64,70,-5,14,54,200] # spotify
X_predict = [40, 10, 20, 80] # pokemon

load_and_predict(model, X_predict)

'Psychic'

# Kaggle Downloads

In [49]:
from kaggle.api.kaggle_api_extended import KaggleApi
# Create the Kaggle API client and authenticate it.
# NOTE(review): presumably reads credentials from the local Kaggle config — confirm.
api = KaggleApi()
api.authenticate()



Code taken / modified from https://github.com/Kaggle/kaggle-api/blob/master/kaggle/api/kaggle_api_extended.py and https://github.com/Kaggle/kaggle-api/blob/master/kaggle/api/kaggle_api.py and https://github.com/Kaggle/kaggle-api/blob/3ce5046d42d55b951ad301053e002930cead1cbc/kaggle/api/kaggle_api.py#L1506

In [2]:
def getSlugs(dataset):
    """Split a Kaggle dataset reference into ``(owner_slug, dataset_slug)``.

    ``dataset`` is a string in the format ``[owner]/[dataset-name]``. Only the
    first two ``/``-separated parts are used. Returns ``None`` when the string
    contains no ``/``.
    """
    if '/' not in dataset:
        return None
    parts = dataset.split('/')
    return parts[0], parts[1]

In [3]:
# Split an example dataset reference into its owner and dataset slugs.
owner, dataset = getSlugs("avenn98/world-of-warcraft-demographics")
# Disabled download experiments, kept for reference:
# thread = api.datasets_download(owner, dataset)
#result = thread.get()
#downloads locally
#thread = api.dataset_download_files('avenn98/world-of-warcraft-demographics')
#response = thread.get()

# Dataworld Downloads

In [100]:
import datadotworld as dw

# helpful to use dir() !

def loadDf(path):
    """Load the first table of the data.world dataset at ``path``.

    ``path`` is a dataset key such as ``owner/dataset-id``. The return value is
    whatever the first table's loader produces (presumably a pandas DataFrame).
    """
    dataset = dw.load_dataset(path)
    # NOTE(review): `_dict` and `_loader_func` are private attributes of the
    # datadotworld objects and may change between library versions.
    lazy_tables = list(dataset.dataframes._dict.values())
    first_table = lazy_tables[0]
    return first_table._loader_func()

def getPath(url):
    """Turn a data.world dataset URL into its ``owner/dataset-id`` key.

    Only a leading ``https://data.world/`` or ``http://data.world/`` is
    removed, so the host text is never stripped from the middle of a string.
    Trailing slashes, as left by a copied browser URL, are dropped. A string
    that is already a bare ``owner/dataset-id`` key is returned unchanged.
    """
    for prefix in ("https://data.world/", "http://data.world/"):
        if url.startswith(prefix):
            url = url[len(prefix):]
            break
    return url.rstrip("/")

def getDf(url):
    """Load the first table of a data.world dataset given its URL or dataset key."""
    dataset_path = getPath(url)
    return loadDf(dataset_path)

### Example paths:
(From https://data.world/datasets/open-data)
- jonloyens/an-intro-to-dataworld-dataset
- https://data.world/dcopendata/swimming-pools
- https://data.world/makeovermonday/2019w51

In [101]:
# Load one of the example data.world datasets and preview the first rows.
# NOTE: this rebinds `df`, replacing the frame loaded from Firebase Storage earlier.
df = getDf("https://data.world/makeovermonday/2019w51")
df.head()

Unnamed: 0,season,rank,team,g,w,l,def_rtg,rtg_vs_league_avg
0,1996-97,1.0,Miami Heat,82.0,61.0,21.0,99.2,0.944948
1,1996-97,2.0,New York Knicks,82.0,57.0,25.0,99.5,0.947806
2,1996-97,3.0,Atlanta Hawks,82.0,56.0,26.0,100.3,0.955426
3,1996-97,4.0,Chicago Bulls,82.0,69.0,13.0,100.7,0.959237
4,1996-97,5.0,Cleveland Cavaliers,82.0,42.0,40.0,100.8,0.960189
