# Data Setup Script

This script is meant to create the initial "data sources" for the DS 2002 midterm. Because this project requires data to be pulled from different sources that don't yet exist, this script generates json files that can be served either from the filesystem or api endpoints, as well as be uploaded to mongodb

### Libraires
We first have to import the required libraries to manipulate the data

**Note:** If libraries are not yet installed run `pip install -r requirements.txt` from the parent directory

In [1]:
import os
import json
import numpy
import datetime
import pandas as pd

import pymongo
from sqlalchemy import create_engine

### Set up connection details

In [2]:
mysql_uid = "gab8un"
mysql_pwd = "Passw0rd123"
mysql_url = "mysql-gab8un.mysql.database.azure.com"

atlas_cluster_name = "cluster0.q0atcnd"
atlas_user_name = "gab8un"
atlas_password = "XwlxqfpWD8PxodaA"

mongo_connections = {
    "local": f"mongodb://localhost:27017/",
    "atlas": f"mongodb+srv://{atlas_user_name}:{atlas_password}@{atlas_cluster_name}.mongodb.net"
}

conn_str = mongo_connections['atlas']
source_db = 'sakila'

### Create functions to interact with SQL and Mongodb

In [3]:
def get_sql_dataframe(user_id, pwd, db_url, db_name, sql_query):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{db_url}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    conn = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, conn);
    conn.close()
    
    return dframe


def get_mongo_dataframe(connect_str, db_name, collection, query):
    '''Create a connection to MongoDB'''
    client = pymongo.MongoClient(connect_str)
    
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query)))
    dframe.drop(['_id'], axis=1, inplace=True)
    client.close()
    return dframe


def set_dataframe(user_id, pwd, db_url, db_name, df, table_name, pk_column, db_operation):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{db_url}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        connection.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()
    

### Retrieve data from the database
data is retrieved from the database and converted to json. One of the files is written to disk, and the other is exported to a mongoDB database

In [4]:
film_sql = "SELECT * FROM sakila.film";
film_df = customer_df = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_url, source_db, film_sql)
film_df.head(5)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42


In [5]:

customer_sql = "SELECT * FROM sakila.customer"
customer_df = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_url, source_db, customer_sql)
customer_df.head(5)



Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-15 04:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-15 04:57:20
2,3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7,1,2006-02-14 22:04:36,2006-02-15 04:57:20
3,4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8,1,2006-02-14 22:04:36,2006-02-15 04:57:20
4,5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9,1,2006-02-14 22:04:36,2006-02-15 04:57:20


### Output data to sources

In [6]:

film_file = open('films.json', 'w+')
film_file.write(film_df.to_json(orient='records'))
film_file.close()


In [8]:
mongo_client = pymongo.MongoClient(conn_str)
db = mongo_client[source_db]
collection_name = "customers"
db.drop_collection(collection_name)
json_obj = json.loads(customer_df.to_json(orient='records'))
collection = db[collection_name]
collection.insert_many(json_obj)

<pymongo.results.InsertManyResult at 0x7f98eed45d80>