In [4]:
# run this to shorten the data import from the files
import os
# Both paths are anchored one level above the current working directory.
path_data = os.path.join(os.path.dirname(os.getcwd()), 'datasets/')
# Despite its name, cwd holds the *parent* directory, with a trailing slash.
cwd = f"{os.path.dirname(os.getcwd())}/"


In [None]:
# exercise 01

"""
Ingesting JSON data with pandas

When developing a data pipeline, you may have to work with non-tabular data and data sources, such as APIs or JSON files. In this exercise, we'll practice extracting data from a JSON file using pandas.

pandas has been imported as pd, and the JSON file you'll ingest is stored at the path "testing_scores.json".
"""

# Instructions

"""

    Update the extract() function to read a JSON file into a pandas DataFrame, orienting by records.

    Pass the path testing_scores.json to the extract() function, and store the output to a variable called raw_testing_scores.

    Print the head of the raw_testing_scores DataFrame.

"""

# solution

def extract(file_path):
    """Read the JSON file at ``file_path`` into a pandas DataFrame.

    The file is parsed with ``orient="records"``, i.e. it is expected to
    hold a list of row objects that each map column names to values.
    """
    testing_scores = pd.read_json(file_path, orient="records")
    return testing_scores

# Run the extraction against the course file and keep the resulting DataFrame
raw_testing_scores = extract(file_path='testing_scores.json')

# Preview the first rows of the ingested data
print(raw_testing_scores.head())


#----------------------------------#

# Conclusion

"""
Excellent data extracting! Ingesting JSON files into pandas DataFrames is the first step in preparing non-tabular data for further transformation.
"""

'/home/nero/Documents/Estudos/DataCamp'

In [1]:
# exercise 02

"""
Reading JSON data into memory

When data is stored in JSON format, it's not always easy to load into a DataFrame. This is the case for the "nested_scores.json" file. Here, the data will have to be manually manipulated before it can be stored in a DataFrame.

To help get you started, pandas has been loaded into the workspace as pd.
"""

# Instructions

"""


    Use pandas to read the JSON file into a DataFrame.
    Pass the "nested_scores.json" file to the extract() function.
---
    Import the json library.
    Use the json library to load the "nested_scores.json" file into memory.

"""

# solution

def extract(file_path):
    """Read the JSON file at ``file_path`` into a pandas DataFrame.

    Parsing uses ``orient="index"``, so the top-level keys of the JSON
    object become the row labels of the returned DataFrame.
    """
    nested_scores = pd.read_json(file_path, orient="index")
    return nested_scores

# Ingest the nested scores file through extract() and preview the result
raw_testing_scores = extract(file_path="nested_scores.json")
print(raw_testing_scores.head())


#----------------------------------#

# Import the json library
import json

def extract(file_path):
    """Load the JSON document stored at ``file_path`` into memory.

    Returns whatever ``json.load`` produces for the file's top-level value
    (a ``dict`` when the document is a JSON object).
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale encoding, which differs across systems.
    with open(file_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)

# Load the nested scores file into memory with extract()
raw_testing_scores = extract(file_path="nested_scores.json")

# Display the loaded object in full
print(raw_testing_scores)


#----------------------------------#

# Conclusion

"""
You're off to a great start! The data from the JSON file has been loaded into a dictionary in-memory.
"""

'\n\n'

In [2]:
# exercise 03

"""
Iterating over dictionaries

Once JSON data is loaded into a dictionary, you can leverage Python's built-in tools to iterate over its keys and values.

The "nested_school_scores.json" file has been read into a dictionary stored in the raw_testing_scores variable, which takes the following form:

{
    "01M539": {
        "street_address": "111 Columbia Street",
        "city": "Manhattan",
        "scores": {
              "math": 657,
              "reading": 601,
              "writing": 601
        }
  }, ...
}

"""

# Instructions

"""


    Loop through the keys of the raw_testing_scores dictionary.
    Add each key to the raw_testing_scores_keys list.
---
    Now, loop through a list of values from the raw_testing_scores dictionary.
---
    Finally, loop through both the keys and values of the raw_testing_scores dictionary, simultaneously.

"""

# solution

# Collect every key of the raw_testing_scores dictionary into a list
raw_testing_scores_keys = [school_id for school_id in raw_testing_scores.keys()]

# Show the first three keys
print(raw_testing_scores_keys[:3])


#----------------------------------#

# Collect every value of the raw_testing_scores dictionary into a list
raw_testing_scores_values = [school_info for school_info in raw_testing_scores.values()]

# Show the first three values
print(raw_testing_scores_values[:3])


#----------------------------------#

# Walk the (key, value) pairs of raw_testing_scores, splitting them into
# one list of keys and one list of values
raw_testing_scores_keys = [school_id for school_id, _ in raw_testing_scores.items()]
raw_testing_scores_values = [school_info for _, school_info in raw_testing_scores.items()]

# Show the first three entries of each list
print(raw_testing_scores_keys[:3])
print(raw_testing_scores_values[:3])


#----------------------------------#

# Conclusion

"""
Great iteration! Iterating through both the keys and values of dictionaries lays a foundation for working with non-tabular JSON data. Keep up the good work!
"""

'\n\n'

In [3]:
# exercise 04

"""
Parsing data from dictionaries

When JSON data is loaded into memory, the resulting dictionary can be complicated. Key-value pairs may contain another dictionary; these are called nested dictionaries. These nested dictionaries are frequently encountered when dealing with APIs or other JSON data. In this exercise, you will practice extracting data from nested dictionaries and handling missing values.

The dictionary below is stored in the school variable. Good luck!

{
    "street_address": "111 Columbia Street",
    "city": "Manhattan",
    "scores": {
        "math": 657,
        "reading": 601
    }
}

"""

# Instructions

"""

    Parse the value stored at the "street_address" key from the school dictionary.

    Parse the value stored at the "scores" key from the school dictionary.

    Parse the values stored at the "math", "reading", and "writing" keys from the scores dictionary, and set the default value to 0.

"""

# solution

# Top-level fields are read with .get(), which yields None for an absent key
street_address = school.get("street_address")
scores = school.get("scores")

# Each subject falls back to 0 when it is missing from the scores dictionary
math_score, reading_score, writing_score = (
    scores.get(subject, 0) for subject in ("math", "reading", "writing")
)

print(f"Street Address: {street_address}")
print(f"Math: {math_score}, Reading: {reading_score}, Writing: {writing_score}")


#----------------------------------#

# Conclusion

"""
Great work! Understanding how to pull data from nested dictionaries is a valuable skill when working with non-tabular data.
"""

'\n\n'

In [4]:
# exercise 05

"""
Transforming JSON data

Chances are, when reading data from JSON format into a dictionary, you'll probably have to apply some level of manual transformation to the data before it can be stored in a DataFrame. This is common when working with nested dictionaries, which you'll have the opportunity to explore in this exercise.

The "nested_school_scores.json" file has been read into a dictionary available in the raw_testing_scores variable, which takes the following form:

{
    "01M539": {
        "street_address": "111 Columbia Street",
        "city": "Manhattan",
        "scores": {
              "math": 657,
              "reading": 601,
              "writing": 601
        }
  }, ...
}

"""

# Instructions

"""


    Loop through both the keys and values of the raw_testing_scores dictionary.

    Extract the "street_address" from each dictionary nested in the raw_testing_scores object.

"""

# solution

normalized_testing_scores = []

# Flatten each school's nested record into one row:
# [school_id, street_address, city, math, reading, writing]
for school_id, school_info in raw_testing_scores.items():
    # Look the nested dictionary up once. A school without a "scores" entry
    # falls back to an empty dict, so each subject takes its default of 0
    # instead of raising AttributeError on None.
    school_scores = school_info.get("scores") or {}
    normalized_testing_scores.append([
        school_id,
        school_info.get("street_address"),
        school_info.get("city"),
        school_scores.get("math", 0),
        school_scores.get("reading", 0),
        school_scores.get("writing", 0),
    ])

print(normalized_testing_scores)


#----------------------------------#

# Conclusion

"""
Outstanding! Using the json library and native-Python, you've extracted the JSON file into a list of lists.
"""

'\n\n'

In [5]:
# exercise 06

"""
Transforming and cleaning DataFrames

Once data has been curated into a cleaned Python data structure, such as a list of lists, it's easy to convert this into a pandas DataFrame. You'll practice doing just this with the data that was curated in the last exercise.

Per usual, pandas has been imported as pd, and the normalized_testing_scores variable stores the list of each school's testing data, as shown below.

[
    ['01M539', '111 Columbia Street', 'Manhattan', 657.0, 601.0, 601.0],
    ...
]   

"""

# Instructions

"""


    Create a pandas DataFrame from the list of lists stored in the normalized_testing_scores variable.

    Set the column names for the normalized_data DataFrame.


"""

# solution

# Create the DataFrame with its column names in a single step. Passing
# ``columns=`` to the constructor (rather than assigning ``.columns``
# afterwards) also works when normalized_testing_scores is an empty list,
# where the later assignment would fail with a length mismatch.
normalized_data = pd.DataFrame(
    normalized_testing_scores,
    columns=[
        "school_id",
        "street_address",
        "city",
        "avg_score_math",
        "avg_score_reading",
        "avg_score_writing",
    ],
)

# Use school_id as the index, then preview the first rows
normalized_data = normalized_data.set_index("school_id")
print(normalized_data.head())


#----------------------------------#

# Conclusion

"""
Congrats! You've extracted a JSON file, and manipulated it such that it can be stored in a pandas DataFrame for downstream transformation.
"""

'\n\n'

In [6]:
# exercise 07

"""
Filling missing values with pandas

When building data pipelines, it's inevitable that you'll stumble upon missing data. In some cases, you may want to remove these records from the dataset. But in others, you'll need to impute values for the missing information. In this exercise, you'll practice using pandas to impute missing test scores.

Data from the file "testing_scores.json" has been read into a DataFrame, and is stored in the variable raw_testing_scores. In addition to this, pandas has been loaded as pd.
"""

# Instructions

"""


    Print the head of the raw_testing_scores DataFrame, and observe the NaN values.
---


    Use the average of the "math_score" column to fill the NaN values in the "math_score" column.
    Print the head of the updated DataFrame.
---


    For the "math_score", "reading_score" and "writing_score" columns, update the transform() function to fill NaN values with the mean of the respective columns, in place.
    Print the head of the cleaned DataFrame.

"""

# solution

# Preview the raw data; missing scores appear as NaN
print(raw_testing_scores.head())


#----------------------------------#

# Replace missing math scores with the mean of the math_score column
math_score_mean = raw_testing_scores["math_score"].mean()
raw_testing_scores["math_score"] = raw_testing_scores["math_score"].fillna(math_score_mean)

# Preview the DataFrame again after the fill
print(raw_testing_scores.head())


#----------------------------------#

def transform(raw_data):
    """Fill missing math, reading and writing scores with their column means.

    The fill happens in place: ``raw_data`` itself is modified, and the same
    object is returned.
    """
    score_columns = ("math_score", "reading_score", "writing_score")

    # One replacement value per column: that column's own mean
    column_means = {column: raw_data[column].mean() for column in score_columns}

    raw_data.fillna(value=column_means, inplace=True)
    return raw_data

# Run the mean-imputation over the raw scores
clean_testing_scores = transform(raw_data=raw_testing_scores)

# Preview the imputed DataFrame
print(clean_testing_scores.head())

#----------------------------------#

# Conclusion

"""
Nicely done! Working with missing values is something that takes practice, and an understanding of the problem at hand. Thanks to pandas, it's easy to implement a wide variety of logic using the .fillna() method. Keep up the great work!
"""

'\n\n'

In [7]:
# exercise 08

"""
Grouping data with pandas

The output of a data pipeline is typically a "modeled" dataset. This dataset provides data consumers easy access to information, without having to perform much manipulation. Grouping data with pandas helps to build modeled datasets.

pandas has been imported as pd, and the raw_testing_scores DataFrame contains data in the following form:

              street_address       city  math_score  reading_score  writing_score
01M539   111 Columbia Street  Manhattan       657.0          601.0          601.0
02M294      350 Grand Street  Manhattan       395.0          411.0          387.0
02M308      350 Grand Street  Manhattan       418.0          428.0          415.0

"""

# Instructions

"""


    Use .loc[] to only keep the "city", "math_score", "reading_score", and "writing_score" columns.
    Group the DataFrame by the "city" column, and find the mean of each city's math, reading, and writing scores.
    Use the transform() function to create a grouped DataFrame.

"""

# solution

def transform(raw_data):
    """Return the mean math, reading and writing score for each city.

    Only the ``city`` column and the three score columns are used. The
    result is a new DataFrame indexed by city; ``raw_data`` is not modified.
    """
    # Keep only the grouping key and the score columns
    needed_columns = ["city", "math_score", "reading_score", "writing_score"]
    city_scores = raw_data.loc[:, needed_columns]

    # The ``axis`` keyword is deliberately omitted: grouping rows is the
    # default, and passing ``axis`` to DataFrame.groupby is deprecated since
    # pandas 2.1 and no longer accepted by newer releases.
    grouped_data = city_scores.groupby(by=["city"]).mean()
    return grouped_data

# Aggregate the raw scores with transform() and preview the result
grouped_testing_scores = transform(raw_data=raw_testing_scores)
print(grouped_testing_scores.head())


#----------------------------------#

# Conclusion

"""
Great grouping! Leveraging pandas' aggregation capabilities helps to create report-ready datasets for downstream data consumers.
"""

'\n\n'

In [None]:
def find_street_name(row):
    """Return the street name from ``row["street_address"]``.

    The address is cut at its first space. When the leading token parses as
    an integer it is treated as the street number and dropped; otherwise the
    address is returned unchanged.
    """
    address = row["street_address"]
    leading_token, _, remainder = address.partition(" ")

    try:
        int(leading_token)
    except ValueError:
        # No numeric prefix, so there is nothing to strip
        return address
    else:
        return remainder

In [8]:
# exercise 09

"""
Applying advanced transformations to DataFrames

pandas has a plethora of built-in transformation tools, but sometimes, more advanced logic needs to be used in a transformation. The apply function lets you apply a user-defined function to a row or column of a DataFrame, opening the door for advanced transformation and feature generation.

The find_street_name() function parses the street name from the "street_address", dropping the street number from the string. This function has been loaded into memory, and is ready to be applied to the raw_testing_scores DataFrame.
"""

# Instructions

"""


    In the definition of the transform() function, use the find_street_name() function to create a new column with the name "street_name".

    Use the transform() function to clean the raw_testing_scores DataFrame.

    Print the head of the cleaned_testing_scores DataFrame, observing the new "street_name" column.

"""

# solution

def transform(raw_data):
    """Add a ``street_name`` column derived from each row's address.

    ``find_street_name`` is applied row by row (``axis=1``) and the result is
    stored on ``raw_data`` itself, so the caller's DataFrame is modified; the
    same object is returned.
    """
    street_names = raw_data.apply(find_street_name, axis=1)
    raw_data["street_name"] = street_names
    return raw_data

# Derive the street_name column for the raw scores
cleaned_testing_scores = transform(raw_data=raw_testing_scores)

# Preview the result, which now carries the street_name column
print(cleaned_testing_scores.head())


#----------------------------------#

# Conclusion

"""
Amazing stuff! Being able to 'apply' functions to a DataFrame can really help streamline data transformation, especially when the logic in a transformation is more complex than pandas can handle with its built-in functionality.
"""

'\n\n'

In [9]:
# exercise 10

"""
Loading data to a Postgres database

After data has been extracted from a source system and transformed to align with analytics or reporting use cases, it's time to load the data to a final storage medium. Storing cleaned data in a SQL database makes it simple for data consumers to access and run queries against. In this example, you'll practice loading cleaned data to a Postgres database.

sqlalchemy has been imported, and pandas is available as pd. The first few rows of the cleaned_testing_scores DataFrame are shown below:

             street_address       city  math_score  ... best_score
01M539  111 Columbia Street  Manhattan       657.0      Math
02M545     350 Grand Street  Manhattan       613.0      Math
01M292     220 Henry Street  Manhattan       410.0      Math

"""

# Instructions

"""


    Update the connection string to write to the schools database and create a connection object using sqlalchemy.
 
    Use pandas to write the cleaned_testing_scores DataFrame to the scores table in the schools database.
 
    If the table is already populated with data, make sure to replace the values with the current DataFrame.

"""

# solution

# Engine for the schools database on the local Postgres server
connection_uri = "postgresql+psycopg2://repl:password@localhost:5432/schools"
db_engine = sqlalchemy.create_engine(connection_uri)

# Write the DataFrame to the scores table, replacing any existing contents
# and leaving the DataFrame index out of the table
cleaned_testing_scores.to_sql("scores", db_engine, if_exists="replace", index=False)


#----------------------------------#

# Conclusion

"""
Lovely loading! The .to_sql() method is a powerful tool that helps to simplify the 'load' component of ETL pipelines.
"""

'\n\n'

In [None]:
# exercise 11

"""
Validating data loaded to a Postgres Database

In this exercise, you'll finally get to build a data pipeline from end-to-end. This pipeline will extract school testing scores from a JSON file and transform the data to drop rows with missing scores. In addition to this, each school will be ranked by the city it is located in, based on its total scores. Finally, the transformed dataset will be stored in a Postgres database.

To give you a head start, the extract() and transform() functions have been built and used as shown below. In addition to this, pandas has been imported as pd. Best of luck!

# Extract and clean the testing scores.
raw_testing_scores = extract("testing_scores.json")
cleaned_testing_scores = transform(raw_testing_scores)

"""

# Instructions

"""

    Update the load() function to write the clean_data DataFrame to the scores_by_city table in the schools database.
    If data exists in the scores_by_city table, make sure to replace it with the updated data.
---

    Load the data from the cleaned_testing_scores, using the db_engine that has already been defined.
    Use pandas to read data from the scores_by_city table, and print the first few rows of the DataFrame to validate that data was persisted.


"""

# solution

def load(clean_data, con_engine):
    """Write ``clean_data`` to the ``scores_by_city`` table via ``con_engine``.

    An existing table is replaced, and the DataFrame index is written as a
    column named ``school_id``.
    """
    write_options = {
        "if_exists": "replace",  # overwrite whatever the table held before
        "index": True,
        "index_label": "school_id",
    }
    clean_data.to_sql("scores_by_city", con_engine, **write_options)


#----------------------------------#

def load(clean_data, con_engine):
    """Persist ``clean_data`` in the ``scores_by_city`` table.

    The write goes through ``con_engine``; any existing table is replaced and
    the index is stored under the ``school_id`` label.
    """
    clean_data.to_sql(
        "scores_by_city",
        con_engine,
        index_label="school_id",
        index=True,
        if_exists="replace",
    )
    
# Persist the cleaned scores through load()
load(clean_data=cleaned_testing_scores, con_engine=db_engine)

# Read the scores_by_city table back and preview it to confirm the write
validation_query = "SELECT * FROM scores_by_city"
to_validate = pd.read_sql(validation_query, con=db_engine)
print(to_validate.head())


#----------------------------------#

# Conclusion

"""
Take a second to enjoy this! You just built a data pipeline that extracts data from a JSON file, transforms it, and stores it in a Postgres database for easy downstream access. Congrats!
"""