In [2]:
# FIX COLUMN HEADERS
# So far, we've imported CSV data with headers. However, headers won't always be present.
# Here's an example of a file that we will need to fix:

# Import dependencies:
import pandas as pd
from pathlib import Path

# Load the CSV with the default `read_csv` settings into a DataFrame named `sales_data_df`
sales_data_df = pd.read_csv(
    Path("sales_no_header.csv")
)
# Preview the first 5 rows to check how the DataFrame was built
sales_data_df.head()

# Overall, the content of the DataFrame looks similar to the first one that we imported.
# The index exists, and it appears that the data includes the full name, email, zip, and sale price.
# However, notice that none of the columns has a proper header; each column begins with a data value instead.
# In this example, the original CSV file doesn't have a header in the first row.
# Pandas doesn't know this, so it assumes the first row of values is the header for each column in the DataFrame.
# We can fix this by adding the `header=None` parameter to the `read_csv` function.
# This parameter tells Pandas that the CSV file doesn't have a header, so it shouldn't automatically assign the first row of data as the header for the DataFrame.

Unnamed: 0,Elwanda White,alyre2036@live.com,9236,84.33
0,Lyndon Elliott,arrowy1873@outlook.com,1330,879.95
1,Daisey Sellers,toucan2024@outlook.com,7631,907.58
2,Issac Reeves,asarin1958@gmail.com,81168,545.88
3,Bradford Kinney,mibound1801@yandex.com,41721,517.49
4,Fermina Cobb,kingfisher2013@live.com,16625,889.95


In [9]:
# To fix the import, pass `header=None` so that Pandas does not treat the first row of data as the header:
sales_df = pd.read_csv(Path("sales_no_header.csv"), header=None)
# Only the last expression of a cell is rendered automatically, so use `display`
# to make this intermediate preview visible as well.
display(sales_df.head())

# As you can see when you run this cell, the data no longer has a header.
# Pandas has replaced the header row with column index positions.
# We will not stop there: headers are vital, so we need to add column names to organize our data.
# To do this, assign a new list of column names to the `columns` attribute of the DataFrame:
columns = ["Full Name", "Email", "Zip", "Sale Price"]
sales_df.columns = columns
sales_df.head()

# This code creates a new header according to the list that the code defines.
# The `sales_df.columns` attribute holds the column labels; Pandas assigns the strings from our list to the columns in order, based on index position.

Unnamed: 0,Full Name,Email,Zip,Sale Price
0,Elwanda White,alyre2036@live.com,9236,84.33
1,Lyndon Elliott,arrowy1873@outlook.com,1330,879.95
2,Daisey Sellers,toucan2024@outlook.com,7631,907.58
3,Issac Reeves,asarin1958@gmail.com,81168,545.88
4,Bradford Kinney,mibound1801@yandex.com,41721,517.49


In [13]:
# RENAME COLUMNS
# Let's say that Pandas correctly imported our data with a proper header row, but we want to change a few of the column names to more accurately describe the data.
# If we need to replace only one or two column names, we use a Python dictionary to map the old column name to the new one:
# General shape of the mapping (illustration only; this bare literal is not assigned to anything):
{"Old Column Name": "New Column Name"}

# In the mapping, the key (left of the colon) is the original column name and the value (right) is its replacement.
# For example, to change the column name Email to Email Account:
sales_df = sales_df.rename(columns={"Email": "Email Account"})
sales_df.head()

Unnamed: 0,Full Name,Email Account,Zip,Sale Price
0,Elwanda White,alyre2036@live.com,9236,84.33
1,Lyndon Elliott,arrowy1873@outlook.com,1330,879.95
2,Daisey Sellers,toucan2024@outlook.com,7631,907.58
3,Issac Reeves,asarin1958@gmail.com,81168,545.88
4,Bradford Kinney,mibound1801@yandex.com,41721,517.49


In [15]:
# USE THE SET_INDEX FUNCTION
# By default, Pandas creates an index for the rows in a DataFrame using numbers.
# While a numeric index can prove useful, we sometimes want to use labels for the rows instead.
# For example, setting the row index to the time or date often proves useful in finance.
# We'll explore how to analyze data by using row labels later; for now, we'll set the index to the date.
# One technique for resetting the row index is to use a Pandas function called `set_index`.
# With this function, we can choose any column from the CSV file to use for the row index labels.
# The following code sets the Date column, which contains years, as the row index:
# `sales_df.set_index("Date")`

In [21]:
# AUTOMATICALLY SET THE INDEX
# Another way to set the date or time as the index is to have Pandas automatically do it when it reads the CSV file.
# To do this, we just add a few parameters to the `read_csv` function:
sales_dataframe = pd.read_csv(
    Path("sales.csv"),
    index_col="Date",
    parse_dates=True,
)
sales_dataframe.head()

# Let's break down this `read_csv` call:
    # The `index_col` parameter tells Pandas which column to use for the index labels.
    # The `parse_dates=True` parameter tells Pandas to try to parse the values in the index column as dates, producing a `DatetimeIndex`.
    # Note: older examples also pass `infer_datetime_format=True`, which asked Pandas to guess the date/time format.
        # That argument is deprecated since pandas 2.0, where a strict version of this inference is the default,
        # so it is intentionally omitted here.
# Using these parameters with the `read_csv` function creates a special type of index called a `DatetimeIndex`.
# A `DatetimeIndex` uses dates and times instead of numbers or labels.
# This is used often to analyze financial events over a time period.

Unnamed: 0_level_0,Net Sales (mill),Net Income (mill),EPS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-09-30,229234,48351,$9.27
2018-09-29,265595,59531,$12.01
2019-09-28,260174,55256,$11.97
