In [1]:
import pandas as pd

# Input and Output Intro (section 12.160):

- This section goes over how to import data into Jupyter Notebook and how to export it back out
- How do you write your df to a csv file, an excel file, etc?

# Pass a URL to the pd.read_csv Method (section 12.161):

- how to get pandas to download a dataset for us from the internet
- lets you access data that is frequently updated without having to download it every time it is updated

In [None]:
# copy the link address to the csv that you want to import
# usually want to delete everything after '.csv' when you paste the link
# original: https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv?accessType=DOWNLOAD
# after deleting after .csv: https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv
# ^ this is what you want to use
# the url we're using is from a real data set
# takes a little longer because we are getting the data from the internet

In [6]:
url = "https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv"
pd.read_csv(url)
# can use a variable or can just use the whole url as a string
# Example: pd.read_csv("https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv")
# store the data as a variable to save it
baby_names = pd.read_csv(url)
baby_names.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


# Quick Object Conversions (section 12.162):

- How to convert a pandas object such as a series or df to a vanilla python object such as a list, dictionary, or string

In [7]:
url = "https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv"
baby_names = pd.read_csv(url)
baby_names.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [9]:
# how to convert a series to a df, unrelated to current lesson: 
baby_names["Child's First Name"].to_frame

<bound method Series.to_frame of 0        GERALDINE
1              GIA
2           GIANNA
3          GISELLE
4            GRACE
           ...    
37532       Yousef
37533      Youssef
37534        Yusuf
37535      Zachary
37536          Zev
Name: Child's First Name, Length: 37537, dtype: object>

In [10]:
# convert a pandas series to a python list (one word, no underscore):
baby_names["Child's First Name"].tolist()

['GERALDINE',
 'GIA',
 'GIANNA',
 'GISELLE',
 'GRACE',
 'GUADALUPE',
 'HAILEY',
 'HALEY',
 'HANNAH',
 'HAYLEE',
 'HAYLEY',
 'HAZEL',
 'HEAVEN',
 'HEIDI',
 'HEIDY',
 'HELEN',
 'IMANI',
 'INGRID',
 'IRENE',
 'IRIS',
 'ISABEL',
 'ISABELA',
 'ISABELLA',
 'ISABELLE',
 'ISIS',
 'ITZEL',
 'IZABELLA',
 'JACQUELINE',
 'JADA',
 'JADE',
 'JAELYNN',
 'JAMIE',
 'JANELLE',
 'JASLENE',
 'JASMIN',
 'JASMINE',
 'JAYDA',
 'JAYLA',
 'JAYLAH',
 'JAYLEEN',
 'JAYLENE',
 'JAYLIN',
 'JAYLYN',
 'JAZLYN',
 'JAZMIN',
 'JAZMINE',
 'JENNIFER',
 'JESSICA',
 'JIMENA',
 'JOCELYN',
 'JOHANNA',
 'JOSELYN',
 'JULIA',
 'JULIANA',
 'JULIANNA',
 'JULIET',
 'JULIETTE',
 'JULISSA',
 'KAELYN',
 'KAILEY',
 'KAILYN',
 'KAITLYN',
 'KAMILA',
 'KAREN',
 'KARLA',
 'KATE',
 'KATELYN',
 'KATELYNN',
 'KATHERINE',
 'KATIE',
 'KAYLA',
 'KAYLEE',
 'KAYLEEN',
 'KAYLEIGH',
 'KAYLIE',
 'KAYLIN',
 'KEILY',
 'KELLY',
 'KEYLA',
 'KHLOE',
 'KIARA',
 'KIMBERLY',
 'KRYSTAL',
 'KYLEE',
 'KYLIE',
 'LAILA',
 'LAURA',
 'LAUREN',
 'LAYLA',
 'LEA',
 'L

In [11]:
# how to convert a pandas series to a python dict (has an underscore): 
baby_names["Child's First Name"].to_dict()
# takes the index position as the key, column value becomes the dict value
# Python dictionaries prohibit duplicate keys(can't have duplicate index values for this example),
# takes the last one that it finds and keeps that key value pair
# make sure your each value in the index is unique before converting to a dictionary

{0: 'GERALDINE',
 1: 'GIA',
 2: 'GIANNA',
 3: 'GISELLE',
 4: 'GRACE',
 5: 'GUADALUPE',
 6: 'HAILEY',
 7: 'HALEY',
 8: 'HANNAH',
 9: 'HAYLEE',
 10: 'HAYLEY',
 11: 'HAZEL',
 12: 'HEAVEN',
 13: 'HEIDI',
 14: 'HEIDY',
 15: 'HELEN',
 16: 'IMANI',
 17: 'INGRID',
 18: 'IRENE',
 19: 'IRIS',
 20: 'ISABEL',
 21: 'ISABELA',
 22: 'ISABELLA',
 23: 'ISABELLE',
 24: 'ISIS',
 25: 'ITZEL',
 26: 'IZABELLA',
 27: 'JACQUELINE',
 28: 'JADA',
 29: 'JADE',
 30: 'JAELYNN',
 31: 'JAMIE',
 32: 'JANELLE',
 33: 'JASLENE',
 34: 'JASMIN',
 35: 'JASMINE',
 36: 'JAYDA',
 37: 'JAYLA',
 38: 'JAYLAH',
 39: 'JAYLEEN',
 40: 'JAYLENE',
 41: 'JAYLIN',
 42: 'JAYLYN',
 43: 'JAZLYN',
 44: 'JAZMIN',
 45: 'JAZMINE',
 46: 'JENNIFER',
 47: 'JESSICA',
 48: 'JIMENA',
 49: 'JOCELYN',
 50: 'JOHANNA',
 51: 'JOSELYN',
 52: 'JULIA',
 53: 'JULIANA',
 54: 'JULIANNA',
 55: 'JULIET',
 56: 'JULIETTE',
 57: 'JULISSA',
 58: 'KAELYN',
 59: 'KAILEY',
 60: 'KAILYN',
 61: 'KAITLYN',
 62: 'KAMILA',
 63: 'KAREN',
 64: 'KARLA',
 65: 'KATE',
 66: 'KATE

In [26]:
# how to convert a series to a string, sort it alphabetically, remove duplicates, and put in title case:

", ".join(baby_names["Child's First Name"].str.title().drop_duplicates().sort_values())


"Aahil, Aaliyah, Aarav, Aaron, Aarya, Aaryan, Aayan, Abby, Abdiel, Abdoul, Abdoulaye, Abdul, Abdullah, Abe, Abel, Abigail, Aboubacar, Abraham, Abrar, Abrielle, Abril, Ace, Ada, Adalynn, Adam, Adan, Addison, Adelaide, Adele, Adelina, Adeline, Adelyn, Aden, Adiel, Adina, Aditya, Adonis, Adrian, Adriana, Adrianna, Adriel, Adyan, Aharon, Ahmad, Ahmed, Ahnaf, Ahron, Ahuva, Aicha, Aidan, Aiden, Aileen, Aimee, Aisha, Aissata, Aissatou, Aitana, Aiza, Aizah, Akiva, Alaia, Alaina, Alan, Alana, Alani, Alanis, Alanna, Alayna, Alba, Albert, Alberto, Aldo, Aleah, Alec, Aleena, Alejandra, Alejandro, Aleksander, Aleksandra, Alessandra, Alessandro, Alessia, Alex, Alexa, Alexander, Alexandra, Alexandria, Alexia, Alexis, Alfred, Alfredo, Ali, Alia, Aliah, Alice, Alicia, Alijah, Alina, Alisa, Alisha, Alison, Alissa, Alisson, Aliyah, Aliza, Allan, Allen, Allison, Allyson, Alma, Alondra, Alonso, Alpha, Alston, Alter, Alvin, Alyson, Alyssa, Amadou, Amaia, Amalia, Amanda, Amani, Amar'E, Amara, Amare, Amari, A

In [19]:
# join method example:
list = ["a", "b", "c"]
"!".join(list)

'a!b!c'

# Export a CSV File with the to_csv method (section 12.163):


- Reverse of importing
- When pandas encounters an encoding error, that means that pandas is encountering a character somewhere in our rows that it doesn't know how to write

In [27]:
url = "https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv"
baby_names = pd.read_csv(url)
baby_names.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [30]:
# need to provide to_csv a string representing the file name
# need the .csv extention in the string
# good practice to avoid spaces in file names

baby_names.to_csv("NYC_Baby_Names.csv")
# pandas automatically writes the df contents to a csv file, its now in my pandas folder

# how to not include the index: 
baby_names.to_csv("NYC_Baby_Names.csv", index = False)
# pandas will overwrite if a file with the same name already exists
# make sure the file name isn't something you don't want to replace

# How to customize the columns we include in the export:
baby_names.to_csv("NYC_Baby_Names.csv", columns = ["Gender", "Ethnicity", "Child's First Name"])


In [None]:
# how to deal with encoding errors:

baby_names.to_csv("NYC_Baby_Names.csv", encoding = "utf-8")

# utf-8 is a popular character set in coding scheme that supports common characters in english/western launguages
# if 'encoding = utf-8' doesn't work, try searching google/stack overflow
# ^ should be as simple as swapping 'utf-8' with the proper encoding

##  ** Install xlrd and openpyxl to Read and Write Excel Files **

- installed in our anaconda environment in this course, may still need to install in other ides/ on other computers

# Import Excel Files into pandas with the read_excel Method (section 12.165):

- can import excel files with one or more worksheets

In [2]:
# First arguement should be a string representing the file name, name must match exactly
# Have to provide the file extention (excel extention is .xlss)

df = pd.read_excel("Data - Single Worksheet.xlsx")

# For excel files with only one worksheet, pandas brings it in as a pandas df
# now we can do everything we can normally do to a pandas df to it
# can use most of the same parameters as read.csv such as index_col, squeeze, etc when importing

Unnamed: 0,First Name,Last Name,City,Gender
0,Brandon,James,Miami,M
1,Sean,Hawkins,Denver,M
2,Judy,Day,Los Angeles,F
3,Ashley,Ruiz,San Francisco,F
4,Stephanie,Gomez,Portland,F


In [21]:
# Importing an excel workbook with multiple worksheets:

pd.read_excel("Data - Multiple Worksheets.xlsx")
# without addtional arguements, pandas will only import the first worksheet from the excel file

# use the 'sheet_name' parameter to change which worksheet is imported, default arguement is 0
# can provide 'sheet_name' with the index position or the sheet's name
pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = 1)
pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = "Data 2")

# to import multiple worksheets, provide a list to the 'sheet_name' parameter:

pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = [0, 1])
data = pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = ["Data 1", "Data 2"])
# stores the sheets in a python dictionary, not a df
type(data)

dict

In [28]:
# the sheets index positions are the dictionary keys, the dictionary values are the respective dataframes
# to extract the df, index to the key you want in the dictionary and pull it out
# example:

data = pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = [0,1])
# ^ better to use sheet names instead of index positions bc then the keys to call the sheets are the names too
data[0]
data[1]
# data["Data 1"] won't work bc the keys are the index positions due to the way we called them

data = pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = ["Data 1", "Data 2"])
data["Data 1"]
data["Data 2"]
# since we called the sheet_names with strings, we can use the names to call the individual sheets

Unnamed: 0,First Name,Last Name,City,Gender
0,Parker,Power,Raleigh,F
1,Preston,Prescott,Philadelphia,F
2,Ronaldo,Donaldo,Bangor,M
3,Megan,Stiller,San Francisco,M
4,Bustin,Jieber,Austin,F


In [None]:
# How to automatically import every worksheet from an excel file:
 
data = pd.read_excel("Data - Multiple Worksheets.xlsx", sheet_name = None)
# putting sheet name as 'none' imports all worksheets as a dict, keys are the worksheet names, values are the dfs

# Export Excel Files with the to_excel method (section 12.166):

- Taking one or more pandas objects and writing them to an excel .xlsx file

In [30]:
url = "https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv"
baby_names = pd.read_csv(url)
baby_names.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [36]:
girls = baby_names[baby_names["Gender"] == "FEMALE"]
boys = baby_names[baby_names["Gender"] == "MALE"]

In [35]:
girls.head(3)

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42


In [34]:
boys.head(3)

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
545,2011,MALE,ASIAN AND PACIFIC ISLANDER,AARAV,15,51
546,2011,MALE,ASIAN AND PACIFIC ISLANDER,AARON,51,19
547,2011,MALE,ASIAN AND PACIFIC ISLANDER,ABDUL,20,46


In [39]:
# more complicated than exporting to csv because we have to configure a lot of things such as:
# what the worksheets are going to be named, what order, etc

excel_file = pd.ExcelWriter("Baby_Names.xlsx")
# ^ this is an excel writer object, which is the excel workbook that we want to write to

girls.to_excel(excel_file, sheet_name = "Girls", index = False, )

# some parameters for to_excel: excel writer, sheet name
# set index to false to exclude the index, set to true to include it

boys.to_excel(excel_file, sheet_name = "Boys", index = False, columns = ["Year of Birth", "Gender", "Ethnicity"])
# ^ for this example we only want a few specific columns, all columns included by default

# order matters
# need to call .save() to the excel file to actually write it

excel_file.save()
#^ after this line, our excel file is finally created
# Jupyter Notebook can't read excel files, but can open it from the folder on the desktop :)