# Dictionaries

## A python data structure taking the form of a key:value pair

In [6]:
dict = {"Key":"value"}

In [7]:
dict

{'Key': 'value'}

In [8]:
dict.keys

<function dict.keys>

In [9]:
dict.keys()

dict_keys(['Key'])

In [10]:
dict.values()

dict_values(['value'])

### Access a particular dictionary's key through this syntax

In [11]:
dict["Key"] = "Hello"
dict

{'Key': 'Hello'}

In [12]:
dict["Key"]

'Hello'

# Pandas dataframes

### 2D, mutable, potentially heterogeneous data structure as part of pandas a package built on top of numpy for more efficienct manipulation compared to using dictionary datastructure. Think of the structure as pivot table, tabular data, or SQL table.

##### There are plenty of built-in methods on dataframe

In [3]:
import pandas as pd

In [4]:
pd.__dict__

{'__name__': 'pandas',
 '__package__': 'pandas',
 '__loader__': <_frozen_importlib_external.SourceFileLoader at 0x2798184cb80>,
 '__spec__': ModuleSpec(name='pandas', loader=<_frozen_importlib_external.SourceFileLoader object at 0x000002798184CB80>, origin='C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\__init__.py', submodule_search_locations=['C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas']),
 '__path__': ['C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas'],
 '__file__': 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\__init__.py',
 '__cached__': 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\__pycache__\\__init__.cpython-39.pyc',
 '__builtins__': {'__name__': 'builtins',
  '__doc__': "Built-in functions, exceptions, and other objects.\n\nNoteworthy: None is the `nil' object; Ellipsis represents `...' in slices.",
  '__package__': '',
  '__loader__': _frozen_importlib.BuiltinImporter,
  '__spec__': ModuleSpec(name='builtins', loader=<class '

### Nested dictionary conversion to dataframe.
##### Simply pass the dataframe function the dictionary and assign it to a variable.

In [5]:
stateInfo = {'Colorado': 
            {'Population': 5812000,
            'Capital': 'Denver',
            'Founded': 'Feb. 28, 1861'},
             
            'North Dakota':
            {'Population':774948,
            'Capital':'Bismarck',
            'Founded': 'Nov. 2, 1889'}}

# Call the value residing in the 'Colorado > Capital' key value
stateInfo['Colorado']['Capital']

'Denver'

In [6]:
 # tpass Stateinfo to DataFrame method, transpose it by .T, and assign to variable
stateInfo = pd.DataFrame(stateInfo).T

In [7]:
stateInfo

Unnamed: 0,Population,Capital,Founded
Colorado,5812000,Denver,"Feb. 28, 1861"
North Dakota,774948,Bismarck,"Nov. 2, 1889"


In [8]:
# It is pretty obvious what this syntax does. A method called on the dataframe and passed the filename you want
stateInfo.to_excel("Chapter_3_in_class.xlsx")

### Creating a nested dictionary containing products using a for loop and converting to dataframe

In [9]:
min = 0
max = 100

productTable = {}

for i in range(min, max +1): # for each row;
    productTable[i] = {} # create a dictionary at that location in mem
    for j in range(min, max +1): #for each column
        productTable[i][j] = i * j # assign product to current loop iteration

productTable = pd.DataFrame(productTable)
productTable

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
2,0,2,4,6,8,10,12,14,16,18,...,182,184,186,188,190,192,194,196,198,200
3,0,3,6,9,12,15,18,21,24,27,...,273,276,279,282,285,288,291,294,297,300
4,0,4,8,12,16,20,24,28,32,36,...,364,368,372,376,380,384,388,392,396,400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,96,192,288,384,480,576,672,768,864,...,8736,8832,8928,9024,9120,9216,9312,9408,9504,9600
97,0,97,194,291,388,485,582,679,776,873,...,8827,8924,9021,9118,9215,9312,9409,9506,9603,9700
98,0,98,196,294,392,490,588,686,784,882,...,8918,9016,9114,9212,9310,9408,9506,9604,9702,9800
99,0,99,198,297,396,495,594,693,792,891,...,9009,9108,9207,9306,9405,9504,9603,9702,9801,9900


##### Export dictionary as csv

In [10]:
facultyDict = {
    "Caton, James":
    {
        "Position":"Assistant Professor",
        "Email":"james.caton@ndsu.edu",
        "Phone":"701-231-7337"
    },
     "Englund, David":
    {
        "Position":"Senior Lecturer",
        "Email":"david.englund@ndsu.edu",
        "Phone":"701-231-9797"
    },
    "Dean, James":
    {
        "Position":"Assistant Professor",
        "Email":"james.dean.1@ndsu.edu",
        "Phone":"701-231-6641"
    }   
}

facultyDict

{'Caton, James': {'Position': 'Assistant Professor',
  'Email': 'james.caton@ndsu.edu',
  'Phone': '701-231-7337'},
 'Englund, David': {'Position': 'Senior Lecturer',
  'Email': 'david.englund@ndsu.edu',
  'Phone': '701-231-9797'},
 'Dean, James': {'Position': 'Assistant Professor',
  'Email': 'james.dean.1@ndsu.edu',
  'Phone': '701-231-6641'}}

In [11]:
pd.DataFrame(facultyDict).T

Unnamed: 0,Position,Email,Phone
"Caton, James",Assistant Professor,james.caton@ndsu.edu,701-231-7337
"Englund, David",Senior Lecturer,david.englund@ndsu.edu,701-231-9797
"Dean, James",Assistant Professor,james.dean.1@ndsu.edu,701-231-6641


### Import html page as dataframe

In [12]:
table = pd.read_html('https://www.ndsu.edu/agriculture/academics/faculty-and-staff', match='Bangsund, Dean')

In [13]:
df = table[0]
df.head()

Unnamed: 0,Name,Position,Email,Phone
0,"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu,Phone 701-231-7471
1,"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu,Phone 701-231-7379
2,"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu,Phone 701-231-8672
3,"Carney, Jennifer",Student Services Director,Jennifer.Carney@ndsu.edu,Phone 701-231-7442
4,"Caton, James",Assistant Professor,james.caton@ndsu.edu,Phone 701-231-7337


### Data Ingestion

In [14]:
tableClean = pd.read_csv("copyPasteFacultyList.csv")
tableClean

Unnamed: 0,Name,Position,Email,Phone
0,"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu,Phone
1,,,,701-231-7471
2,"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu,Phone
3,,,,701-231-7379
4,"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu,Phone
...,...,...,...,...
65,,,,701-231-7443
66,"Wachenheim, Cheryl",Professor,Cheryl.Wachenheim@ndsu.edu,Phone
67,,,,701-231-7452
68,"Wilson, William",Distinguished Professor,William.Wilson@ndsu.edu,Phone


### Data Cleaning

##### Use dropna function to remove null values. Save Phone column to separate table, del column from original table

In [15]:
cleanPhoneNums = tableClean[['Phone']]
del tableClean["Phone"]

In [16]:
tableClean.dropna()
tableClean.head()

Unnamed: 0,Name,Position,Email
0,"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu
1,,,
2,"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu
3,,,
4,"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu


### pd.dropna()

In [17]:
tableClean = tableClean.dropna()  # Drop null values
cleanPhoneNums.head(10)

Unnamed: 0,Phone
0,Phone
1,701-231-7471
2,Phone
3,701-231-7379
4,Phone
5,701-231-8672
6,Phone
7,701-231-7442
8,Phone
9,701-231-7337


In [18]:
tableClean.head()

Unnamed: 0,Name,Position,Email
0,"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu
2,"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu
4,"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu
6,"Carney, Jennifer",Student Services Director,Jennifer.Carney@ndsu.edu
8,"Caton, James",Assistant Professor,james.caton@ndsu.edu


### Use iloc to identify observations

In [19]:
# A phone number is in the second position, then every 3 after that

cleanPhoneNums.iloc[1::2] # Start at 1, count by three
# Same syntax as list values

Unnamed: 0,Phone
1,701-231-7471
3,701-231-7379
5,701-231-8672
7,701-231-7442
9,701-231-7337
11,701-231-9797
13,701-231-6641
15,701-231-7393
17,701-231-5747
19,701-231-8103


In [20]:
# Place these values in the original dataframe in a column named 'Phones'

In [21]:
tableClean['Phone Numbers'] = cleanPhoneNums["Phone"][1::2].values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tableClean['Phone Numbers'] = cleanPhoneNums["Phone"][1::2].values


In [22]:
tableClean.head()

Unnamed: 0,Name,Position,Email,Phone Numbers
0,"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu,701-231-7471
2,"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu,701-231-7379
4,"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu,701-231-8672
6,"Carney, Jennifer",Student Services Director,Jennifer.Carney@ndsu.edu,701-231-7442
8,"Caton, James",Assistant Professor,james.caton@ndsu.edu,701-231-7337


In [23]:
tableClean = tableClean.set_index("Name")
tableClean

Unnamed: 0_level_0,Position,Email,Phone Numbers
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu,701-231-7471
"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu,701-231-7379
"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu,701-231-8672
"Carney, Jennifer",Student Services Director,Jennifer.Carney@ndsu.edu,701-231-7442
"Caton, James",Assistant Professor,james.caton@ndsu.edu,701-231-7337
"Dean, James",Assistant Professor of Economics,james.dean.1@ndsu.edu,701-231-9797
"Englund, David",Senior Lecturer,david.englund@ndsu.edu,701-231-6641
"Haakenson, Paulann",Information Processing Specialist,paulann.haakenson@ndsu.edu,701-231-7393
"Hanson, Erik",Assistant Professor,erik.drevlow.hanson@ndsu.edu,701-231-5747
"Haugen, Ron",Farm Management Specialist,ronald.haugen@ndsu.edu,701-231-8103


### Export

In [24]:
tableClean.to_csv("cleanedListOfFaculty.csv")

In [27]:
readCSV = pd.read_csv("cleanedListOfFaculty.csv").set_index("Name")

In [29]:
readCSV.head()

Unnamed: 0_level_0,Position,Email,Phone Numbers
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Bangsund, Dean",Research Scientist,d.bangsund@ndsu.edu,701-231-7471
"Biermacher, Jon",Extension Livestock Development Specialist,jon.biermacher@ndsu.edu,701-231-7379
"Bullock, David",Research Assoc Professor,david.w.bullock@ndsu.edu,701-231-8672
"Carney, Jennifer",Student Services Director,Jennifer.Carney@ndsu.edu,701-231-7442
"Caton, James",Assistant Professor,james.caton@ndsu.edu,701-231-7337
