## The DataFrame

In [104]:
import pandas as pd

In [105]:
df = pd.read_json('data/nobel_winners_dirty.json')

In [106]:
df_orig = df.copy()

## Indices

In [107]:
df.columns

Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')

In [108]:
df.index

RangeIndex(start=0, stop=1052, step=1)

In [109]:
df = df.set_index('name')

In [110]:
print(df.loc['Albert Einstein'])

                born_in category      country date_of_birth date_of_death  \
name                                                                        
Albert Einstein          Physics  Switzerland    1879-03-14    1955-04-18   
Albert Einstein          Physics      Germany    1879-03-14    1955-04-18   

                gender                                          link  \
name                                                                   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   

                                           place_of_birth  \
name                                                        
Albert Einstein  Ulm ,  Baden-Württemberg , German Empire   
Albert Einstein  Ulm ,  Baden-Württemberg , German Empire   

                               place_of_death  \
name                                            
Albert Einstein  Princeton, New Jersey , U.S.   
Albert Ein

In [111]:
# return the index to original integer-based state
df = df.reset_index()

## Rows and Columns

In [112]:
df.iloc[2]

name                                              Vladimir Prelog *
born_in                                      Bosnia and Herzegovina
category                                                  Chemistry
country                                                            
date_of_birth                                         July 23, 1906
date_of_death                                            1998-01-07
gender                                                         male
link                   http://en.wikipedia.org/wiki/Vladimir_Prelog
place_of_birth    Sarajevo ,  Bosnia and Herzegovina , then part...
place_of_death                                Zürich ,  Switzerland
text              Vladimir Prelog *,  born in then  Austria–Hung...
year                                                           1975
Name: 2, dtype: object

In [113]:
gender_col = df.gender # or df['gender']
type(gender_col)

pandas.core.series.Series

In [114]:
gender_col.head()

0    male
1    male
2    male
3    None
4    male
Name: gender, dtype: object

## Selecting Groups

In [115]:
cat_groups = df.groupby('category')
cat_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc069f0df60>

In [116]:
cat_groups.groups.keys()

dict_keys(['', 'Chemistry', 'Economics', 'Literature', 'Peace', 'Physics', 'Physiology or Medicine'])

In [117]:
phy_group = cat_groups.get_group('Physics')
print(phy_group.head())

                 name born_in category  country    date_of_birth  \
13   François Englert          Physics  Belgium  6 November 1932   
19         Niels Bohr          Physics  Denmark   7 October 1885   
23  Ben Roy Mottelson          Physics  Denmark     July 9, 1926   
24          Aage Bohr          Physics  Denmark     19 June 1922   
47     Alfred Kastler          Physics   France       3 May 1902   

       date_of_death gender  \
13                     male   
19  18 November 1962   male   
23                     male   
24  8 September 2009   male   
47    7 January 1984   male   

                                                 link  \
13  http://en.wikipedia.org/wiki/Fran%C3%A7ois_Eng...   
19            http://en.wikipedia.org/wiki/Niels_Bohr   
23     http://en.wikipedia.org/wiki/Ben_Roy_Mottelson   
24             http://en.wikipedia.org/wiki/Aage_Bohr   
47        http://en.wikipedia.org/wiki/Alfred_Kastler   

                      place_of_birth         place_of_death  

In [118]:
df.category == 'Physics'

0       False
1       False
2       False
3       False
4       False
        ...  
1047     True
1048    False
1049    False
1050    False
1051    False
Name: category, Length: 1052, dtype: bool

In [119]:
print(df[df.category == 'Physics'])

                          name    born_in category    country  \
13            François Englert             Physics    Belgium   
19                  Niels Bohr             Physics    Denmark   
23           Ben Roy Mottelson             Physics    Denmark   
24                   Aage Bohr             Physics    Denmark   
47              Alfred Kastler             Physics     France   
...                        ...        ...      ...        ...   
1022       Victor Francis Hess             Physics    Austria   
1025            Wolfgang Pauli             Physics    Austria   
1035  William Lawrence Bragg *  Australia  Physics              
1039  Aleksandr M. Prokhorov *  Australia  Physics              
1047          Brian P. Schmidt             Physics  Australia   

          date_of_birth     date_of_death gender  \
13      6 November 1932                     male   
19       7 October 1885  18 November 1962   male   
23         July 9, 1926                     male   
24         

## Creating and Saving DataFrames

In [120]:
df = pd.DataFrame({
'name': ['Albert Einstein', 'Marie Curie',\
'William Faulkner'],
'category': ['Physics', 'Chemistry', 'Literature']
})
df

Unnamed: 0,name,category
0,Albert Einstein,Physics
1,Marie Curie,Chemistry
2,William Faulkner,Literature


In [121]:
df = pd.DataFrame.from_dict([
{'name': 'Albert Einstein', 'category':'Physics'},
{'name': 'Marie Curie', 'category':'Chemistry'},
{'name': 'William Faulkner', 'category':'Literature'}
])
df

Unnamed: 0,name,category
0,Albert Einstein,Physics
1,Marie Curie,Chemistry
2,William Faulkner,Literature


In [122]:
df.head()

Unnamed: 0,name,category
0,Albert Einstein,Physics
1,Marie Curie,Chemistry
2,William Faulkner,Literature


## CSV

In [123]:
from io import StringIO

data = "`Albert Einstein`| Physics \n`Marie Curie`| Chemistry"
df = pd.read_csv(StringIO(data), sep='|', names=['name', 'category'], skipinitialspace=True, quotechar="`")
df

Unnamed: 0,name,category
0,Albert Einstein,Physics
1,Marie Curie,Chemistry


## Excel Files

In [124]:
!pip install openpyxl



In [125]:
dfs = {}
xls = pd.ExcelFile('data/nobel_winners.xlsx') # load Excel file
dfs['WinnersSheet1'] = xls.parse('WinnersSheet1', na_values=['NA'])
dfs['WinnersSheet2'] = xls.parse('WinnersSheet2',index_col=1, na_values=['-'], skiprows=3)

In [1]:
dfs['WinnersSheet1'].head()

NameError: name 'dfs' is not defined

In [127]:
dfs = pd.read_excel('data/nobel_winners.xlsx', ['WinnersSheet1','WinnersSheet2'],
index_col=None, na_values=['NA'])

In [128]:
print(dfs['WinnersSheet2'].head())

     category             nationality  year                name  gender
0       Peace                American  1906  Theodore Roosevelt    male
1  Literature           South African  1991     Nadine Gordimer  female
2   Chemistry  Bosnia and Herzegovina  1975     Vladamir Prelog    male


In [129]:
# Load the first excel sheet into a DataFrame
df = pd.read_excel('data/nobel_winners.xlsx')
df

Unnamed: 0,category,nationality,year,name,gender
0,Peace,American,1906,Theodore Roosevelt,male
1,Literature,South African,1991,Nadine Gordimer,female
2,Chemistry,Bosnia and Herzegovina,1975,Vladamir Prelog,male


In [130]:
df1 = dfs['WinnersSheet1']
df2 = dfs['WinnersSheet2']

In [131]:
df1.to_excel('data/nobel_winners_sheet1.xlsx', sheet_name='WinnersSheet1')

In [132]:
with pd.ExcelWriter('data/nobel_winners_all_sheets.xlsx') as writer:
    df1.to_excel(writer, sheet_name='WinnersSheet1')
    df2.to_excel(writer, sheet_name='WinnersSheet2')

## SQL

In [133]:
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite:///data/nobel_winners.db')
df = pd.read_sql('winners', engine)
print(df)

   id   category             name nationality  year  gender
0   1    Physics  Albert Einstein       Swiss  1921    male
1   2    Physics       Paul Dirac     British  1933    male
2   3  Chemistry      Marie Curie      Polish  1911  female


In [134]:
# We can also use an SQL query to retrieve the table
import sqlite3

conn = sqlite3.connect('data/nobel_winners.db')
winners_string = '''SELECT * 
                    FROM winners'''
df_winners = pd.read_sql(winners_string,
                       conn,)
                      # index_col='index')
conn.close()
df_winners.head()

Unnamed: 0,id,category,name,nationality,year,gender
0,1,Physics,Albert Einstein,Swiss,1921,male
1,2,Physics,Paul Dirac,British,1933,male
2,3,Chemistry,Marie Curie,Polish,1911,female


In [135]:
# add a copy of the winners' table to our sqlite database
df_winners.to_sql('winners_copy', engine, if_exists='replace')

3

## MongoDB

In [136]:
import pandas as pd
from pymongo import MongoClient

client = MongoClient() 

db = client.nobel_prize 
db.drop_collection('winners')
cursor = db.winners.find()
df = pd.DataFrame(list(cursor)) 
df

In [137]:
from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost',\
                       port=27017, username=None, password=None):
    """ Get named database from MongoDB with/out authentication """
    # make Mongo connection with/out authentication
    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s/%s'%\
        (username, password, host, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db_name]

In [138]:
def mongo_to_dataframe(db_name, collection, query={},\
                       host='localhost', port=27017,\
                       username=None, password=None,\
                        no_id=True):
    """ create a dataframe from mongodb collection """

    db = get_mongo_database(db_name, host, port, username,\
     password)
    cursor = db[collection].find(query)
    df =  pd.DataFrame(list(cursor))
    if no_id: 
        del df['_id']
    return df

def dataframe_to_mongo(df, db_name, collection,\
                       host='localhost', port=27017,\
                       username=None, password=None):
    """ save a dataframe to mongodb collection """
    db = get_mongo_database(db_name, host, port, username,\
     password)
    records = df.to_dict('records')
    db[collection].insert_many(records)

In [139]:
dataframe_to_mongo(df_winners, 'nobel_prize', 'winners')

In [140]:
db = get_mongo_database('nobel_prize')
list(db.winners.find())

[{'_id': ObjectId('62fcf86f0e7fe50ac4393920'),
  'id': 1,
  'category': 'Physics',
  'name': 'Albert Einstein',
  'nationality': 'Swiss',
  'year': 1921,
  'gender': 'male'},
 {'_id': ObjectId('62fcf86f0e7fe50ac4393921'),
  'id': 2,
  'category': 'Physics',
  'name': 'Paul Dirac',
  'nationality': 'British',
  'year': 1933,
  'gender': 'male'},
 {'_id': ObjectId('62fcf86f0e7fe50ac4393922'),
  'id': 3,
  'category': 'Chemistry',
  'name': 'Marie Curie',
  'nationality': 'Polish',
  'year': 1911,
  'gender': 'female'}]

## Series into DataFrames

In [141]:
s = pd.Series([1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [142]:
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s

a    1
b    2
c    3
d    4
dtype: int64

In [143]:
s = pd.Series({'a':1, 'b':2, 'c':3})
s

a    1
b    2
c    3
dtype: int64

In [144]:
s = pd.Series({'a':1, 'b':2}, index=['a', 'b', 'c'])
s

a    1.0
b    2.0
c    NaN
dtype: float64

In [145]:
s = pd.Series({'a':1, 'b':2, 'c':3}, index=['a', 'b'])
s

a    1
b    2
dtype: int64

In [146]:
pd.Series(9, {'a', 'b', 'c'})

a    9
c    9
b    9
dtype: int64

In [147]:
import numpy as np

s = pd.Series([1, 2, 3, 4], ['a', 'b', 'c', 'd'])
np.sqrt(s)

a    1.000000
b    1.414214
c    1.732051
d    2.000000
dtype: float64

In [148]:
s[1:3]

b    2
c    3
dtype: int64

In [149]:
pd.Series([1, 2.1, 'foo']) + pd.Series([2, 3, 'bar'])

0         3
1       5.1
2    foobar
dtype: object

In [150]:
names = pd.Series(['Albert Einstein', 'Marie Curie'], name='name')
categories = pd.Series(['Physics', 'Chemistry'], name='category')
df = pd.concat([names, categories], axis=1)
df.head()

Unnamed: 0,name,category
0,Albert Einstein,Physics
1,Marie Curie,Chemistry
