In [1]:
import pandas as pd
import numpy as np

In [2]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

obj3 = pd.Series(sdata)

In [3]:
'Ohio' in obj3.keys()

True

In [4]:
35000 in obj3.values

True

In [5]:
obj3.index

Index(['Ohio', 'Oregon', 'Texas', 'Utah'], dtype='object')

In [6]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

obj4 = pd.Series(sdata, index=states)

In [7]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

## Checking for Nan

np.nan in numpy

In [8]:
pd.isnull(obj4) # also pd.notnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [9]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [10]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

## Dataframe

In [11]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [12]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


## dtypes

In [13]:
frame.dtypes

pop      float64
state     object
year       int64
dtype: object

### Enforce dtypes when creating dataframes

In [14]:
df = pd.DataFrame(data={'a' : np.array([1,2,3], dtype=np.int32),
                        'b' : np.array([4,5,6], dtype=np.float64)
                       }
                 )

In [15]:
df.dtypes

a      int32
b    float64
dtype: object

### Change dtypes after creating dataframes

In [16]:
df['b'].astype(np.int32)

0    4
1    5
2    6
Name: b, dtype: int32

In [17]:
df.dtypes

a      int32
b    float64
dtype: object

### Change dtypes from string to numeric

In [18]:
data = pd.DataFrame(data = {'one':['1','2','3'], 'two':['1.2','1.3','1.2']})

In [19]:
cols = ['one','two']
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [20]:
data.dtypes

one    float64
two    float64
dtype: object

## Delete a column
del and drop

In [21]:
# drop make a copy

frame['add'] = frame.state == 'Ohio'

# not working : del frame.add
frame.drop(['add'], axis =1)

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [22]:
frame

Unnamed: 0,pop,state,year,add
0,1.5,Ohio,2000,True
1,1.7,Ohio,2001,True
2,3.6,Ohio,2002,True
3,2.4,Nevada,2001,False
4,2.9,Nevada,2002,False
5,3.2,Nevada,2003,False


In [23]:
# del deletes globally
# not working : del frame.add

del frame['add']

In [24]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


## rename a column

In [25]:
## rename all the columns
frame.columns = ['popnew', 'statenew', 'yearnew']

In [26]:
frame.columns

Index(['popnew', 'statenew', 'yearnew'], dtype='object')

In [27]:
## rename one column 

frame.rename(columns={'popnew':'log(gdp)'}, inplace=True)

In [28]:
frame

Unnamed: 0,log(gdp),statenew,yearnew
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003
