# 25 Pandas Tricks

In [6]:
import pandas as pd
import numpy as np

<h4>1 - Show pandas version</h4>

In [2]:
pd.__version__

'0.22.0'

In [3]:
# Print the versions of Python, pandas and pandas' dependencies
pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.6.4.final.0
python-bits: 64
OS: Windows
OS-release: 8.1
machine: AMD64
processor: Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None

pandas: 0.22.0
pytest: 3.3.2
pip: 19.1
setuptools: 38.4.0
Cython: 0.27.3
numpy: 1.16.3
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.2.1
sphinx: 1.6.6
patsy: 0.5.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.4
feather: None
matplotlib: 2.1.2
openpyxl: 2.4.10
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.2
lxml: 4.1.1
bs4: 4.6.0
html5lib: 0.9999999
sqlalchemy: 1.2.1
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: 0.6.0


<h4>2 - Create example Dataframe</h4>

In [4]:
# 1st approach: build the frame from a dict mapping column name -> values
df = pd.DataFrame(
    data={
        'col_1': [1, 2],
        'col_2': [3, 4],
    }
)

In [5]:
df

Unnamed: 0,col_1,col_2
0,1,3
1,2,4


In [10]:
# 2nd approach: wrap a 4x8 array of uniform random floats in [0, 1);
# with no column names given, the columns are numbered 0..7
df = pd.DataFrame(data=np.random.random_sample((4, 8)))

In [11]:
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.167255,0.309563,0.726674,0.027462,0.397916,0.011975,0.721583,0.554896
1,0.404984,0.976841,0.801875,0.387022,0.290129,0.453034,0.996532,0.539585
2,0.775633,0.268406,0.722867,0.183203,0.295927,0.571763,0.692545,0.514515
3,0.798204,0.191955,0.787879,0.626682,0.419772,0.900318,0.702987,0.605278


In [13]:
# 3rd approach: same random data, but with letter column names
# instead of the default numeric ones
df = pd.DataFrame(
    data=np.random.random_sample((4, 8)),
    columns=list('abcdefgh'),
)

In [14]:
df

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.776078,0.983917,0.079073,0.772274,0.87171,0.946796,0.164049,0.318552
1,0.619791,0.812619,0.372781,0.82859,0.656757,0.629156,0.645358,0.382333
2,0.984769,0.17237,0.592242,0.260396,0.025028,0.09749,0.30047,0.825898
3,0.28194,0.872758,0.35365,0.462969,0.766828,0.545482,0.49765,0.586664


<h4>3 - Rename columns</h4>

In [24]:
# Example frame whose column labels contain spaces
df = pd.DataFrame(
    data=np.random.random_sample((4, 2)),
    columns=['col 1', 'col 2'],
)

In [25]:
df

Unnamed: 0,col 1,col 2
0,0.145441,0.728195
1,0.775665,0.521138
2,0.104177,0.256863
3,0.168555,0.256248


In [26]:
# remove spaces from the column names
# 1st approach: rename() with an old -> new mapping applied to the column labels;
# the renamed frame is stored in df1
df1 = df.rename(columns={'col 1': 'col_1', 'col 2': 'col_2'})

In [27]:
df1

Unnamed: 0,col_1,col_2
0,0.145441,0.728195
1,0.775665,0.521138
2,0.104177,0.256863
3,0.168555,0.256248


In [30]:
# 2nd approach: replace every space in the column labels with an underscore
# via the string accessor; assigning back to df.columns modifies df itself
df.columns = df.columns.str.replace(' ', '_')

In [31]:
df

Unnamed: 0,col_1,col_2
0,0.145441,0.728195
1,0.775665,0.521138
2,0.104177,0.256863
3,0.168555,0.256248


In [34]:
# 3rd approach: overwrite the column labels directly.
# First rebuild a frame whose labels still contain spaces.
df = pd.DataFrame(
    np.random.rand(4, 2),
    columns=['col 1', 'col 2'],
)

In [35]:
df

Unnamed: 0,col 1,col 2
0,0.074903,0.265241
1,0.391024,0.164798
2,0.770211,0.219412
3,0.738951,0.097071


In [36]:
# Overwrite all column labels at once with a list of new names
df.columns = ['col_1', 'col_2']

In [37]:
df

Unnamed: 0,col_1,col_2
0,0.074903,0.265241
1,0.391024,0.164798
2,0.770211,0.219412
3,0.738951,0.097071


In [40]:
# Prepend 'X_' to every column label; the result is displayed, df is not reassigned
df.add_prefix('X_')

Unnamed: 0,X_col_1,X_col_2
0,0.074903,0.265241
1,0.391024,0.164798
2,0.770211,0.219412
3,0.738951,0.097071


In [41]:
# Append '_Y' to every column label
df.add_suffix('_Y')

Unnamed: 0,col_1_Y,col_2_Y
0,0.074903,0.265241
1,0.391024,0.164798
2,0.770211,0.219412
3,0.738951,0.097071


<h4>4 - Reverse Row Order</h4>

In [42]:
# Load the drinks-by-country CSV (fetched over the network from a short URL)
df = pd.read_csv('http://bit.ly/drinksbycountry')

In [43]:
df.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [49]:
# Reverse the row order: a [::-1] slice on .loc walks the rows from last to first,
# and each row keeps its original index label
df.loc[::-1].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia
189,Vietnam,111,2,1,2.0,Asia
188,Venezuela,333,100,3,7.7,South America


In [54]:
# Reverse the rows, then renumber them from 0; drop=True discards the old index
# rather than keeping it
df.loc[::-1, :].reset_index(drop=True)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia
3,Vietnam,111,2,1,2.0,Asia
4,Venezuela,333,100,3,7.7,South America
5,Vanuatu,21,18,11,0.9,Oceania
6,Uzbekistan,25,101,8,2.4,Asia
7,Uruguay,115,35,220,6.6,South America
8,USA,249,158,84,8.7,North America
9,Tanzania,36,6,1,5.7,Africa


<h4>5 - Reverse Column Order</h4>

In [57]:
# Reverse the column order: keep all rows (:) and step backwards through the columns
df.loc[:, ::-1].head()

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria
3,Europe,12.4,312,138,245,Andorra
4,Africa,5.9,45,57,217,Angola


<h4>6 - Select Columns by Data Type</h4>

In [58]:
# Load the drinks-by-country CSV again, this time under a descriptive name
drinks = pd.read_csv("http://bit.ly/drinksbycountry")

In [60]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [63]:
# Keep only the numeric columns (here the int64 and float64 ones)
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [65]:
# Keep only the object-dtype columns (the text columns here)
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [67]:
# Pass a list to include several dtypes at once
drinks.select_dtypes(include=['number', 'object']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [68]:
# Drop the numeric columns and keep everything else
drinks.select_dtypes(exclude='number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


<h4>7 - Convert Strings to Numeric</h4>

In [74]:
# Every value is stored as a string; 'col 3' also holds a non-numeric '-' placeholder
df = pd.DataFrame(
    {
        'col 1': ['1.2', '3.2'],
        'col 2': ['4.5', '5.4'],
        'col 3': ['5.4', '-'],
    }
)

In [75]:
df

Unnamed: 0,col 1,col 2,col 3
0,1.2,4.5,5.4
1,3.2,5.4,-


In [76]:
df.dtypes

col 1    object
col 2    object
col 3    object
dtype: object

In [85]:
# Convert selected columns by passing a {column: dtype} mapping; 'col 3' is
# left out of the mapping and stays object
df.astype({'col 1': 'float', 'col 2': 'float'}).dtypes

col 1    float64
col 2    float64
col 3     object
dtype: object

In [94]:
# 'col 3' contains '-', which cannot be parsed as a number, so it can't be
# converted to a numeric dtype directly. errors='coerce' turns unparseable
# values into NaN, which fillna(0) then replaces with 0.
pd.to_numeric(df['col 3'], errors='coerce').fillna(0)

0    5.4
1    0.0
Name: col 3, dtype: float64

In [102]:
# Apply the same conversion to every column at once: unparseable values become
# NaN and are then replaced with 0. This rebinds df to the converted frame.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [103]:
df.dtypes

col 1    float64
col 2    float64
col 3    float64
dtype: object

In [104]:
df

Unnamed: 0,col 1,col 2,col 3
0,1.2,4.5,5.4
1,3.2,5.4,0.0
