<h1 style="color:cadetblue; font-size:2em;">Preparing data</h1>

In [1]:
import pandas as pd

In [2]:
# Loading separate files: one explicit read_csv call per monthly file
dataframe0 = pd.read_csv('datasets/sales-jan-2015.csv')
dataframe1 = pd.read_csv('datasets/sales-feb-2015.csv')

In [4]:
# Using a loop: read each listed file in turn and collect the resulting frames
filenames = [
    'datasets/sales-jan-2015.csv',
    'datasets/sales-feb-2015.csv',
]
dataframes = list()
for f in filenames:
    dataframes.append(pd.read_csv(f))

In [5]:
# Using a comprehension: build the list of frames in a single expression
filenames = [
    'datasets/sales-jan-2015.csv',
    'datasets/sales-feb-2015.csv',
]
dataframes = [pd.read_csv(path) for path in filenames]

In [6]:
# Using glob: collect every sales CSV in the datasets/ directory by pattern.
# The pattern carries the same 'datasets/' prefix as the explicit paths used
# elsewhere in this notebook; a bare 'sales*.csv' would only search the
# working directory.
# glob() returns matches in arbitrary order, so sort for a reproducible list.
from glob import glob
filenames = sorted(glob('datasets/sales*.csv'))
dataframes = [pd.read_csv(f) for f in filenames]

<h1 style="color:cadetblue; font-size:2em;">Reindexing DataFrames</h1>

In [33]:
import pandas as pd
# Load the quarterly mean and max temperature tables, using the 'Month' column as the row index
w_mean = pd.read_csv('datasets/quarterly_mean_temp.csv', index_col='Month')
w_max = pd.read_csv('datasets/quarterly_max_temp.csv', index_col='Month')

In [34]:
# Show the mean-temperature table; its Month index comes out as Apr, Jan, Jul, Oct (not calendar order)
print(w_mean)

       Mean TemperatureF
Month                   
Apr            61.956044
Jan            32.133333
Jul            68.934783
Oct            43.434783


In [35]:
# Show the max-temperature table; its Month index is in calendar order (Jan, Apr, Jul, Oct)
print(w_max)

       Max TemperatureF
Month                  
Jan                  68
Apr                  89
Jul                  91
Oct                  84


In [36]:
# The DataFrame indexes: w_mean's row labels, in the order they are stored
print(w_mean.index)

Index(['Apr', 'Jan', 'Jul', 'Oct'], dtype='object', name='Month')


In [37]:
# w_max's row labels: the same four months as w_mean, but stored in a different order
print(w_max.index)

Index(['Jan', 'Apr', 'Jul', 'Oct'], dtype='object', name='Month')


In [38]:
# The index is its own pandas object (an Index), not a plain list
print(type(w_mean.index))

<class 'pandas.core.indexes.base.Index'>


In [39]:
# Using .reindex(): rearrange the rows of w_mean to follow an explicit label order
ordered = ['Jan', 'Apr', 'Jul', 'Oct']
w_mean2 = w_mean.reindex(index=ordered)
print(w_mean2)

       Mean TemperatureF
Month                   
Jan            32.133333
Apr            61.956044
Jul            68.934783
Oct            43.434783


In [40]:
# Using .sort_index(): orders the rows by label (alphabetically for these month strings).
# The sorted frame is only displayed; it is not assigned back to w_mean2.
w_mean2.sort_index()

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Apr,61.956044
Jan,32.133333
Jul,68.934783
Oct,43.434783


In [41]:
# Reindex from a DataFrame Index: reuse w_max's index so w_mean's rows follow the same order
w_mean.reindex(w_max.index)

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Apr,61.956044
Jul,68.934783
Oct,43.434783


In [42]:
# Reindexing with missing labels: 'Dec' is not a label of w_mean, so its row comes back as NaN
w_mean3 = w_mean.reindex(index=['Jan', 'Apr', 'Dec'])
print(w_mean3)

       Mean TemperatureF
Month                   
Jan            32.133333
Apr            61.956044
Dec                  NaN


In [43]:
# Reindex from a DataFrame Index that contains a label ('Dec') absent from w_max:
# the missing row becomes NaN, and the integer column is displayed as float
w_max.reindex(w_mean3.index)

Unnamed: 0_level_0,Max TemperatureF
Month,Unnamed: 1_level_1
Jan,68.0
Apr,89.0
Dec,


In [44]:
# Chain .dropna() to discard the NaN row ('Dec') that the reindex introduced
w_max.reindex(w_mean3.index).dropna()

Unnamed: 0_level_0,Max TemperatureF
Month,Unnamed: 1_level_1
Jan,68.0
Apr,89.0


In [45]:
# Order matters: reindexing w_max by w_mean's index yields w_mean's row order (Apr, Jan, Jul, Oct)
w_max.reindex(w_mean.index)

Unnamed: 0_level_0,Max TemperatureF
Month,Unnamed: 1_level_1
Apr,89
Jan,68
Jul,91
Oct,84


In [46]:
# Reindexing w_mean by w_max's index yields w_max's row order (Jan, Apr, Jul, Oct)
w_mean.reindex(w_max.index)

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Apr,61.956044
Jul,68.934783
Oct,43.434783


<h1 style="color:cadetblue; font-size:2em;">Arithmetic with Series & DataFrames</h1>

In [49]:
# Loading weather data: use the 'Date' column as the index and parse it as dates,
# so rows can be sliced with date strings
import pandas as pd
weather = pd.read_csv('datasets/pittsburgh2013.csv', index_col='Date', parse_dates=True)
# Precipitation for the first week of July 2013; zero-padded ISO dates match the
# slices used by the other cells in this section
weather.loc['2013-07-01':'2013-07-07', 'PrecipitationIn']

Date
2013-07-01    0.18
2013-07-02    0.14
2013-07-03    0.00
2013-07-04    0.25
2013-07-05    0.02
2013-07-06    0.06
2013-07-07    0.10
Name: PrecipitationIn, dtype: float64

In [50]:
# Scalar multiplication: the scalar is applied to every element of the Series
# (2.54 is presumably the inches-to-centimetres factor, given the 'PrecipitationIn' column name)
weather.loc['2013-07-01':'2013-07-07', 'PrecipitationIn'] * 2.54

Date
2013-07-01    0.4572
2013-07-02    0.3556
2013-07-03    0.0000
2013-07-04    0.6350
2013-07-05    0.0508
2013-07-06    0.1524
2013-07-07    0.2540
Name: PrecipitationIn, dtype: float64

In [51]:
# Absolute temperature range: daily min and max temperature columns
# for the first week of July 2013 (a two-column DataFrame)
week1_range = weather.loc['2013-07-01':'2013-07-07', ['Min TemperatureF', 'Max TemperatureF']]
print(week1_range)

            Min TemperatureF  Max TemperatureF
Date                                          
2013-07-01                66                79
2013-07-02                66                84
2013-07-03                71                86
2013-07-04                70                86
2013-07-05                69                86
2013-07-06                70                89
2013-07-07                70                77


In [52]:
# Average temperature: daily mean temperature for the first week of July 2013 (a Series)
week1_mean = weather.loc['2013-07-01':'2013-07-07', 'Mean TemperatureF']
print(week1_mean)

Date
2013-07-01    72
2013-07-02    74
2013-07-03    78
2013-07-04    77
2013-07-05    76
2013-07-06    78
2013-07-07    72
Name: Mean TemperatureF, dtype: int64


In [53]:
# Relative temperature range -- naive attempt.
# DataFrame / Series matches the Series' index (the dates) against the
# DataFrame's COLUMN labels; none of them match, so the result has the union
# of both label sets as columns and every value is NaN.
# Use .divide(..., axis='rows') to divide row by row instead.
week1_range / week1_mean

  return this.join(other, how=how, return_indexers=return_indexers)


Unnamed: 0_level_0,2013-07-01 00:00:00,2013-07-02 00:00:00,2013-07-03 00:00:00,2013-07-04 00:00:00,2013-07-05 00:00:00,2013-07-06 00:00:00,2013-07-07 00:00:00,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-07-01,,,,,,,,,
2013-07-02,,,,,,,,,
2013-07-03,,,,,,,,,
2013-07-04,,,,,,,,,
2013-07-05,,,,,,,,,
2013-07-06,,,,,,,,,
2013-07-07,,,,,,,,,


In [54]:
# Divide each row of week1_range by week1_mean's value for the same date (align on the row index)
week1_range.divide(week1_mean, axis='rows')

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,0.916667,1.097222
2013-07-02,0.891892,1.135135
2013-07-03,0.910256,1.102564
2013-07-04,0.909091,1.116883
2013-07-05,0.907895,1.131579
2013-07-06,0.897436,1.141026
2013-07-07,0.972222,1.069444


In [55]:
# Percentage changes: change of the mean temperature relative to the previous day, in percent
# (the first day has no predecessor, hence NaN)
week1_mean.pct_change() * 100

Date
2013-07-01         NaN
2013-07-02    2.777778
2013-07-03    5.405405
2013-07-04   -1.282051
2013-07-05   -1.298701
2013-07-06    2.631579
2013-07-07   -7.692308
Name: Mean TemperatureF, dtype: float64

In [56]:
# Bronze Olympic medals: five countries with their bronze totals; the first CSV column is the index
bronze = pd.read_csv('datasets/bronze_top5.csv', index_col=0)
print(bronze)

                 Total
Country               
United States   1052.0
Soviet Union     584.0
United Kingdom   505.0
France           475.0
Germany          454.0


In [57]:
# Silver Olympic medals: five countries with their silver totals; the first CSV column is the index
silver = pd.read_csv('datasets/silver_top5.csv', index_col=0)
print(silver)

                 Total
Country               
United States   1195.0
Soviet Union     627.0
United Kingdom   591.0
France           461.0
Italy            394.0


In [58]:
# Gold Olympic medals: five countries with their gold totals; the first CSV column is the index
gold = pd.read_csv('datasets/gold_top5.csv', index_col=0)
print(gold)

                 Total
Country               
United States   2088.0
Soviet Union     838.0
United Kingdom   498.0
Italy            460.0
Germany          407.0


In [59]:
# Adding bronze, silver: rows are aligned by country label;
# a country that appears in only one of the two tables gets NaN
bronze + silver

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,
Italy,
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [70]:
# Check the individual United States totals behind the bronze + silver sum.
# A single .loc[row, column] lookup replaces the chained .loc[row][column] form.
print(bronze.loc['United States', 'Total'])
print('------')
print(silver.loc['United States', 'Total'])


1052.0
------
1195.0


In [71]:
# Using the .add() method: without a fill_value it gives the same result as bronze + silver
bronze.add(silver)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,
Italy,
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [72]:
# Using a fill_value: a country missing from one table is treated as 0 there instead of producing NaN
bronze.add(silver, fill_value=0)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,454.0
Italy,394.0
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [73]:
# Adding bronze, silver, gold: any country missing from at least one of the three tables ends up NaN
bronze + silver + gold

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,
Germany,
Italy,
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0


In [74]:
# Chaining .add(): fill_value=0 at each step keeps every country in the running total
bronze.add(silver, fill_value=0).add(gold, fill_value=0)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,861.0
Italy,854.0
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0
