In [17]:
import pandas as pd
import numpy as np


In [18]:

# Pandas Series creation and indexing 

step_data = [360,7891, 9761, 3907, 4338, 5373]

step_counts = pd.Series(step_data, name="steps")

print(step_counts)

0     360
1    7891
2    9761
3    3907
4    4338
5    5373
Name: steps, dtype: int64


In [19]:
# add date range to a series 

step_counts.index = pd.date_range('20150329', periods=6)

print(step_counts)

2015-03-29     360
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


In [20]:
# select data by index values 
# just like a dict

print(step_counts['2015-04-01'])

# or by indexing position like in arrays 
print(step_counts[3])

# select all of april
print(step_counts['2015-04'])

3907
3907
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


In [21]:
# Data types viewing and converting

#view a data type
print(step_counts.dtypes)

# convert to a float 
step_counts = step_counts.astype(np.float)

int64


AttributeError: module 'numpy' has no attribute 'float'

In [None]:
# DataFrames can be created from lists, dicts, pandas series 

# clycling distance 
cycling_data = [10.7, 0, None , 2.4, 15.3, 10.9, None]

# creating a tuple of data
joined_data = list(zip(step_data, cycling_data))

# the data frame 
activity_df = pd.DataFrame(joined_data)

print(activity_df)

      0     1
0   360  10.7
1  7891   0.0
2  9761   NaN
3  3907   2.4
4  4338  15.3
5  5373  10.9


In [None]:
# label columns and an index can be added

# add a column name to dataframe
activity_df = pd.DataFrame(
    joined_data,
    index = pd.date_range('20150329', periods=6),
    columns=['Walking', 'Cycling']
)

print(activity_df)

            Walking  Cycling
2015-03-29      360     10.7
2015-03-30     7891      0.0
2015-03-31     9761      NaN
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


In [None]:
# Dataframe rows can be indexted by row using 'loc' and 'iloc' methods 
print(activity_df.loc['2015-04-01'])

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


In [None]:
# select row of data by integer position
print(activity_df.iloc[-3])

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


In [None]:
# DataFrame columns can be indexed by name 
print(activity_df['Walking'])

2015-03-29     360
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [None]:
# dataframes columns can also be indexed as properties
print(activity_df.Walking)

2015-03-29     360
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [None]:
#data frames can be indexed by integer
print(activity_df.iloc[:,0])

2015-03-29     360
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [None]:
# read data from pandas 

#the location of data file
filepath = '../data/Iris_Data.csv'

#import the data
data = pd.read_csv(filepath)

#print few columns
print(data.iloc[:5])


   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [None]:
# assigning new data to a data frame

#create a new column that is a product of both measurements
data['sepal_area'] = data.sepal_length * data.sepal_width

#print a few rows and columns
print(data.iloc[:5, -3:])

   petal_width      species  sepal_area
0          0.2  Iris-setosa       17.85
1          0.2  Iris-setosa       14.70
2          0.2  Iris-setosa       15.04
3          0.2  Iris-setosa       14.26
4          0.2  Iris-setosa       18.00


In [None]:
# appying a function to a data frame column  

#the lambda function apples what follows it to each row of data 
data['abbrev'] = (data.species.apply(lambda x: x.replace('Iris-','')))

print(data.iloc[:5, -3:])

       species  sepal_area  abbrev
0  Iris-setosa       17.85  setosa
1  Iris-setosa       14.70  setosa
2  Iris-setosa       15.04  setosa
3  Iris-setosa       14.26  setosa
4  Iris-setosa       18.00  setosa


In [22]:
# Two dataframes can be concatenated along either dimensions 

# concatenate the first two and last 2 rows 
small_data = pd.concat([data.iloc[:2], data.iloc[-2:]])

print(small_data.iloc[:,-3:])

            species  sepal_area     abbrev
0       Iris-setosa       17.85     setosa
1       Iris-setosa       14.70     setosa
148  Iris-virginica       21.08  virginica
149  Iris-virginica       17.70  virginica


In [23]:
# Aggregating Statics with groupby
# use the size method with a dataframe to get count 
# for a series , use the .value_counts method
group_sizes = (data.groupby('species').size())

print(group_sizes)

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [28]:
# Performing statistical calculations - mean, mediam, mode 

#mean calculated on a dataframe
print(data.mean())

#median calculated on a series
print(data.petal_length.median())

# mode calculated on a series 
print(data.petal_length.mode())

sepal_length     5.843333
sepal_width      3.054000
petal_length     3.758667
petal_width      1.198667
sepal_area      17.806533
dtype: float64
4.35
0    1.5
Name: petal_length, dtype: float64


  print(data.mean())


In [31]:
#standard dev, variance, and SEM
print(
data.petal_length.std(), 
data.petal_length.var(), 
data.petal_length.sem())

#quatiles
print(data.quantile(0))

1.7644204199522626 3.113179418344519 0.1440643240210085
sepal_length     4.3
sepal_width      2.0
petal_length     1.0
petal_width      0.1
sepal_area      10.0
Name: 0.0, dtype: float64


  print(data.quantile(0))


In [33]:
# performing statistical calculations 
# multiple calculations can be presented in a dataframe 

print(data.describe())

       sepal_length  sepal_width  petal_length  petal_width  sepal_area
count    150.000000   150.000000    150.000000   150.000000  150.000000
mean       5.843333     3.054000      3.758667     1.198667   17.806533
std        0.828066     0.433594      1.764420     0.763161    3.368693
min        4.300000     2.000000      1.000000     0.100000   10.000000
25%        5.100000     2.800000      1.600000     0.300000   15.645000
50%        5.800000     3.000000      4.350000     1.300000   17.660000
75%        6.400000     3.300000      5.100000     1.800000   20.325000
max        7.900000     4.400000      6.900000     2.500000   30.020000


In [35]:
# sampling from dataframes 
# dataframes can be randomly sampled 

# sample 5 rows without replacement
sample = data.sample(n=5 , replace = False, random_state=42)

print(sample.iloc[:,-3:])

             species  sepal_area      abbrev
73   Iris-versicolor       17.08  versicolor
18       Iris-setosa       21.66      setosa
118   Iris-virginica       20.02   virginica
78   Iris-versicolor       17.40  versicolor
76   Iris-versicolor       19.04  versicolor
