# Pandas (Python library for data manipulation and analysis)

## Series
A Series is a new datatype that is similar to a numpy array in that all of its elements share one type, but it takes a completely different approach to manipulating data.
 - Series have an index attribute, which is the row location of the data
 - Manipulating a Series won't affect the index

In [1]:
import pandas as pd # same as numpy, you'll usually see this as pd
from pandas import Series, DataFrame

# Series basics: construction, index/values, custom labels, vectorised operations

s1 = Series([-2, 1, 0, -1, 2])
# The left-hand column of the printout is the index; the trailing dtype comes from NumPy
print(s1)

# Much like a dictionary's keys and values, a Series exposes .index and .values
print("\n**************Index/Values:")
print(s1.index, s1.values, sep="\n")

print("\n**************Defining Indexes:")
s2 = Series([-2, 1, 0, -1, 2], index=list('abcde'))
print(s2)


# Arithmetic and boolean masking keep each value attached to its label
print("\n**************Numpy operations maintain the indexes:")
tripled = s2 * 3
is_negative = s2 < 0
print(tripled, s2[is_negative], sep="\n")


0   -2
1    1
2    0
3   -1
4    2
dtype: int64

**************Index/Values:
RangeIndex(start=0, stop=5, step=1)
[-2  1  0 -1  2]

**************Defining Indexes:
a   -2
b    1
c    0
d   -1
e    2
dtype: int64

**************Numpy operations maintain the indexes:
a   -6
b    3
c    0
d   -3
e    6
dtype: int64
a   -2
d   -1
dtype: int64


 - We can actually define these indexes to make them non-numeric
 - indexes control a lot of the logic on how Series interact with each other

In [5]:
# A Series behaves a lot like a dictionary backed by a NumPy array

d = {'a': 23, 'f':9, 'b': -21, 'e': 3.0}
s1 = Series(d)
# The dict keys become the index, in the order they were inserted (they are not sorted)
print(s1)

print("\n*************Checking for existence:")
# `in` on a Series tests the index labels only...
for candidate in ('a', 23):
    print(candidate in s1)
# ...so look inside .values to test the stored data
print(23 in s1.values)

print("\n**************Passing in dict and index:")
ix = ['a', 'f', 'b', 'd']
s2 = Series(d, index=ix)
# 'e' is dropped because it is not in `ix`; 'd' has no entry in the dict, so it is NaN
print(s2)

print("\n**************Adding Series:")
# Addition aligns on labels; a label missing from either operand yields NaN
total = s1 + s2
print(total)


a    23.0
f     9.0
b   -21.0
e     3.0
dtype: float64

*************Checking for existence:
True
False
True

**************Passing in dict and index:
a    23.0
f     9.0
b   -21.0
d     NaN
dtype: float64

**************Adding Series:
a    46.0
b   -42.0
d     NaN
e     NaN
f    18.0
dtype: float64


## DataFrames
A DataFrame is a table-like structure composed of a number of Series. This means
that different columns can hold different types of data, but each column must be of one type.
This makes a DataFrame map onto CSVs and relational database tables very easily.
 - They can be manually generated, imported from CSVs, or loaded from a dictionary

In [3]:
# Building DataFrames

# A dict of equal-length lists maps each column name to that column's values
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

df = DataFrame(data)
print(df)

print("\n**************Specifiying column order:")
# `columns` fixes the column order; a name with no matching key in `data` ('ext')
# shows up as a column filled with NaN
column_order = ['year', 'state', 'pop', 'ext']
df2 = DataFrame(data, columns=column_order)
print(df2)



    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9

**************Specifiying column order:
   year   state  pop  ext
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN


### Dataframe Indexing
Indexing in DataFrames is very different from indexing lists; it is more comparable to dictionary indexing.
 - We select a column by name with `df['col_name']`, and rows by index label with `df.loc['index_val']`
 - We can select by integer position if we use iloc: `df.iloc[row, col]`

In [56]:
#Dataframes Indexing

import pandas as pd #same as numpy, you'll usually see this as pd
from pandas import Series, DataFrame

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# Label every row with its state name (labels may repeat)
row_labels = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']
df1 = DataFrame(data, index=row_labels)
print(df1.index)

print("\n**************Accessing Colums:")
# df.columns holds the column names; indexing with a name returns that column, dictionary-style
print(df1.columns, df1['year'], sep="\n")

print("\n**************Accessing Rows:")
# df.index holds the row labels; .loc selects every row carrying the given label
print(df1.index, df1.loc['Ohio'], sep="\n")

print("\n**************Accessing Rows2:")
# .iloc selects by integer position rather than by label
fourth_row = df1.iloc[3]
print(fourth_row)

print("\n**************Accessing Rows with Splicing:")
# Position-based indexing accepts ordinary Python slices (here: every second row)
every_other_row = df1.iloc[::2]
print(every_other_row)

print("\n**************Accesing columns with Splicing:")
# All rows, second column (position 1)
second_column = df1.iloc[:, 1]
print(second_column)


Index(['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], dtype='object')

**************Accessing Colums:
Index(['state', 'year', 'pop'], dtype='object')
Ohio      2000
Ohio      2001
Ohio      2002
Nevada    2001
Nevada    2002
Name: year, dtype: int64

**************Accessing Rows:
Index(['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], dtype='object')
     state  year  pop
Ohio  Ohio  2000  1.5
Ohio  Ohio  2001  1.7
Ohio  Ohio  2002  3.6

**************Accessing Rows2:
state    Nevada
year       2001
pop         2.4
Name: Nevada, dtype: object

**************Accessing Rows with Splicing:
         state  year  pop
Ohio      Ohio  2000  1.5
Ohio      Ohio  2002  3.6
Nevada  Nevada  2002  2.9

**************Accesing columns with Splicing:
Ohio      2000
Ohio      2001
Ohio      2002
Nevada    2001
Nevada    2002
Name: year, dtype: int64


### In class work

In [55]:
import numpy as np
#Problem 1
# Create a subset of the data that only contains records where the population is over 2
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data)

# A boolean mask keeps only the rows where the condition holds
df_sub = df[df['pop'] > 2]


#Problem 2
# Calculate the mean population by year

# Approach 1: mask the frame once per distinct year and average the matching 'pop' values
for year in df['year'].unique():
    print(df.loc[df['year'] == year, 'pop'].mean())

# Approach 2: bucket the populations by year by hand, then average each bucket.
# Walking the two columns in lockstep avoids positional lookups, and building the
# dict in first-seen order makes the print order deterministic.
Y_dict = {}
for year, pop in zip(df['year'], df['pop']):
    Y_dict.setdefault(year, []).append(pop)

for key, pops in Y_dict.items():
    print(key)
    print(np.mean(pops))

# The idiomatic pandas equivalent of both approaches:
#   df.groupby('year')['pop'].mean()





1.5
2.05
3.25
2000
1.5
2001
2.05
2002
3.25


### DataFrame Assignment
The similarity between DataFrames and dictionaries continues with value assignment.

In [45]:
import numpy as np

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

df1 = DataFrame(data)

# Assigning a scalar to an existing column label overwrites the whole column
df1['pop'] = 0
print(df1)

print("\n**************Creating Columns:")
# df.shape is an attribute (not a method) holding (rows, columns)
n_rows = df1.shape[0]
# Assigning to a label that does not exist yet creates a new column
df1['sq_miles'] = np.random.randint(50, 100, size=n_rows)
print(df1)


    state  year  pop
0    Ohio  2000    0
1    Ohio  2001    0
2    Ohio  2002    0
3  Nevada  2001    0
4  Nevada  2002    0

**************Creating Columns:
    state  year  pop  sq_miles
0    Ohio  2000    0        90
1    Ohio  2001    0        63
2    Ohio  2002    0        65
3  Nevada  2001    0        76
4  Nevada  2002    0        67


### DataFrame Deletions
With DataFrames to remove data we need to either **drop** or **delete** it.


**Note:** many pandas operations accept an `inplace` option; without it, the operation returns a modified copy and leaves the original unchanged.

In [46]:
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
state_labels = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']

print("\n**************Deleting Cols:")
df1 = DataFrame(data)
# `del` removes a column the same way it removes a key from a dictionary
del df1['year']
print(df1)

print("\n**************Deleting Rows:")
df1 = DataFrame(data, index=state_labels)
# drop() returns a new frame without the labelled rows; df1 itself is untouched
# (passing inplace=True would modify df1 directly instead)
df2 = df1.drop(index='Ohio')
print(df2)

print("\n**************Deletion By Index:")
df1 = DataFrame(data, index=state_labels)
# Same result via a boolean mask on the index; this form always builds a new
# frame, so the name has to be rebound
not_ohio = df1.index != 'Ohio'
df1 = df1[not_ohio]
print(df1)




**************Deleting Cols:
    state  pop
0    Ohio  1.5
1    Ohio  1.7
2    Ohio  3.6
3  Nevada  2.4
4  Nevada  2.9

**************Deleting Rows:
         state  year  pop
Nevada  Nevada  2001  2.4
Nevada  Nevada  2002  2.9

**************Deletion By Index:
         state  year  pop
Nevada  Nevada  2001  2.4
Nevada  Nevada  2002  2.9


### DataFrame Arithmetic
Very similar to what we saw when working with Series.

In [47]:
# Two 4x3 frames of random ints that share columns 'a' and 'b' but not the third one
df1 = DataFrame(np.random.randint(-10, 100, size=12).reshape((4, 3)), columns=list('abc'))
df2 = DataFrame(np.random.randint(-10, 100, size=12).reshape((4, 3)), columns=list('abd'))

# As with Series arithmetic, labels are aligned first; a column present in only
# one operand ('c', 'd') comes out as NaN
total = df1 + df2
print(total)
# NOTE(review): the statement below is kept verbatim because the notebook echoes it
# as cell output; other operators (e.g. //, %, **) are also supported on DataFrames.
"""Only +, -, *, and / will work with this"""


     a   b   c   d
0  129  -5 NaN NaN
1  118  88 NaN NaN
2   90  15 NaN NaN
3   87  37 NaN NaN


'Only +, -, *, and / will work with this'

### DataFrame Functions - Elementwise
There are two ways that we can apply functions to a dataframe: axis-based or element-wise
 - Axis-based operations act as aggregators and apply a function over columns or rows
 - Element-wise operations update each element based on the function

In [48]:
# Applying functions element by element

df1 = DataFrame(np.random.randint(-100, 100, size=12).reshape((4, 3)), columns=list('abc'))
df2 = DataFrame(np.random.randint(-100, 100, size=12).reshape((4, 3)), columns=list('abd'))

print("\n**************Original:")
print(df1)

# Any NumPy ufunc can be called on a DataFrame; it is applied to every element
for heading, ufunc in (
    ("\n**************ABS:", np.abs),
    ("\n**************Square:", np.square),
):
    print(heading)
    print(ufunc(df1))


**************Original:
    a   b   c
0 -97  15  60
1  20  96  28
2 -95 -34 -15
3 -25 -90  80

**************ABS:
    a   b   c
0  97  15  60
1  20  96  28
2  95  34  15
3  25  90  80

**************Square:
      a     b     c
0  9409   225  3600
1   400  9216   784
2  9025  1156   225
3   625  8100  6400


### DataFrames Functions - Axis (apply)
 - With DataFrames we can apply functions across a DataFrame's *axis*.
  - axis=0 -> cols
  - axis=1 -> rows

In [None]:
df1 = DataFrame(np.random.randint(-100, 100, size=12).reshape((4, 3)), columns=list('abc'))
df2 = DataFrame(np.random.randint(-100, 100, size=12).reshape((4, 3)), columns=list('abd'))

# Functions can be applied to a dataframe in two ways, axis-based or element-wise:
#   - axis-based operations act as aggregators over whole columns or rows
#   - element-wise operations update each element individually


def value_range(x):
    """Return the spread (max - min) of one column or row."""
    return x.max() - x.min()


def min_max(x):
    """Return the min and max of one column as a two-element labelled Series."""
    return Series([x.min(), x.max()], index=['min', 'max'])


print("\n**************Original:")
print(df1)

# Axis-based application: apply() hands the function one column (default) or one row at a time

print("\n**************Applying Functions, by Col:")
print(df1.apply(value_range))

print("\n**************Applying Functions, by Row:")
print(df1.apply(value_range, axis=1))


# The shape of the result follows what the function returns: apply() collapses one
# dimension of the data, and returning a Series builds a new dimension in its place
print("\n**************Generating aggregate data:")
print(df1.apply(min_max))


### In class work

In [50]:
#Problem 1
"""Given (GDP Deflator = [Nominal GDP/Real GDP] * 100) calculate the real GDP per row
using pandas apply() method and append the results to the current dataframe"""
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'Nominal GDP': [395.1, 398.9, 414.2, 102.2, 105.4],
       'GDP Deflator': [82.59, 84.23, 85.65, 84.23, 85.65]}
df = DataFrame(data)


def func(x):
    """Return the real GDP for one row.

    Rearranging the deflator definition gives
    Real GDP = Nominal GDP * 100 / GDP Deflator.
    """
    return x['Nominal GDP'] * 100 / x['GDP Deflator']


# axis=1 hands each row to func, so both columns can be looked up by name
df['Real GDP'] = df.apply(func, axis=1)
print(df)

    state  year  Nominal GDP  GDP Deflator    Real GDP
0    Ohio  2000        395.1         82.59  478.387214
1    Ohio  2001        398.9         84.23  473.584234
2    Ohio  2002        414.2         85.65  483.596030
3  Nevada  2001        102.2         84.23  121.334441
4  Nevada  2002        105.4         85.65  123.058961


### Sorting DataFrames
Sorting a DataFrame can be done by index or by column. Index sorting, `df.sort_index()`, enables us to sort rows or columns by their labels.

In [None]:
df = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
print(df, end="\n\n")

# Order the rows by their index labels. sort_index() hands back a sorted frame
# unless inplace=True is passed, so the result is stored back under the same name.
df = df.sort_index()
print(df, end="\n\n")

# axis=1 orders the columns by their labels instead
df = df.sort_index(axis=1)
print(df)


Sorting a DataFrame by its values is done with `df.sort_values(by=cols)`. This sorts the dataframe by the given columns, in order. So `df.sort_values(by=[col1, col2])` would sort by col1 first, and then order any rows with duplicate values in col1 by their values in col2.

In [None]:
#Sorting by values

df = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(df, end="\n\n")

# Descending by 'a' first; rows that tie on 'a' are then ordered by 'b' (also descending)
df = df.sort_values(by=['a', 'b'], ascending=False)
print(df, end="\n\n")

# Swap the priority: 'b' decides the order, 'a' only breaks ties
df = df.sort_values(by=['b', 'a'], ascending=False)
print(df, end="\n\n")


## Pandas Stats
Pandas also enables us to generate stats very easily (from the typical stats functionality)

In [None]:
#Pandas statistics

# Some of the statistics methods that exist:
#   - count
#   - min, max
#   - sum
#   - mean
#   - median
#   - mad  (mean absolute deviation from the mean value)
#          NOTE: deprecated in pandas 1.5 and removed in pandas 2.0
#   - var
#   - std
#   - skew (skewness, the 3rd moment of the data)
#   - kurt (kurtosis, the 4th moment)
# Moments in order: 1st is the mean, 2nd the variance, 3rd skew, 4th kurtosis.

df = DataFrame(np.random.randint(-100, 100, size=20).reshape((5, 4)), columns=list('abcd'))
print(df, end="\n\n")

# describe() gives a good summary of every Series (column) in the frame
print(df.describe(), end="\n\n")

# Individual statistics are available through their own methods...
print(df.sum(), end="\n\n")
# ...and most of them accept an axis argument to switch direction
print(df.sum(axis=1))


### In class work

In [None]:
#Problem 1
"""Given the following dataframe (df) determine which rows have values, for every column,
that fall more than 1 std away from the mean of the given column
E.G. For row x values in columns a, b, c, and d are each further than 1 std away from the
mean of column a, b, c, and d respectively

Expected answer per the original notes: rows 24 and 28
(NOTE(review): not re-verified after correcting the threshold below)
"""

np.random.seed(12)
df = DataFrame(np.random.randint(-100, 100, size=200).reshape((50, 4)), columns=['a', 'b', 'c', 'd'])

# Boolean frame: True where |value - column mean| exceeds that column's standard
# deviation. The distance from the mean must be compared against std itself;
# comparing against |mean - std| shifts the cut-off whenever the mean is non-zero.
out_df = (df - df.mean()).abs() > df.std()

# Keep only the rows flagged in every column (works for any number of columns)
print(out_df[out_df.all(axis=1)])


### Pandas Correlation and Covariance
This is less practical than with numpy, because it is not uncommon to have non-numeric data in our dataframes.

In [None]:
#Correlation and Covariance

a = np.arange(10)               # 0..9
b = np.arange(0, 20, 2)         # the even numbers 0..18
c = np.arange(9, -1, -1)        # 9..0, i.e. `a` in descending order
d = np.random.randint(-10, 10, size=10)

df = DataFrame(dict(a=a, b=b, c=c, d=d))

print(df, end="\n\n")

# Pairwise covariance between the columns
print(df.cov(), end="\n\n")

# Pairwise correlation between the columns
print(df.corr())


## Pandas Uniques
We can find unique values by simply calling `df[col].unique()`. This is because only Series have a unique method.

Similarly we can call `df[col].value_counts()` to determine how frequent each value in a Series is.

In [None]:
#Uniques

df = DataFrame(np.random.randint(-10, 10, size=100).reshape((25, 4)), columns=list('abcd'))

# unique() is called on a single column (a Series), not on the whole DataFrame
col_a = df['a']
print(col_a.unique(), end="\n\n")

# value_counts() tallies how often each value occurs; sorting by index lists the values in order
counts = col_a.value_counts()
print(counts.sort_index())


## isin()
Just like python's `x in y` functionality, we can do the same thing using `df[col].isin(lst)`

In [None]:
#isin() method

df = DataFrame(np.random.randint(-10, 10, size=100).reshape((25, 4)), columns=list('abcd'))

# isin() produces a True/False vector marking where the values of interest sit,
# which can then be used to subset the column
col_a = df['a']
for target in range(-10, 10):
    matches = col_a.isin([target])
    print(f"Rows with {target} in:\n{col_a[matches]}\n")
