## Pandas Tutorial

### Series

In [1]:
import numpy as np
import pandas as pd

In [3]:
# The central Pandas structure is the DataFrame: a labelled,
# 2D table that can hold multiple data types.

# A Series is built on top of a NumPy array.
# Start from plain Python lists holding the values and their labels
labels = [1, 2, 3, 4]
list_1 = list('abcd')

# Passing index= makes the provided labels the series indexes
ser_1 = pd.Series(list_1, index=labels)
print(ser_1)

1    a
2    b
3    c
4    d
dtype: object


In [5]:
# A series can wrap a NumPy array; with no index given the
# labels default to the positions 0, 1, 2, ...
arr_1 = np.array([1, 2, 3, 4])
ser_2 = pd.Series(data=arr_1)

# A dict works too: its keys become the index labels
dict_1 = dict(f_name="Derek", l_name="Banas", age=44)
ser_3 = pd.Series(data=dict_1)
print(ser_3['f_name'])

Derek


In [7]:
# The dtype attribute reports the element type stored in the series
ser_2_dtype = ser_2.dtype
print(ser_2_dtype)

int32


In [8]:
# Math operations on series work element by element.
# The method forms below are equivalent to the +, -, * and / operators.
# Only the printed sum and the final expression are shown as output
print(ser_2.add(ser_2))
ser_2.sub(ser_2)
ser_2.mul(ser_2)
ser_2.div(ser_2)

0    2
1    4
2    6
3    8
dtype: int32


0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

In [9]:
# A series can carry a name, set here through the name argument
ser_4 = pd.Series([9, 10], index=[8, 9], name='rand_nums')
ser_4.name

'rand_nums'

### DataFrames

#### Creating DataFrames

In [24]:
from numpy import random

# Create a random 2x3 integer matrix. randint treats the upper bound
# as exclusive, so the values fall in the range 10 to 49
arr_2 = np.random.randint(low=10, high=50, size=(2, 3))
print(arr_2)

# Wrap the matrix in a DF, naming the rows (index) and the columns
df_1 = pd.DataFrame(data=arr_2, index=['A', 'B'], columns=['C', 'D', 'E'])
print(df_1)

[[27 43 13]
 [15 10 35]]
    C   D   E
A  27  43  13
B  15  10  35


   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
   A  B
0  1  4
1  2  5
2  3  6
   one  two  three
A    1    2      3
B    4    5      6
(2, 3)


In [None]:
# Create a DF from multiple series in a dict.
# If the series have different lengths, the missing cells become NaN
dict_3 = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}
df_2 = pd.DataFrame(dict_3)
print(df_2)

# from_dict accepts a mapping of column labels to lists of values
print(pd.DataFrame.from_dict({'A': [1, 2, 3], 'B': [4, 5, 6]}))


# With orient='index' the keys become the row labels and the
# column labels are supplied separately
print(pd.DataFrame.from_dict({'A': [1, 2, 3], 'B': [4, 5, 6]},
                             orient='index',
                             columns=['one', 'two', 'three']))


# Get number of rows and columns as tuple
print(df_1.shape)

### Editing and Retrieving Data

In [20]:
# Grab a column
col_c = df_1['C']
print(col_c)
# Get multiple columns by passing a list of labels
df_1[['C', 'E']]

# Grab a row as a series using its label
row_a = df_1.loc['A']
print(row_a)
# Grab a row by its index position
second_row = df_1.iloc[1]
print(second_row)

# Grab multiple cells by listing the rows wanted & the
# columns wanted from those rows
wanted_rows = ['A', 'B']
wanted_cols = ['D', 'E']
print(df_1.loc[wanted_rows, wanted_cols])

A    49
B    35
Name: C, dtype: int32
C    49
D    14
E    20
Name: A, dtype: int32
C    35
D    16
E    34
Name: B, dtype: int32
    D   E
A  14  20
B  16  34


In [23]:
# Make new column
df_1['Total'] = df_1['C'] + df_1['D'] + df_1['E']
df_1

# You can perform multiple calculations
df_2['mult'] = df_2['one'] * df_2['two']
df_2

# Make a new row by concatenating a one-row DataFrame.
# DataFrame.append was deprecated in Pandas 1.4 and removed in 2.0,
# so pd.concat is used instead. Turning the series into a frame and
# transposing it makes the series name ('F') the row label; columns
# missing from the new row (here 'Total') are filled with NaN.
dict_2 = {'C': 44, 'D': 45, 'E': 46}
new_row = pd.Series(dict_2, name='F')
df_1 = pd.concat([df_1, new_row.to_frame().T])

# Delete column and set inplace to True which is required
# because Pandas tries to help you not delete data
# by accident
# df_1.drop('Total', axis=1, inplace=True)
# df_1
# # Delete a row
# df_1.drop('B', axis=0, inplace=True)
# df_1

Unnamed: 0,C,D,E
A,49.0,14.0,20.0
F,44.0,45.0,46.0


In [26]:
# Create a new column and make it the index.
# A list assigned as a column must have exactly one entry per row.
# By this point df_1 may have gained rows and columns in earlier cells,
# so rebuild the two-row frame from arr_2 before adding the two labels;
# otherwise the assignment raises a length-mismatch ValueError.
df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])
df_1['Sex'] = ['Men', 'Women']
# set_index returns a new DF; reassigning avoids inplace mutation
df_1 = df_1.set_index('Sex')
print(df_1)

        C   D   E
Sex              
Men    27  43  13
Women  15  10  35


In [28]:
# You can reset index values to numbers
#df_1.reset_index(inplace=True)
df_1

# assign returns a DF with the new column added while leaving the
# original DF untouched
ratio = df_2['one'] / df_2['two']
df_2.assign(div=ratio)

# A function works as well: it receives the DF and returns the
# values for the new column
df_2.assign(div=lambda frame: frame['one'] / frame['two'])


Unnamed: 0,one,two,mult,div
a,1.0,1.0,1.0,1.0
b,2.0,2.0,4.0,1.0
c,3.0,3.0,9.0,1.0
d,,4.0,,


In [29]:
# combine_first keeps the values of df_3 and only falls back to
# the value from df_4 where df_3 holds NaN
df_3 = pd.DataFrame([1., np.nan, 3., np.nan], columns=['A'])
df_4 = pd.DataFrame([8., 9., 2., 4.], columns=['A'])
df_3.combine_first(df_4)

Unnamed: 0,A
0,1.0
1,9.0
2,3.0
3,4.0


### Conditional Selection

In [37]:
arr_2 = np.random.randint(low=10, high=50, size=(2, 3))
df_1 = pd.DataFrame(data=arr_2, index=['A', 'B'], columns=['C', 'D', 'E'])
print(df_1)

# Comparing a DF with a value gives a same-shaped table of booleans
over_40 = df_1 > 40.0
print("Greater than 40\n", over_40)

# The comparison operator functions do the same job:
# gt, lt, ge, le, eq, ne
over_45 = df_1.gt(45.0)
print("Greater than 45\n", over_45)

# A boolean table can be placed in brackets as well
bool_1 = df_1.ge(45.0)
df_1[bool_1]

# Get bools for a column
e_col = df_1['E']
e_col > 40

# Return a row if its cell value in column E matches a condition
df_1[e_col > 30]


    C   D   E
A  18  36  35
B  33  49  45
Greater than 40
        C      D      E
A  False  False  False
B  False   True   True
Greater than 45
        C      D      E
A  False  False  False
B  False   True  False


Unnamed: 0,C,D,E
A,18,36,35
B,33,49,45


In [40]:
# Filter the rows first, then focus on one column of the resulting DF
e_over_30 = df_1['E'] > 30
df_2 = df_1[e_over_30]
df_2['C']


A    18
B    33
Name: C, dtype: int32

In [41]:
# The row filter and the column pick can be combined in one step;
# .loc[mask, 'C'] returns the same series as stacking df_1[mask]['C']
print(df_1.loc[df_1['E'] > 20, 'C'])
print()


A    18
B    33
Name: C, dtype: int32



In [43]:
# You can use multiple conditions; wrap each one in parentheses
arr_3 = np.arange(1, 10).reshape(3, 3)
df_2 = pd.DataFrame(arr_3, index=list('ABC'), columns=list('XYZ'))
print(df_2, "\n")

x_col = df_2['X']
# & keeps the rows where both conditions hold
df_2[(x_col > 3) & (x_col < 7)]

# You can use or | to combine conditions as well
df_2[(x_col > 3) | (x_col < 7)]


   X  Y  Z
A  1  2  3
B  4  5  6
C  7  8  9 



Unnamed: 0,X,Y,Z
A,1,2,3
B,4,5,6
C,7,8,9


### File Input/Output

In [46]:
# import pymysql

# Load a CSV file into a DataFrame
# Type pd.read_ [TAB] to see the file types you can read
csv_path = 'ComputerSales.csv'
cs_df = pd.read_csv(csv_path)


### Basics & Math

In [47]:
# Display 1st 5 rows (5 is the default for head)
cs_df.head(5)
# Display last 5 rows (5 is the default for tail)
cs_df.tail(5)
# Get 1st 2
cs_df[:2]
# Get 1st through 5 with a 2 step
cs_df[:5:2]

# Get indexes
cs_df.index.array
# Get NumPy array
cs_df.to_numpy()
# Get array from series
ser_1.array

# Rebuild the two-column DF whose 'one' column is NaN in row 'd'
dict_3 = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}
df_2 = pd.DataFrame(dict_3)

# You can replace NaN values with 0 or anything else
print(df_2.fillna(value=0))
# Get values in row 2
row = df_2.iloc[1]
# Add items in row 2 to all rows including row 2
# You can do the same with sub, mul, and div
df_2.add(row, axis=1)

# Get column 2
col = df_2['two']
# Subtract it from the other columns, matching on the row labels
df_2.sub(col, axis='index')

# Check if empty
df_2.empty

# Transform executes a function on a dataframe
df_5 = pd.DataFrame({'A': [0, 1, 2], 'B': [1, 2, 3]})
df_5.transform(lambda v: v + 1)
df_5.transform(lambda v: v ** 2)
df_5.transform(lambda v: np.sqrt(v))
# You can transform using multiple functions
df_5.transform([lambda x: x**2, lambda x: x**3])
# Passing a dictionary allows you to perform different calculations
# on different columns
df_5.transform({'A': lambda x: x**2, 'B': lambda x: x**3})

# map performs a function on each value of a series
df_5['A'].map(lambda v: v ** 2)

# applymap does the same on every cell of a dataframe
df_5.applymap(lambda v: v ** 2)

# Get unique values in column 2 of DF (col is df_2['two'] from above)
col.unique()

# Get number of uniques
col.nunique()

# Get the number of times each value showed in column 2
col.value_counts()

# Get column names
df_2.columns

# Get index info
df_2.index

# Return a DF that lists null values as True (isna is the same as isnull)
df_2.isna()

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  0.0  4.0


Unnamed: 0,one,two
a,False,False
b,False,False
c,False,False
d,True,False
