## Pandas Library
*I used a blog post at dataquest.io as a template for this module.*

### Import pandas
---

In [None]:
import pandas as pd

### Import a dataset
---

In [None]:
# Load the CSV into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('ibm_dataset.csv')

### Viewing your dataframe
---

In [None]:
# Preview the top of the frame; n=5 is also what head() uses by default.
df.head(n=5)

In [None]:
# Preview the bottom of the frame; n=5 is also what tail() uses by default.
df.tail(n=5)

In [None]:
# (number of rows, number of columns) as a tuple; shape is an attribute, not a method
df.shape # show the number of rows and columns

In [None]:
# Print a summary of the frame: one line per column with its dtype and
# non-null count, followed by overall memory usage.
df.info() # show datatypes, non-null value counts, and memory usage

In [None]:
# Count how often each distinct value occurs in the 'make' column.
# By default the result is sorted most-frequent first and NaN is not counted.
df['make'].value_counts() # show the number of times a value occurs in a column

In [None]:
# The column labels, returned as a pandas Index
df.columns # list column names

### Selection
---

In [None]:
# Select a single column with df['column']; the result is a Series
df['make'] # select a specific column with df['column']

In [None]:
# Select several columns at once: all rows (:), and the listed column labels
# in the order given. The result is a DataFrame.
df.loc[:, ['drive-wheels', 'body-style', 'make']]

In [None]:
# iloc is position-based and zero-indexed, so position 2 is the 3rd row
df.iloc[2] # view the 3rd row (integer position 2)

In [None]:
# loc is label-based: this returns the row whose index label is 2. That is the
# same row as iloc[2] only while the index is the default 0..n-1 range.
df.loc[2] # view the record with an index of 2

### Drop Columns:
---

In [None]:
# Returns a new DataFrame without the column; df itself is unchanged unless
# the result is assigned back (df = df.drop(...)).
df.drop(columns=['Unnamed: 0']) # drop column 'Unnamed: 0'

### Null Values
---

#### Columns:

In [None]:
# Remove every column that has at least one null value (returns a new DataFrame)
df.dropna(axis='columns', how='any')

In [None]:
# Remove only the columns in which every value is null (returns a new DataFrame)
df.dropna(how='all', axis='columns')

#### Rows:

In [None]:
# Remove every row that has at least one null value (returns a new DataFrame)
df.dropna(axis='index', how='any')

In [None]:
# Remove only the rows in which every value is null (returns a new DataFrame)
df.dropna(axis='index', how='all')

In [None]:
# Element-wise null test; the result has the same shape as df
df.isnull() # returns True for null values, False for non-null values. Useful for boolean masking.

In [None]:
# Element-wise inverse of isnull(); the result has the same shape as df
df.notnull() # returns True for non-null values, False for null values. Useful for boolean masking.

In [None]:
# Returns a new DataFrame in which every NaN is replaced by the given value;
# df itself is unchanged.
df.fillna('returning to baker street') # replaces NaN values with a value of your choosing

### Rename & Replace
---

In [None]:
# Change the first three column names by assigning a complete new list of
# labels (the three new names followed by the untouched remaining ones).
# Do not write into `df.columns.values`: an Index is immutable, and mutating
# its backing array in place can leave label lookups out of sync with the
# names that are displayed.
df.columns = ['Unnamed: 0', 'symboling', 'normalized-losses', *df.columns[3:]]

In [None]:
# Rename specific columns with an {old_name: new_name} mapping; axis=1 targets
# the column labels. Returns a new DataFrame.
# Keys that match no column are silently ignored, so each key must be the full
# column name ('normalized-losses', not 'normalized').
df.rename({'symboling': 'butterscotch', 'normalized-losses': 'summer'}, axis=1)

In [None]:
# Returns a new DataFrame whose row index is taken from the values of 'make'.
# Because a Series (not a column name) is passed, 'make' also stays as a column.
df.set_index(df['make']) # use a list or series to update your index

In [None]:
# Returns a new DataFrame; df itself is unchanged
df.reset_index() # convert the current index into a column, and generate a fresh 0..n-1 index

In [None]:
# Returns a new Series in which every value equal to 'two' becomes the integer 2;
# the column inside df is unchanged unless the result is assigned back.
df['num-of-doors'].replace('two', 2) # find and replace values

In [None]:
# Returns a new Series with the values converted to str; the column inside df
# is unchanged unless the result is assigned back.
df['symboling'].astype(str) # convert column to specified type

### Filter & Sort
---

In [None]:
# Rows where make equals 'volkswagen'. The comparison is an exact string match,
# so a misspelling such as 'volkswagon' silently produces an empty result.
# NOTE(review): spelling assumed from the usual automobile dataset - confirm
# against df['make'].unique().
df[df['make'] == 'volkswagen']

In [None]:
# Combine conditions with & (element-wise AND); each condition needs its own
# parentheses because & binds more tightly than == and >.
df[(df['make']=='audi') & (df['horsepower']>110)] # rows where make equals audi AND horsepower is greater than 110

In [None]:
# Rows where body-style is hatchback OR wagon: isin() is True wherever the
# value matches any entry of the list.
df[df['body-style'].isin(['hatchback', 'wagon'])]

In [None]:
# Order the rows by city-mpg, highest value first (returns a new DataFrame)
df.sort_values('city-mpg', ascending=False)

### Groupby & Pivot Table
---
*Example taken from the IBM online course "Python for Data Analytics"*

1. #### Create a subset that you want to analyze:

In [None]:
# Keep only the two grouping columns and the value column to be aggregated
df_test = df[['drive-wheels', 'body-style', 'price']]

2. #### Initiate groupby:

In [None]:
# Average the remaining column(s) for every (drive-wheels, body-style) pair.
# as_index=False keeps the two grouping keys as ordinary columns instead of
# turning them into the row index.
df_grp = df_test.groupby(by=['drive-wheels', 'body-style'], as_index=False).agg('mean')
df_grp

3. #### Generate pivot table from groupby:

In [None]:
# Reshape the grouped result: one row per drive-wheels value, one column per
# body-style value. No `values` argument is given, so the remaining column(s)
# form the top level of the resulting column labels; combinations that do not
# occur in df_grp show up as NaN.
df_pivot = df_grp.pivot(index = 'drive-wheels', columns='body-style')
df_pivot

### Statistics:
---

In [None]:
# Mean of each numeric column. numeric_only=True skips text columns such as
# 'make'; without it pandas 2.x raises a TypeError when it meets them.
df.mean(numeric_only=True)

In [None]:
# Median of each numeric column. numeric_only=True skips text columns such as
# 'make'; without it pandas 2.x raises a TypeError when it meets them.
df.median(numeric_only=True)

In [None]:
# One entry per column; NaN cells are not counted
df.count() # count of non-null values in each column

In [None]:
# One entry per column
df.max() # maximum value of each column

In [None]:
# One entry per column
df.min() # minimum value of each column

In [None]:
# Standard deviation of each numeric column. numeric_only=True skips text
# columns such as 'make'; without it pandas 2.x raises a TypeError on them.
df.std(numeric_only=True)

In [None]:
# Count, mean, std, min, quartiles and max; for a frame that mixes numeric and
# text columns only the numeric ones are summarised by default.
df.describe() # statistics for numerical columns

In [None]:
# Grid of pairwise Pearson correlation coefficients (Pearson is corr()'s default).
# The frame is restricted to numeric columns first, because pandas 2.x raises
# on text columns. select_dtypes is used rather than corr(numeric_only=True)
# because that keyword only exists from pandas 1.5 onwards.
df.select_dtypes(include='number').corr()

### Charts & Plots:
---

#### Set up Seaborn:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the matplotlib plots drawn after this call
sns.set()

#### Scatterplot:

In [None]:
# Horsepower (x) against price (y), one point per row. Columns are referenced
# by name and looked up in `data`.
scatter = sns.scatterplot(data=df, x='horsepower', y='price')

#### Scatterplot w/ regression:

In [None]:
# Same scatter of horsepower against price, with a fitted linear regression
# line drawn on top.
regression_plot = sns.regplot(data=df, x='horsepower', y='price')

#### Boxplot:

In [None]:
# Horizontal box plot showing the distribution of the horsepower column
box_plot = sns.boxplot(data=df, x='horsepower')

#### Histogram:

In [None]:
# Distribution of the horsepower column.
# NOTE(review): distplot is deprecated since seaborn 0.11 and slated for
# removal; on seaborn >= 0.11 the replacement is sns.histplot(...) (add
# kde=True for the density curve) - confirm the installed version first.
histogram = sns.distplot(df['horsepower'])

#### Column:

In [None]:
# One bar per body-style value; bar height is the number of rows with that value
column_chart = sns.countplot(data=df, x='body-style')

#### Line:
* Typically, line graphs show trends over time. Since there is no time field in this dataset,  
I used price for the x-axis instead.

In [None]:
# Line of horsepower (y) over price (x); columns are looked up by name in `data`
a_line_plot = sns.lineplot(data=df, x='price', y='horsepower')

#### Swarmplot:

In [None]:
# One point per row: price (y) for each body-style category (x), with points
# spread sideways so they do not overlap.
swarm_plot = sns.swarmplot(data=df, x='body-style', y='price')