## Using DataFrames with Pandas

### Creating a DataFrame

In [88]:
import pandas as pd

certificates_earned = pd.DataFrame({
    'Certificates': [8, 2, 5, 6],
    'Time (in months)': [16, 5, 9, 12]
})

certificates_earned.index = ['Tom', 'Kris', 'Ahmad', 'Beau']

certificates_earned

Unnamed: 0,Certificates,Time (in months)
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


### Getting data from DataFrame

#### Getting a column of the df

In [None]:
certificates_earned['Certificates']

#### Getting booleans values for condition in a column

In [None]:
certificates_earned['Certificates'] > 4

#### Using .loc[]

In [None]:
print(certificates_earned.loc['Tom']) # printing just the "Tom" line (first one) with loc[]
print(certificates_earned.loc[certificates_earned['Certificates'] > 5]) # printing lines based on a boolean condition with loc[]

print(certificates_earned.loc[certificates_earned['Certificates'] > 5, 'Time (in months)']) # printing lines based on a boolean condition and what column should be showed with loc[]

#### Using .iloc[]

In [None]:
print(certificates_earned.iloc[0]) # printing the first line with iloc[]

print(">Name:", certificates_earned.iloc[-1].name) # printing the last line's index's name with iloc[]
print(">Data Type:", certificates_earned.iloc[-1].dtype) # printing the last line's dtype with iloc[]

print(certificates_earned.iloc[0:2]) # printing the first an interval of lines with iloc[] (first and second lines)

### Removing data from DataFrame

#### Using .drop() to remove line by index(es)

In [None]:
certificates_earned.drop('Tom') # dropping one row 

certificates_earned.drop(['Kris', 'Beau']) # dropping multiple rows
certificates_earned.drop(['Kris', 'Beau'], axis=0) # dropping multiple rows, specifying the "axis" (0 = row)
certificates_earned.drop(['Kris', 'Beau'], axis='rows') # dropping multiple rows, specifying the "axis"

#### Using .drop() to remove columns

In [None]:
certificates_earned.drop(columns=['Certificates']) # dropping a column using the "columns" parameter

certificates_earned.drop('Certificates', axis=1) # dropping a column specifying the "axis" (1 = column)
certificates_earned.drop('Certificates', axis='columns') # dropping a column specifying the "axis"

### Operations

In [None]:
certificates_earned['Certificates'] + 2

In [None]:
certificates_earned['Certificates'] - 2

In [None]:
certificates_earned['Certificates'] * 2

In [None]:
certificates_earned['Certificates'] / 2

In [None]:
certificates_earned['Certificates'] ** 2

In [None]:
certificates_earned['Certificates'] % 2

#### Operation with a Series

In [None]:
crisis = pd.Series([10, 5], index=['Certificates', 'Time (in months)'])
certificates_earned[['Certificates', 'Time (in months)']] + crisis

### Modifying DataFrames

#### Adding a new column

In [89]:
longest_streak = pd.Series([13, 11, 9, 7], index=certificates_earned.index)
certificates_earned['Longest streak'] = longest_streak
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak
Tom,8,16,13
Kris,2,5,11
Ahmad,5,9,9
Beau,6,12,7


#### Changing column values

In [None]:
certificates_earned['Longest streak'] = 10
certificates_earned

In [None]:
certificates_earned[certificates_earned['Longest streak'] > 10] = 100 # assigning value based on a condition
certificates_earned

#### Renaming columns

In [75]:
certificates_earned.rename(
    columns={
        'Time (in months)': 'Number of months'
    }
)

Unnamed: 0,Certificates,Number of months,Longest streak
Tom,8,16,13
Kris,2,5,11
Ahmad,5,9,9
Beau,6,12,7


#### Renaming indexes

In [None]:
certificates_earned.rename(
    index={
        'Kris': 'Chris'
    }
)


#### Creating a new column based on other columns

In [91]:
certificates_earned['Certificates per month'] = certificates_earned['Time (in months)'] / certificates_earned['Certificates']
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month
Tom,8,16,13,2.0
Kris,2,5,11,2.5
Ahmad,5,9,9,1.8
Beau,6,12,7,2.0


In [92]:
certificates_earned['Months per certificate'] = round(certificates_earned['Certificates'] / certificates_earned['Time (in months)'], 2)
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month,Months per certificate
Tom,8,16,13,2.0,0.5
Kris,2,5,11,2.5,0.4
Ahmad,5,9,9,1.8,0.56
Beau,6,12,7,2.0,0.5


In [93]:
certificates_earned['Days per certificate'] = certificates_earned['Months per certificate'] * 30
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month,Months per certificate,Days per certificate
Tom,8,16,13,2.0,0.5,15.0
Kris,2,5,11,2.5,0.4,12.0
Ahmad,5,9,9,1.8,0.56,16.8
Beau,6,12,7,2.0,0.5,15.0


#### Statistical info

In [97]:
certificates = certificates_earned['Certificates']

In [98]:
certificates.describe()

count    4.00
mean     5.25
std      2.50
min      2.00
25%      4.25
50%      5.50
75%      6.50
max      8.00
Name: Certificates, dtype: float64

In [100]:
certificates.mean()

5.25

In [101]:
certificates.std()

2.5

In [102]:
certificates.min(), certificates.max()

(2, 8)

In [104]:
certificates.quantile([.25, .5, .75])

0.25    4.25
0.50    5.50
0.75    6.50
Name: Certificates, dtype: float64

In [107]:
certificates.quantile([.2, .4, .6, .8])

0.2    3.8
0.4    5.2
0.6    5.8
0.8    6.8
Name: Certificates, dtype: float64