## Using DataFrames with Pandas

### Creating a DataFrame

In [1]:
import pandas as pd

certificates_earned = pd.DataFrame({
    'Certificates': [8, 2, 5, 6],
    'Time (in months)': [16, 5, 9, 12]
})

certificates_earned.index = ['Tom', 'Kris', 'Ahmad', 'Beau']

certificates_earned

Unnamed: 0,Certificates,Time (in months)
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


### Getting data from DataFrame

#### Getting a column of the df

In [2]:
certificates_earned['Certificates']

Tom      8
Kris     2
Ahmad    5
Beau     6
Name: Certificates, dtype: int64

#### Getting booleans values for condition in a column

In [3]:
certificates_earned['Certificates'] > 4

Tom       True
Kris     False
Ahmad     True
Beau      True
Name: Certificates, dtype: bool

#### Using .loc[]

In [4]:
print(certificates_earned.loc['Tom']) # printing just the "Tom" line (first one) with loc[]
print(certificates_earned.loc[certificates_earned['Certificates'] > 5]) # printing lines based on a boolean condition with loc[]

print(certificates_earned.loc[certificates_earned['Certificates'] > 5, 'Time (in months)']) # printing lines based on a boolean condition and what column should be showed with loc[]

Certificates         8
Time (in months)    16
Name: Tom, dtype: int64
      Certificates  Time (in months)
Tom              8                16
Beau             6                12
Tom     16
Beau    12
Name: Time (in months), dtype: int64


#### Using .iloc[]

In [5]:
print(certificates_earned.iloc[0]) # printing the first line with iloc[]

print(">Name:", certificates_earned.iloc[-1].name) # printing the last line's index's name with iloc[]
print(">Data Type:", certificates_earned.iloc[-1].dtype) # printing the last line's dtype with iloc[]

print(certificates_earned.iloc[0:2]) # printing the first an interval of lines with iloc[] (first and second lines)

Certificates         8
Time (in months)    16
Name: Tom, dtype: int64
>Name: Beau
>Data Type: int64
      Certificates  Time (in months)
Tom              8                16
Kris             2                 5


### Removing data from DataFrame

#### Using .drop() to remove line by index(es)

In [6]:
certificates_earned.drop('Tom') # dropping one row 

certificates_earned.drop(['Kris', 'Beau']) # dropping multiple rows
certificates_earned.drop(['Kris', 'Beau'], axis=0) # dropping multiple rows, specifying the "axis" (0 = row)
certificates_earned.drop(['Kris', 'Beau'], axis='rows') # dropping multiple rows, specifying the "axis"

Unnamed: 0,Certificates,Time (in months)
Tom,8,16
Ahmad,5,9


#### Using .drop() to remove columns

In [7]:
certificates_earned.drop(columns=['Certificates']) # dropping a column using the "columns" parameter

certificates_earned.drop('Certificates', axis=1) # dropping a column specifying the "axis" (1 = column)
certificates_earned.drop('Certificates', axis='columns') # dropping a column specifying the "axis"

Unnamed: 0,Time (in months)
Tom,16
Kris,5
Ahmad,9
Beau,12


### Operations

In [8]:
certificates_earned['Certificates'] + 2

Tom      10
Kris      4
Ahmad     7
Beau      8
Name: Certificates, dtype: int64

In [9]:
certificates_earned['Certificates'] - 2

Tom      6
Kris     0
Ahmad    3
Beau     4
Name: Certificates, dtype: int64

In [10]:
certificates_earned['Certificates'] * 2

Tom      16
Kris      4
Ahmad    10
Beau     12
Name: Certificates, dtype: int64

In [11]:
certificates_earned['Certificates'] / 2

Tom      4.0
Kris     1.0
Ahmad    2.5
Beau     3.0
Name: Certificates, dtype: float64

In [12]:
certificates_earned['Certificates'] ** 2

Tom      64
Kris      4
Ahmad    25
Beau     36
Name: Certificates, dtype: int64

In [13]:
certificates_earned['Certificates'] % 2

Tom      0
Kris     0
Ahmad    1
Beau     0
Name: Certificates, dtype: int64

#### Operation with a Series

In [14]:
crisis = pd.Series([10, 5], index=['Certificates', 'Time (in months)'])
certificates_earned[['Certificates', 'Time (in months)']] + crisis

Unnamed: 0,Certificates,Time (in months)
Tom,18,21
Kris,12,10
Ahmad,15,14
Beau,16,17


### Modifying DataFrames

#### Adding a new column

In [15]:
longest_streak = pd.Series([13, 11, 9, 7], index=certificates_earned.index)
certificates_earned['Longest streak'] = longest_streak
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak
Tom,8,16,13
Kris,2,5,11
Ahmad,5,9,9
Beau,6,12,7


#### Changing column values

In [16]:
certificates_earned['Longest streak'] = 10
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak
Tom,8,16,10
Kris,2,5,10
Ahmad,5,9,10
Beau,6,12,10


In [17]:
certificates_earned[certificates_earned['Longest streak'] > 10] = 100 # assigning value based on a condition
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak
Tom,8,16,10
Kris,2,5,10
Ahmad,5,9,10
Beau,6,12,10


#### Renaming columns

In [18]:
certificates_earned.rename(
    columns={
        'Time (in months)': 'Number of months'
    }
)

Unnamed: 0,Certificates,Number of months,Longest streak
Tom,8,16,10
Kris,2,5,10
Ahmad,5,9,10
Beau,6,12,10


#### Renaming indexes

In [19]:
certificates_earned.rename(
    index={
        'Kris': 'Chris'
    }
)


Unnamed: 0,Certificates,Time (in months),Longest streak
Tom,8,16,10
Chris,2,5,10
Ahmad,5,9,10
Beau,6,12,10


#### Creating a new column based on other columns

In [20]:
certificates_earned['Certificates per month'] = certificates_earned['Time (in months)'] / certificates_earned['Certificates']
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month
Tom,8,16,10,2.0
Kris,2,5,10,2.5
Ahmad,5,9,10,1.8
Beau,6,12,10,2.0


In [21]:
certificates_earned['Months per certificate'] = round(certificates_earned['Certificates'] / certificates_earned['Time (in months)'], 2)
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month,Months per certificate
Tom,8,16,10,2.0,0.5
Kris,2,5,10,2.5,0.4
Ahmad,5,9,10,1.8,0.56
Beau,6,12,10,2.0,0.5


In [22]:
certificates_earned['Days per certificate'] = certificates_earned['Months per certificate'] * 30
certificates_earned

Unnamed: 0,Certificates,Time (in months),Longest streak,Certificates per month,Months per certificate,Days per certificate
Tom,8,16,10,2.0,0.5,15.0
Kris,2,5,10,2.5,0.4,12.0
Ahmad,5,9,10,1.8,0.56,16.8
Beau,6,12,10,2.0,0.5,15.0


#### Statistical info

In [23]:
certificates = certificates_earned['Certificates']

In [24]:
certificates.describe()

count    4.00
mean     5.25
std      2.50
min      2.00
25%      4.25
50%      5.50
75%      6.50
max      8.00
Name: Certificates, dtype: float64

In [25]:
certificates.mean()

5.25

In [26]:
certificates.std()

2.5

In [27]:
certificates.min(), certificates.max()

(2, 8)

In [28]:
certificates.quantile([.25, .5, .75])

0.25    4.25
0.50    5.50
0.75    6.50
Name: Certificates, dtype: float64

In [29]:
certificates.quantile([.2, .4, .6, .8])

0.2    3.8
0.4    5.2
0.6    5.8
0.8    6.8
Name: Certificates, dtype: float64