#### Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column-oriented sample data: one key per field, one list entry per city.
city_data = {
    "City": ["New York City", "Paris", "Barcelona", "Rome"],
    "Country": ["United States", "France", "Spain", "Italy"],
    "Population": [8_600_000, 2_141_000, 5_515_000, 2_873_000],
}

In [3]:
# Build a DataFrame from a dictionary: each key becomes a column label
# and each list supplies that column's values.
city_df = pd.DataFrame(city_data)
city_df

Unnamed: 0,City,Country,Population
0,New York City,United States,8600000
1,Paris,France,2141000
2,Barcelona,Spain,5515000
3,Rome,Italy,2873000


**Transpose data**

In [4]:
city_df.transpose()

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


In [5]:
# the same result using the .T attribute
city_df.T

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


#### Creating a DataFrame from a NumPy ndarray

In [6]:
# 3x5 array of random integers in the range 1..100 (the upper bound 101 is exclusive).
# NOTE(review): no seed is set, so the values change on every run.
random_data = np.random.randint(1, 101, size=(3, 5))
random_data

array([[ 97,  88, 100,  48,  94],
       [ 98,  43,  15,  23,  74],
       [ 41,  24,  63,  95,  11]])

In [7]:
pd.DataFrame(data=random_data)

Unnamed: 0,0,1,2,3,4
0,97,88,100,48,94
1,98,43,15,23,74
2,41,24,63,95,11


**Create a DataFrame manually**

In [8]:
# Label the three rows with times of day; the columns keep their default integer labels.
row_labels = ["Morning", "Afternoon", "Evening"]
temperatures = pd.DataFrame(random_data, index=row_labels)
temperatures

Unnamed: 0,0,1,2,3,4
Morning,97,88,100,48,94
Afternoon,98,43,15,23,74
Evening,41,24,63,95,11


In [9]:
# Labels for both axes: three rows (times of day) and five columns (weekdays).
row_labels = ["Morning", "Afternoon", "Evening"]
column_labels = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

In [10]:
# Supply labels for both axes: `index` names the rows, `columns` names the columns.
pd.DataFrame(random_data, index=row_labels, columns=column_labels)

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday
Morning,97,88,100,48,94
Afternoon,98,43,15,23,74
Evening,41,24,63,95,11


**A DataFrame allows duplicate labels in both the row index and the columns**

In [11]:
# "Morning" appears twice in the row labels and "Tuesday" three times in the column labels.
row_labels = ["Morning", "Afternoon", "Morning"]
column_lables_1 = ["Monday", "Tuesday", "Tuesday", "Tuesday", "Friday"]

df_Tuesday = pd.DataFrame(random_data, index=row_labels, columns=column_lables_1)

**Bad practice**

In [12]:
# Selecting a duplicated column label returns every matching column (a DataFrame, not a single Series).
df_Tuesday['Tuesday']

Unnamed: 0,Tuesday,Tuesday.1,Tuesday.2
Morning,88,100,48
Afternoon,43,15,23
Morning,24,63,95


#### Import data with CSV

In [13]:
# Location of the NBA players CSV file.
# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as-is.
# Consider a path relative to a configurable data directory.
nba_path = '/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 4 Dataframe/Data/nba.csv'

In [117]:
# First load with default parsing. The next cell re-reads the same file with
# 'Birthday' parsed as dates, so this result is overwritten immediately.
df_nba = pd.read_csv(nba_path)

In [118]:
# Re-read the file with 'Birthday' parsed as dates. An explicit `date_format`
# avoids per-element format inference and the pandas warning that comes with it.
# NOTE(review): '%m/%d/%y' is the format used by the `pd.to_datetime` call later
# in this notebook; confirm it matches the raw values in the CSV.
df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'], date_format='%m/%d/%y')

  df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'])


**Alternative way to parse date in the Dataframe**

In [None]:
# Alternative to `parse_dates`: convert the column after loading.
df_nba['Birthday'] = pd.to_datetime(df_nba['Birthday'], format='%m/%d/%y')
# Keep the column as datetime64: it already displays as 'YYYY-MM-DD'.
# Converting it with `.dt.strftime('%Y-%m-%d')` would store plain strings (object dtype),
# and datetime-only operations such as `nsmallest(columns='Birthday')` would then fail.

In [119]:
df_nba.shape

(450, 5)

In [120]:
type(df_nba)

pandas.core.frame.DataFrame

In [121]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


#### Shared and Exclusive Attributes between Series and DataFrames

In [122]:
# check the data types of each column
df_nba.dtypes

Name                object
Team                object
Position            object
Birthday    datetime64[ns]
Salary               int64
dtype: object

In [21]:
# Count values for each data type
df_nba.dtypes.value_counts()

object            3
datetime64[ns]    1
int64             1
Name: count, dtype: int64

In [22]:
# View the index
df_nba.index

RangeIndex(start=0, stop=450, step=1)

In [23]:
# View the columns
df_nba.columns

Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [24]:
# the ndim attribute holds the number of dimensions: 2 for a DataFrame, 1 for a Series
df_nba.ndim

2

In [25]:
# the shape attribute holds a (rows, columns) tuple
df_nba.shape

(450, 5)

In [26]:
# the size attribute holds the total number of elements (rows * columns)
df_nba.size

2250

In [27]:
# method count shows the number of non-null values in each column
df_nba.count()

Name        450
Team        450
Position    450
Birthday    450
Salary      450
dtype: int64

#### Shared Methods between Series and DataFrames

In [28]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [29]:
df_nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [31]:
# method sample shows random rows from the DataFrame
df_nba.sample(5)

Unnamed: 0,Name,Team,Position,Birthday,Salary
357,Shabazz Napier,Minnesota Timberwolves,PG,1991-07-14,1845301
59,Marcus Smart,Boston Celtics,PG,1994-03-06,12553571
168,Justin James,Sacramento Kings,SG,1997-01-24,898310
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
22,George Hill,Milwaukee Bucks,PG,1986-05-04,10133907


In [30]:
# method nunique shows the number of unique values in each column
df_nba.nunique()

Name        450
Team         30
Position      9
Birthday    430
Salary      269
dtype: int64

In [32]:
# max returns the maximum of each column independently;
# the values do not come from a single row
df_nba.max()

Name             Zylan Cheatham
Team         Washington Wizards
Position                     SG
Birthday    2000-12-23 00:00:00
Salary                 40231758
dtype: object

In [33]:
df_nba[df_nba['Name'] == 'Zylan Cheatham']

Unnamed: 0,Name,Team,Position,Birthday,Salary
248,Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568


In [34]:
# min returns the minimum of each column independently;
# the values do not come from a single row
df_nba.min()

Name               Aaron Gordon
Team              Atlanta Hawks
Position                      C
Birthday    1977-01-26 00:00:00
Salary                    79568
dtype: object

In [35]:
# nlargest returns the n rows with the largest values in the given column
df_nba.nlargest(n=4, columns='Salary')

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000


In [36]:
# nsmallest returns the n rows with the smallest values in the given column
# (here: the four earliest birthdays)
df_nba.nsmallest(n=4, columns='Birthday')

Unnamed: 0,Name,Team,Position,Birthday,Salary
98,Vince Carter,Atlanta Hawks,PF,1977-01-26,2564753
196,Udonis Haslem,Miami Heat,C,1980-06-09,2564753
262,Kyle Korver,Milwaukee Bucks,PF,1981-03-17,6004753
149,Tyson Chandler,Houston Rockets,C,1982-10-02,2564753


**Find the salary sum**

In [37]:
# numeric_only parameter returns only numeric columns
df_nba.sum(numeric_only=True)

Salary    3444112694
dtype: int64

**Average salary in the NBA**

In [38]:
# method mean shows the mean value in each numeric column
df_nba.mean(numeric_only=True).round(2)

Salary    7653583.76
dtype: float64

In [39]:
# method median shows the median value in each numeric column
df_nba.median(numeric_only=True).round(2)

Salary    3303074.5
dtype: float64

In [40]:
# method mode shows the most frequently occurring value in each column
df_nba.mode(numeric_only=True)

Unnamed: 0,Salary
0,79568


In [41]:
# method std shows the standard deviation in each numeric column
df_nba.std(numeric_only=True).round(2)

Salary    9288810.3
dtype: float64

#### Sorting a DataFrame

In [42]:
# Sorting by a single column
df_nba.sort_values(by='Salary', ascending=False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000
264,James Harden,Houston Rockets,PG,1989-08-26,38199000


In [43]:
# Sorting by the 'Name' column; the column can be passed positionally and
# ascending order is the default
df_nba.sort_values('Name').head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
52,Aaron Gordon,Orlando Magic,PF,1995-09-16,19863636
101,Aaron Holiday,Indiana Pacers,PG,1996-09-30,2239200
437,Abdel Nader,Oklahoma City Thunder,SF,1993-09-25,1618520
81,Adam Mokoka,Chicago Bulls,G,1998-07-18,79568
399,Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000


**Sorting by multiple columns**

In [44]:
# using different sorting orders for multiple columns ascending=[True, False]
df_nba.sort_values(by=['Team', 'Salary'], ascending=[True, False]).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
111,Chandler Parsons,Atlanta Hawks,SF,1988-10-25,25102512
28,Evan Turner,Atlanta Hawks,PG,1988-10-27,18606556
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000
213,De'Andre Hunter,Atlanta Hawks,SF,1997-12-02,7068360
339,Jabari Parker,Atlanta Hawks,PF,1995-03-15,6500000


**Sorting by index**

In [45]:
df_nba.sort_index(ascending=True).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [46]:
# Last row in the DataFrame will be the first one
df_nba.sort_index(ascending=False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310


**Sorting by Column Index**

In [47]:

df_nba.sort_index(axis='columns').head()

Unnamed: 0,Birthday,Name,Position,Salary,Team
0,1996-09-26,Shake Milton,SG,1445697,Philadelphia 76ers
1,1995-09-27,Christian Wood,PF,1645357,Detroit Pistons
2,1998-08-23,PJ Washington,PF,3831840,Charlotte Hornets
3,1988-10-04,Derrick Rose,PG,7317074,Detroit Pistons
4,1995-07-26,Marial Shayok,G,79568,Philadelphia 76ers


In [48]:
df_nba.sort_index(axis=1).head()

Unnamed: 0,Birthday,Name,Position,Salary,Team
0,1996-09-26,Shake Milton,SG,1445697,Philadelphia 76ers
1,1995-09-27,Christian Wood,PF,1645357,Detroit Pistons
2,1998-08-23,PJ Washington,PF,3831840,Charlotte Hornets
3,1988-10-04,Derrick Rose,PG,7317074,Detroit Pistons
4,1995-07-26,Marial Shayok,G,79568,Philadelphia 76ers


In [49]:
# change the order of columns in the DataFrame using column index
df_nba.sort_index(axis='columns', ascending=False).head()

Unnamed: 0,Team,Salary,Position,Name,Birthday
0,Philadelphia 76ers,1445697,SG,Shake Milton,1996-09-26
1,Detroit Pistons,1645357,PF,Christian Wood,1995-09-27
2,Charlotte Hornets,3831840,PF,PJ Washington,1998-08-23
3,Detroit Pistons,7317074,PG,Derrick Rose,1988-10-04
4,Philadelphia 76ers,79568,G,Marial Shayok,1995-07-26


#### Setting a New Index

In [124]:
# Set the 'Name' column as the index.
# Keep every row: assigning the result of `.head()` here would shrink `df_nba`
# to five rows and break the later lookups by player name and by position.
df_nba = df_nba.set_index('Name')

In [125]:
df_nba.head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [126]:
# If the column became the index, it disappears from the columns
# If you want to reset the index to the default integer index, you can use the .reset_index() method
df_nba = df_nba.reset_index()
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [127]:
# Move 'Name' back into the index, this time passing the column
# through the explicit `keys` keyword of .set_index()
df_nba = df_nba.set_index(keys='Name')
df_nba.head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


#### Drop columns

In [53]:
# Remove a leftover 'index' column if one exists (it appears when `reset_index()`
# is run on a frame that already has the default integer index).
# `errors='ignore'` makes this a no-op when the column is absent instead of raising
# KeyError, and reassignment replaces `inplace=True` so the cell can be re-run safely.
df_nba = df_nba.drop(columns='index', errors='ignore')

In [54]:
df_nba.head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


#### Selecting Columns and Rows from a DataFrame

##### Selecting a Single Column from a DataFrame

In [55]:
# select a single column using the column name via dot notation
df_nba.Salary

Name
Shake Milton       1445697
Christian Wood     1645357
PJ Washington      3831840
Derrick Rose       7317074
Marial Shayok        79568
                    ...   
Austin Rivers      2174310
Harry Giles        2578800
Robin Lopez        4767000
Collin Sexton      4764960
Ricky Rubio       16200000
Name: Salary, Length: 450, dtype: int64

In [56]:
# select a single column using the column name via bracket notation
df_nba['Team']

Name
Shake Milton       Philadelphia 76ers
Christian Wood        Detroit Pistons
PJ Washington       Charlotte Hornets
Derrick Rose          Detroit Pistons
Marial Shayok      Philadelphia 76ers
                         ...         
Austin Rivers         Houston Rockets
Harry Giles          Sacramento Kings
Robin Lopez           Milwaukee Bucks
Collin Sexton     Cleveland Cavaliers
Ricky Rubio              Phoenix Suns
Name: Team, Length: 450, dtype: object

##### Selecting Multiple Columns from a DataFrame

In [57]:
# select multiple columns using a list of column names
df_nba[['Team','Position']]

Unnamed: 0_level_0,Team,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Shake Milton,Philadelphia 76ers,SG
Christian Wood,Detroit Pistons,PF
PJ Washington,Charlotte Hornets,PF
Derrick Rose,Detroit Pistons,PG
Marial Shayok,Philadelphia 76ers,G
...,...,...
Austin Rivers,Houston Rockets,PG
Harry Giles,Sacramento Kings,PF
Robin Lopez,Milwaukee Bucks,C
Collin Sexton,Cleveland Cavaliers,PG


##### Selecting a column by data type

**include**

In [58]:
# Select all columns of a specific data type include=['object']
df_nba.select_dtypes(include=['object']).head()

Unnamed: 0_level_0,Team,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Shake Milton,Philadelphia 76ers,SG
Christian Wood,Detroit Pistons,PF
PJ Washington,Charlotte Hornets,PF
Derrick Rose,Detroit Pistons,PG
Marial Shayok,Philadelphia 76ers,G


**exclude**

In [59]:
# Select all columns except specific data types exclude=['object','int64']
df_nba.select_dtypes(exclude=['object','int64']).head()

Unnamed: 0_level_0,Birthday
Name,Unnamed: 1_level_1
Shake Milton,1996-09-26
Christian Wood,1995-09-27
PJ Washington,1998-08-23
Derrick Rose,1988-10-04
Marial Shayok,1995-07-26


#### Selecting Rows from a DataFrame

##### Extracting Rows by Index Label

In [60]:
df_nba.loc["LeBron James"]

Team         Los Angeles Lakers
Position                     PF
Birthday    1984-12-30 00:00:00
Salary                 37436858
Name: LeBron James, dtype: object

In [61]:
# select multiple rows by index labels using a list of index labels
df_nba.loc[["Kawhi Leonard", "Paul George"]]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kawhi Leonard,Los Angeles Clippers,SF,1991-06-29,32742000
Paul George,Los Angeles Clippers,SF,1990-05-02,33005556


In [62]:
# Extract rows from "Otto Porter" through "Patrick Beverley".
# The index is sorted first, and a label slice includes both endpoints.
df_nba.sort_index().loc["Otto Porter":"Patrick Beverley"]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Otto Porter,Chicago Bulls,SF,1993-06-03,27250576
PJ Dozier,Denver Nuggets,PG,1996-10-25,79568
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Pascal Siakam,Toronto Raptors,PF,1994-04-02,2351838
Pat Connaughton,Milwaukee Bucks,SG,1993-01-06,1723050
Patrick Beverley,Los Angeles Clippers,PG,1988-07-12,12345680


In [63]:
# extract rows from "Zach Collins" to the end
df_nba.sort_index().loc["Zach Collins":]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zach Collins,Portland Trail Blazers,C,1997-11-19,4240200
Zach LaVine,Chicago Bulls,PG,1995-03-10,19500000
Zach Norvell,Los Angeles Lakers,SG,1997-12-09,79568
Zhaire Smith,Philadelphia 76ers,SG,1999-06-04,3058800
Zion Williamson,New Orleans Pelicans,F,2000-07-06,9757440
Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568


In [64]:
# extract rows from the beginning to "Al Horford"
df_nba.sort_index().loc[:"Al Horford"]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Gordon,Orlando Magic,PF,1995-09-16,19863636
Aaron Holiday,Indiana Pacers,PG,1996-09-30,2239200
Abdel Nader,Oklahoma City Thunder,SF,1993-09-25,1618520
Adam Mokoka,Chicago Bulls,G,1998-07-18,79568
Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000
Al Horford,Philadelphia 76ers,C,1986-06-03,28000000


#### Extracting Rows by Index Position

In [65]:
# Extracting Rows by Index Position
df_nba.iloc[110]

Team           Sacramento Kings
Position                     PF
Birthday    1988-05-09 00:00:00
Salary                  6825000
Name: Nemanja Bjelica, dtype: object

In [66]:
# Extracting several rows at once by passing a list of index positions
df_nba.iloc[[100, 200, 300, 400]]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Brian Bowen,Indiana Pacers,SG,1998-10-02,79568
Marco Belinelli,San Antonio Spurs,SF,1986-03-25,5846154
Jarred Vanderbilt,Denver Nuggets,PF,1999-04-03,1416852
Louis King,Detroit Pistons,F,1999-04-06,79568


In [67]:
# Extracting Rows by Index Position using slicing
df_nba.iloc[111:116]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chandler Parsons,Atlanta Hawks,SF,1988-10-25,25102512
Courtney Lee,Dallas Mavericks,SG,1985-10-03,12759670
Myles Turner,Indiana Pacers,C,1996-03-24,18000000
Kyle O'Quinn,Philadelphia 76ers,C,1990-03-26,2174318
Bryn Forbes,San Antonio Spurs,SG,1993-07-23,2875000


In [68]:
df_nba.iloc[:5]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [69]:
df_nba.iloc[445:]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [70]:
# Extracting rows by position with negative indices:
# from the 10th-from-last row up to, but not including, the 5th-from-last
df_nba.iloc[-10:-5]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jared Dudley,Los Angeles Lakers,PF,1985-07-10,2564753
Max Strus,Chicago Bulls,SG,1996-03-28,79568
Kevon Looney,Golden State Warriors,C,1996-02-06,4464286
Willy Hernangomez,Charlotte Hornets,C,1994-05-27,1557250
Melvin Frazier,Orlando Magic,SG,1996-08-30,1416852


In [71]:
# Extracting Rows by Index Position using slicing with a step 2 => 0, 2, 4, 6, 8
df_nba.iloc[0:10:2]

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568
Kendrick Nunn,Miami Heat,SG,1995-08-03,1416852
Brook Lopez,Milwaukee Bucks,C,1988-04-01,12093024


#### Extracting Values from Specific Columns

In [72]:
# Extracting Values from Specific Columns with specific Index Labels like a crossroad
df_nba.loc["LeBron James", "Team"]

'Los Angeles Lakers'

In [73]:
# Extracting values from specific columns for one index label.
# Returns a Series holding "Team", "Salary", "Position"
df_nba.loc["James Harden", ["Team", "Salary", "Position"]]

Team        Houston Rockets
Salary             38199000
Position                 PG
Name: James Harden, dtype: object

In [74]:
# Extracting Values from Specific Columns using index labels. Returns a DataFrame
df_nba.loc[["Russell Westbrook", "Anthony Davis"], ["Team", "Position"] ]

Unnamed: 0_level_0,Team,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Russell Westbrook,Houston Rockets,PG
Anthony Davis,Los Angeles Lakers,C


In [75]:
# Extracting Values from Specific Columns using index labels and slicing of columns.
df_nba.loc["James Harden", "Position":"Salary"]

Position                     PG
Birthday    1989-08-26 00:00:00
Salary                 38199000
Name: James Harden, dtype: object

In [76]:
# Extracting Values from Specific Columns using index position and index of the column. 3 - "Salary" column.
df_nba.iloc[57, 3]

796806

In [77]:
# Extracting Values from Specific Columns using index position and slicing of columns. First 3 columns
df_nba.iloc[100:105, :3]

Unnamed: 0_level_0,Team,Position,Birthday
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brian Bowen,Indiana Pacers,SG,1998-10-02
Aaron Holiday,Indiana Pacers,PG,1996-09-30
Troy Daniels,Los Angeles Lakers,SG,1991-07-15
Buddy Hield,Sacramento Kings,SG,1992-12-17
Terance Mann,Los Angeles Clippers,SG,1996-10-18


##### The `at` and `iat` accessors return only a single value

In [78]:
# DataFrame.at: access a single value by row label and column label.
df_nba.at["LeBron James", "Team"]

'Los Angeles Lakers'

In [79]:
# DataFrame.iat: access a single value by integer row position and column position.
df_nba.iat[263, 1]

'PF'

##### Extracting Values from Series

In [80]:
df_nba["Salary"].loc["Damian Lillard"]

29802321

In [81]:
df_nba["Salary"].at["Damian Lillard"]

29802321

In [82]:
df_nba["Salary"].iloc[234]

2033160

In [83]:
df_nba["Salary"].iat[234]

2033160

##### Speed of performance

In [84]:
%%timeit
df_nba.at["Austin Rivers", "Birthday"]

2.36 μs ± 187 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [85]:
%%timeit
df_nba.loc["Austin Rivers", "Birthday"]

3.61 μs ± 31.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [86]:
%%timeit
df_nba.iat[123, 1]

4.2 μs ± 131 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [87]:
%%timeit
df_nba.iloc[263, 1]

5.68 μs ± 278 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


#### Renaming Columns or Rows

In [88]:
df_nba.columns

Index(['Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [89]:
df_nba.columns = ["Team", "Position", "Date of Birth", "Pay"]
df_nba.head(1)

Unnamed: 0_level_0,Team,Position,Date of Birth,Pay
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697


**Columns were renamed**

In [90]:
df_nba.columns

Index(['Team', 'Position', 'Date of Birth', 'Pay'], dtype='object')

**Use rename function to change column name**

In [91]:
df_nba = df_nba.rename(columns={"Date of Birth": "Birthday"})

In [92]:
df_nba.columns

Index(['Team', 'Position', 'Birthday', 'Pay'], dtype='object')

**Change index name**

In [93]:
df_nba.loc["Giannis Antetokounmpo"]

Team            Milwaukee Bucks
Position                     PF
Birthday    1994-12-06 00:00:00
Pay                    25842697
Name: Giannis Antetokounmpo, dtype: object

In [94]:
df_nba = df_nba.rename(index={"Giannis Antetokounmpo": "Greek Freak"})

In [95]:
df_nba.loc["Greek Freak"]

Team            Milwaukee Bucks
Position                     PF
Birthday    1994-12-06 00:00:00
Pay                    25842697
Name: Greek Freak, dtype: object

#### Resetting an Index

**First, reset the index so that the Name column is kept as a regular column**

In [102]:
df_nba = df_nba.reset_index()

In [105]:
df_nba.head(1)

Unnamed: 0,Team,Name,Position,Birthday,Pay
0,Philadelphia 76ers,Shake Milton,SG,1996-09-26,1445697


In [None]:
df_nba = df_nba.set_index('Team')

Unnamed: 0_level_0,Name,Position,Birthday,Pay
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Philadelphia 76ers,Shake Milton,SG,1996-09-26,1445697
Detroit Pistons,Christian Wood,PF,1995-09-27,1645357
Charlotte Hornets,PJ Washington,PF,1998-08-23,3831840
Detroit Pistons,Derrick Rose,PG,1988-10-04,7317074
Philadelphia 76ers,Marial Shayok,G,1995-07-26,79568
...,...,...,...,...
Houston Rockets,Austin Rivers,PG,1992-08-01,2174310
Sacramento Kings,Harry Giles,PF,1998-04-22,2578800
Milwaukee Bucks,Robin Lopez,C,1988-04-01,4767000
Cleveland Cavaliers,Collin Sexton,PG,1999-01-04,4764960


In [109]:
df_nba.head()

Unnamed: 0_level_0,Name,Position,Birthday,Pay
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Philadelphia 76ers,Shake Milton,SG,1996-09-26,1445697
Detroit Pistons,Christian Wood,PF,1995-09-27,1645357
Charlotte Hornets,PJ Washington,PF,1998-08-23,3831840
Detroit Pistons,Derrick Rose,PG,1988-10-04,7317074
Philadelphia 76ers,Marial Shayok,G,1995-07-26,79568


#### Task

In [131]:
# Location of the NFL players CSV file.
# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as-is.
# Consider a path relative to a configurable data directory.
nfl_path = '/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 4 Dataframe/Data/nfl.csv'

In [132]:
df_nfl = pd.read_csv(nfl_path, parse_dates=['Birthday'])

In [133]:
# Format the dates as 'YYYY-MM-DD' text; after this the column holds strings, not datetimes.
df_nfl['Birthday'] = df_nfl['Birthday'].dt.strftime('%Y-%m-%d')

In [139]:
# Parse 'Birthday' values written as 'YYYY-MM-DD' into datetime values.
df_nfl['Birthday'] = pd.to_datetime(df_nfl['Birthday'],format='%Y-%m-%d')

In [140]:
df_nfl.head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tremon Smith,Philadelphia Eagles,RB,1996-07-20,570000
Shawn Williams,Cincinnati Bengals,SS,1991-05-13,3500000
Adam Butler,New England Patriots,DT,1994-04-12,645000
Derek Wolfe,Denver Broncos,DE,1990-02-24,8000000
Jake Ryan,Jacksonville Jaguars,OLB,1992-02-27,1000000


In [141]:
df_nfl.dtypes

Team                object
Position            object
Birthday    datetime64[ns]
Salary               int64
dtype: object

In [136]:
df_nfl = df_nfl.set_index(keys='Name')
df_nfl.head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tremon Smith,Philadelphia Eagles,RB,1996-07-20,570000
Shawn Williams,Cincinnati Bengals,SS,1991-05-13,3500000
Adam Butler,New England Patriots,DT,1994-04-12,645000
Derek Wolfe,Denver Broncos,DE,1990-02-24,8000000
Jake Ryan,Jacksonville Jaguars,OLB,1992-02-27,1000000


**Count players by teams**

In [142]:
df_nfl.Team.value_counts().head()

Team
New York Jets          58
Kansas City Chiefs     56
Washington Redskins    56
New Orleans Saints     55
San Francisco 49Ers    55
Name: count, dtype: int64

In [143]:
df_nfl.sort_values(by='Salary', ascending=False).head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kirk Cousins,Minnesota Vikings,QB,1988-08-19,27500000
Jameis Winston,Tampa Bay Buccaneers,QB,1994-01-06,20922000
Marcus Mariota,Tennessee Titans,QB,1993-10-30,20922000
Derek Carr,Oakland Raiders,QB,1991-03-28,19900000
Jimmy Garoppolo,San Francisco 49Ers,QB,1991-11-02,17200000


**Sorting by Team and Salary columns**

In [144]:
df_nfl.sort_values(by=['Team', 'Salary'], ascending=[True, False]).head()

Unnamed: 0_level_0,Team,Position,Birthday,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chandler Jones,Arizona Cardinals,OLB,1990-02-27,16500000
Patrick Peterson,Arizona Cardinals,CB,1990-07-11,11000000
Larry Fitzgerald,Arizona Cardinals,WR,1983-08-31,11000000
David Johnson,Arizona Cardinals,RB,1991-12-16,5700000
Justin Pugh,Arizona Cardinals,G,1990-08-15,5000000


In [148]:
df_nfl = df_nfl.reset_index().set_index(keys = "Team")
df_nfl.head(3)

Unnamed: 0_level_0,Name,Position,Birthday,Salary
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Philadelphia Eagles,Tremon Smith,RB,1996-07-20,570000
Cincinnati Bengals,Shawn Williams,SS,1991-05-13,3500000
New England Patriots,Adam Butler,DT,1994-04-12,645000


In [151]:
df_nfl.loc["New York Jets"].sort_values("Birthday").head(1)

Unnamed: 0_level_0,Name,Position,Birthday,Salary
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New York Jets,Ryan Kalil,C,1985-03-29,2400000


In [147]:
# Oldest player in the league.
# `df_nfl` may be indexed by "Team" at this point, which would label the result
# with a team instead of a player. Resetting the index and re-indexing by "Name"
# makes the result identify the player regardless of the current index.
df_nfl.reset_index().set_index('Name')['Birthday'].sort_values(ascending=True).head(1)

Name
Tom Brady   1977-08-03
Name: Birthday, dtype: datetime64[ns]