## Section 3: DataFrames I

## Intro to DataFrames I Module

In [1]:
import pandas as pd

In [264]:
nba = pd.read_csv("nba.csv")
## NaN in pandas dataframe means NULL or blank
## The last row shows NULL for every single column.
## Though the Age and Weight might be integer in Excel, if there are any NaN or NULL values within a column, pandas has
## to default to a floating point in order to be able to store them.

## Shared Methods and Attributes between pandas Series and DataFrames

In [6]:
## Series is one dimension, dataframe is two dimensions(columns and rows)
nba.head()
nba.head(7)
nba.tail()
nba.tail(7)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [268]:
nba.index

RangeIndex(start=0, stop=458, step=1)

In [8]:
nba.values
## it gives me a nested array(multi-dimensional array), a list of lists

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [9]:
nba.shape

(458, 9)

In [278]:
nba.size
## which is 458 * 9

4122

In [15]:
## nba.dtype
## This does not work, DataFrame has no .dtype attribute
nba.dtypes
## DataFrame has .dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [16]:
## Exclusive attributes for DataFrame
## .columns gives the column names
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [17]:
## .axes combines the axes that make up by dataframe: combine the .index and .columns
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [22]:
## Exclusive method .info() on a dataframe
## .info() provides a summary of the dataframe
nba.info()
nba.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [26]:
nba['Salary'].head(3)

0    7730337.0
1    6796117.0
2          NaN
Name: Salary, dtype: float64

In [27]:
## Exclusive Methods for DataFrame: .get_dtype_counts()
nba.get_dtype_counts()

float64    4
object     5
dtype: int64

## Differences between Shared Methods

In [41]:
rev = pd.read_csv('revenue.csv', index_col = 'Date')
rev.head()

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253


In [280]:
s = pd.Series([1, 2, 3])
s
s.sum()

0    1
1    2
2    3
dtype: int64

In [37]:
rev.sum()
## .sum() gives us the sum of each column

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [42]:
nba.sum()

Number    8.079000e+03
Age       1.231100e+04
Weight    1.012360e+05
Salary    2.159837e+09
dtype: float64

In [44]:
## .sum(axis=0/'index') gives us the sum by column(default)
rev.sum(axis = 0)
rev.sum(axis = 'index')

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [47]:
## .sum(axis = 1/'columns') gives us the sum by row
rev.sum(axis = 1)
rev.sum(axis = 'columns')

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [49]:
rev['Miami'].min()

115

In [53]:
rev.min(axis = 1)

Date
1/1/16     122
1/2/16     534
1/3/16      14
1/4/16     730
1/5/16      71
1/6/16     497
1/7/16     115
1/8/16     492
1/9/16     823
1/10/16     54
dtype: int64

## Select One Column from a DataFrame

In [55]:
nba = pd.read_csv('nba.csv')
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [57]:
## First Method: not 100% work, it does not work when there is space in the column name
nba.Name.head()

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
Name: Name, dtype: object

In [61]:
nba.Salary
Output = None
## Output = None will prevent output from the commands in that cell

In [65]:
## Second way --  always works, prefer this way
nba['Salary'].head()

0    7730337.0
1    6796117.0
2          NaN
3    1148640.0
4    5000000.0
Name: Salary, dtype: float64

In [67]:
nba[['Salary', 'Team']].head()

Unnamed: 0,Salary,Team
0,7730337.0,Boston Celtics
1,6796117.0,Boston Celtics
2,,Boston Celtics
3,1148640.0,Boston Celtics
4,5000000.0,Boston Celtics


In [70]:
type(nba['Name'])

pandas.core.series.Series

In [76]:
nba['Salary'][nba['Salary'].idxmax()]

25000000.0

In [79]:
nba['Name'][1:3]

1     Jae Crowder
2    John Holland
Name: Name, dtype: object

## Select Two or More Columns from a DataFrame

In [84]:
## When extracts two or more columns from a dataframe, a new dataframe will be returned as output
nba[['Name', 'Team']].head()
nba[['Team', 'Name']].head()  ## The order of the variables show in sequence

Unnamed: 0,Team,Name
0,Boston Celtics,Avery Bradley
1,Boston Celtics,Jae Crowder
2,Boston Celtics,John Holland
3,Boston Celtics,R.J. Hunter
4,Boston Celtics,Jonas Jerebko


In [290]:
nba[nba.columns[1:5]].head(3)

Unnamed: 0,Team,Number,Position,Age
0,Boston Celtics,0.0,PG,25.0
1,Boston Celtics,99.0,SF,25.0
2,Boston Celtics,30.0,SG,27.0


In [93]:
select = ['Salary', 'Team', 'Name']
nba[select].head(3)

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland


## Add New Column to DataFrame

In [94]:
## use nba dataframe
##  nba['Sport'] does not exists in nba dataframe
nba['Sport'] = 'Basketball'
## This is a scalar value which means every value in the new Sport column is going to be 'Basketball'

In [95]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball


In [97]:
nba['League'] = 'National Basketball Association'
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basketball Association


In [119]:
nba = pd.read_csv('nba.csv')

In [116]:
## Besides the assign command to add a new column, we can use the insert() method
## insert() has loc parameter, which is the location within the columns, eg: there are nine columns in the nba dataset
## Name is the zero index position, Team is the 1 index position
## if insert(loc=3) which means put new column to the third index position and push other variables to the right
nba.insert(3, column = 'Sport', value = 'Basketball')
## insert() permanently modifies the original dataframe, so we dno not need inplace parameter

In [117]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,


In [118]:
nba.insert(7, column = 'League', value = 'National Basketball Association')
nba.head(3)

Unnamed: 0,Name,Team,Number,Sport,Position,Age,Height,League,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,National Basketball Association,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,National Basketball Association,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,National Basketball Association,205.0,Boston University,


## Broadcasting Operations

In [125]:
nba['Age'].add(5).head(3)
nba['Age'].apply(lambda older: older + 5).head(3)
## the .add() and .apply() work on the NULL values as well, though it returns NULL values

0    30.0
1    30.0
2    32.0
Name: Age, dtype: float64

In [126]:
(nba['Age'] + 5).head(3)

0    30.0
1    30.0
2    32.0
Name: Age, dtype: float64

In [137]:
nba['Salary'].sub(5000000).head(3)
(nba['Salary']-5000000).head(3)

0    2730337.0
1    1796117.0
2          NaN
Name: Salary, dtype: float64

In [147]:
(nba['Weight'] * 0.45).head(3)
nba['Weight in Kilograms'] = nba['Weight'].mul(0.453592)

In [148]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Wight in Kilograms,Weight in Kilograms
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,81.64656
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412,106.59412
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636,92.98636


In [151]:
nba['Salary'].div(1000000)
nba['Salary in Millions'] = nba['Salary'] / 1000000
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Wight in Kilograms,Weight in Kilograms,Salary in Millions
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,81.64656,7.730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412,106.59412,6.796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636,92.98636,


## A Review of the .value_counts() Method

In [152]:
nba = pd.read_csv('nba.csv')

In [154]:
## the .value_counts() function only works on a series/list
nba['Team'].value_counts().head(3)

New Orleans Pelicans    19
Memphis Grizzlies       18
Milwaukee Bucks         16
Name: Team, dtype: int64

In [159]:
nba['Position'].value_counts().head(1)   ## shows the most popular position
nba['Salary'].value_counts().head(3)
## .value_counts() function can ignore the NULL values

947276.0    31
845059.0    18
525093.0    13
Name: Salary, dtype: int64

## Drop Rows with Null Values

In [174]:
## .dropna() function: by default, remove any rows from the dataframe that have any null values at all.
## If there is NULL value at any point in that row, the row is going to ve removed. default is .dropna(how='any')
nba = pd.read_csv('nba.csv')
nba.dropna()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0
10,Jared Sullinger,Boston Celtics,7.0,C,24.0,6-9,260.0,Ohio State,2569260.0
11,Isaiah Thomas,Boston Celtics,4.0,PG,27.0,5-9,185.0,Washington,6912869.0
12,Evan Turner,Boston Celtics,11.0,SG,27.0,6-7,220.0,Ohio State,3425510.0


In [161]:
## The parameter how='all' will remove the rows where all of the values are equal to NULL
nba.dropna(how='all').head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [175]:
nba.dropna(how='all', inplace = True)
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [169]:
## .dropna(axis = 1) removes any columns from the dataframe where there is any NULL values
nba.dropna(axis = 1).head(2)
nba.dropna(axis = 'columns').head(2)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0


In [179]:
## Now we want to remove the rows if there is NULL value in a specific column
## we can use .dropna(subset = []) parameter, the subset parameter can accept list of strings(columns) we care.
nba.dropna(subset =['Salary']).head(5) ## It removes the row only if there is NULL value in Salary column

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0


In [182]:
nba.dropna(subset=['Salary', 'College']).head()  ## remove rows that have NULLs in Salary or College

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0


## Fill in Null Values with the .fillna() Method

In [None]:
nba = pd.read_csv('nba.csv')
## .fillna() function is used to fill the NULL values in the dataframe
nba.fillna(0) ## This is not supposed to be satisfied. Because fill in every NULL value with the same value in a dataframe
## is not good, in a datafrme, there are different data types(string, floating).

In [185]:
nba['Salary'].fillna(inplace = True, value = 0)
nba.head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [187]:
nba['College'].fillna(inplace = True, value = 'No College')
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0


## The .astype() Method

In [189]:
## .astype() functions allow to convert the data type of a series from one to another
## The requirement for the .astype() function is that it requires a series to not have any NULL values
nba = pd.read_csv('nba.csv').dropna(how = 'all')
nba['Salary'].fillna(0, inplace = True)
nba['College'].fillna('None', inplace = True)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [197]:
nba.dtypes
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [200]:
nba['Salary'] = nba['Salary'].astype('int')
## Convert the Salary(float) to integer datatype, .astype() function does not ave the inplace parameter
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000


In [201]:
nba.info()  ## It can be observed that the memory usage has been decreased for the integer datatype

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int32
dtypes: float64(3), int32(1), object(5)
memory usage: 33.9+ KB


In [203]:
nba['Number'] = nba['Number'].astype('int')
nba['Age'] = nba['Age'].astype('int')
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99,SF,25,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30,SG,27,6-5,205.0,Boston University,0
3,R.J. Hunter,Boston Celtics,28,SG,22,6-5,185.0,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8,PF,29,6-10,231.0,,5000000


In [204]:
nba.info()  ## The memory usuage decrease

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null int32
Position    457 non-null object
Age         457 non-null int32
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int32
dtypes: float64(1), int32(3), object(5)
memory usage: 30.3+ KB


In [209]:
nba['Age'].astype('float')
## A new datatype called 'category' which only exists in pandas but not Python, the category can reduce the memory usage ideally
## Convert the Position(string) to category
nba['Position'] = nba['Position'].astype('category')
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null int32
Position    457 non-null category
Age         457 non-null int32
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int32
dtypes: category(1), float64(1), int32(3), object(4)
memory usage: 27.4+ KB


In [292]:
nba['Position'].value_counts()
## .nunique() function tells us how many unique values in a series
nba['Position'].nunique()

5

In [212]:
nba['Team'].nunique()
nba['Team'] = nba['Team'].astype('category')  # Though convert the datatype, the dataframe still looks the same as before
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null category
Number      457 non-null int32
Position    457 non-null category
Age         457 non-null int32
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int32
dtypes: category(2), float64(1), int32(3), object(3)
memory usage: 25.8+ KB


# Sort a DataFrame with the .sort_values() Function, Part I

In [213]:
nba = pd.read_csv('nba.csv')
nba.sort_values('Name', ascending = False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
237,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0
271,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
402,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
270,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,
386,Wilson Chandler,Denver Nuggets,21.0,SF,29.0,6-8,225.0,DePaul,10449438.0
25,Willie Reed,Brooklyn Nets,33.0,PF,26.0,6-10,220.0,Saint Louis,947276.0
141,Willie Cauley-Stein,Sacramento Kings,0.0,C,22.0,7-0,240.0,Kentucky,3398280.0
385,Will Barton,Denver Nuggets,5.0,SF,25.0,6-6,175.0,Memphis,3533333.0
233,Wesley Matthews,Dallas Mavericks,23.0,SG,29.0,6-5,220.0,Marquette,16407500.0
97,Wesley Johnson,Los Angeles Clippers,33.0,SF,28.0,6-7,215.0,Syracuse,1100602.0


In [219]:
nba.sort_values('Age', ascending = False).head(3)
nba.sort_values('Salary', ascending = False, inplace = True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0


In [224]:
## If a column has NULL values, by using .sort_values() function, the NULL values will be listed bottom by default,
## No matter ascending or descending order
nba.sort_values('Salary', ascending = False).tail(3)
nba.sort_values('Salary', na_position = 'last').tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
397,Axel Toupane,Denver Nuggets,6.0,SG,23.0,6-7,210.0,,
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,
457,,,,,,,,,


In [229]:
nba.sort_values('Salary', na_position = 'first', ascending = False).head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
46,Elton Brand,Philadelphia 76ers,42.0,PF,37.0,6-9,254.0,Duke,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,


## Sort a DataFrame with the .sort_values() Method, Part II

In [247]:
nba = pd.read_csv('nba.csv')
## Sort a DataFrame by values in multiple columns, the columns sequences in the python list matters here
nba.sort_values(by =['Team', 'Name'], ascending = False).head(3)  ## the ascending parameter works on both columns

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
379,Ramon Sessions,Washington Wizards,7.0,PG,30.0,6-3,190.0,Nevada,2170465.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0


In [250]:
## Now we want to sort the Team by ascending order and the Name by descending order
nba.sort_values(by = ['Team', 'Name'], ascending = [True, False], inplace = True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24.0,6-6,205.0,Michigan,1304520.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0


## Sort DataFrame with the .sort_index() Method

In [252]:
nba = pd.read_csv('nba.csv')
nba.sort_values(['Number', 'Salary', 'Name'], inplace = True)
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
68,Lucas Nogueira,Toronto Raptors,92.0,C,23.0,7-0,220.0,,1842000.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
457,,,,,,,,,


In [257]:
## .sort_index() sort the index by ascending order
nba.sort_index(ascending = False, inplace = True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


## Rank Values with the .rank() Method

In [258]:
## .rank() generate a brand new series of ranks
## The .rank() function need to get rid of all the NULL values in that serie, .rank() function is called on a single series.
nba = pd.read_csv('nba.csv').dropna(how = 'all')
nba['Salary'] = nba['Salary'].fillna(0).astype('int')
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0


In [261]:
## Higher Salary possess the smaller rank
nba['Salary Rank'] = nba['Salary'].rank(ascending = False).astype('int')
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452


In [263]:
## Sort by the salary gives us the prove that we eank it correctly
nba.sort_values('Salary', ascending = False)
## If there are same values n a series, the .rank() function will give the same ranking and kip a few numbers after that

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
100,Chris Paul,Los Angeles Clippers,3.0,PG,31.0,6-0,175.0,Wake Forest,21468695,6
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27.0,6-9,240.0,Texas,20158622,7
164,Derrick Rose,Chicago Bulls,1.0,PG,27.0,6-3,190.0,Memphis,20093064,8
349,Dwyane Wade,Miami Heat,3.0,SG,34.0,6-4,220.0,Marquette,20000000,9
174,Kevin Love,Cleveland Cavaliers,0.0,PF,27.0,6-10,251.0,UCLA,19689000,11
