In [1]:
import pandas as pd

# # DataFrame attributes

In [8]:
# Importing a DataSet.

nba = pd.read_csv('Datasets/nba.csv')

In [10]:
# df.head(n) will return a new dataframe with n top most rows.

nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [11]:
# df.tail(n) will return a new dataframe with n bottom most rows.

nba.tail(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


#### # df.index will return a range, for numeric indexs or a list of strings, for string-indexes.

In [13]:
nba.index

RangeIndex(start=0, stop=458, step=1)

In [15]:
# df.values returns an array of arrays consisting of row values.

nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [16]:
# df.shape gives the number of rows & columns

nba.shape

(458, 9)

In [17]:
# df.columns returns a array of column-headers.

nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [18]:
# df.axes returns an array of vertical(rows) & horizontal(columns) DataFrame headers.

nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [19]:
# df.info() gives info of non-NaN values in DataFrame.

nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [22]:
# Importing a DataSet.

rev = pd.read_csv('Datasets/revenue.csv', index_col='Date')
rev.head(3)

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933


## .sum() Method

In [24]:
s = pd.Series([1, 2, 3])
s.sum()

6

In [28]:
rev.sum(axis=0)          # axis=0 sums all values in a column.

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [29]:
rev.sum(axis=1).head(3)  # axis=1 sums all the values in a row.

Date
1/1/16    1606
1/2/16    2060
1/3/16     967
dtype: int64

# # Extracting entire Columns

In [32]:
# df.column_name
# this method is prone to error if column name has spaces.

nba.Number

0       0.0
1      99.0
2      30.0
3      28.0
4       8.0
       ... 
453     8.0
454    25.0
455    21.0
456    24.0
457     NaN
Name: Number, Length: 458, dtype: float64

In [37]:
# df['column_name']
# Returns the colunm as a series.
# Series index is same as DataFrame index.

nba['Name'].head(3)

0    Avery Bradley
1      Jae Crowder
2     John Holland
Name: Name, dtype: object

### # Extracting 2 or more columns.

#### pass a list of column names insted of the names of a column directly.

In [40]:
selection = ['Name', 'Team', 'Salary']
nba[['Name', 'Team', 'Salary']]           # Does the same thing as below
nba[selection].head()

Unnamed: 0,Name,Team,Salary
0,Avery Bradley,Boston Celtics,7730337.0
1,Jae Crowder,Boston Celtics,6796117.0
2,John Holland,Boston Celtics,
3,R.J. Hunter,Boston Celtics,1148640.0
4,Jonas Jerebko,Boston Celtics,5000000.0


# # Adding Columns to a DataFrame

In [44]:
# adding a column named 'Sport'
# assigning a string fills all the values with that string in that column.

nba['Sport'] = 'Basketball'
nba.head()

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,NBA,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,NBA,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,NBA,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,NBA,PF,29.0,6-10,231.0,,5000000.0,Basketball


### # df.insert() method.

#### df.insert() method is more specific.

#### column-headers have an implicit numeric index starting from 0 to the leftmost column of the DataFrame.

In [46]:
# Inserting column named 'League' @ 3rd column-index position.
# one can pass a series or an array as value.

nba.insert(loc=3, column='League', value='NBA')
nba.head()

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,NBA,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,NBA,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,NBA,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,NBA,PF,29.0,6-10,231.0,,5000000.0,Basketball


# # Broadcasting methods

#### broadcasting methods are the methods that are applied on each and every value in the series.

In [9]:
# adding 500000 to each value in Salary column.

# nba['Salary'] + 500000    # Same as below

nba['Salary'].add(500000)

0      8230337.0
1      7296117.0
2            NaN
3      1648640.0
4      5500000.0
         ...    
453    2933333.0
454    1400000.0
455    3400000.0
456    1447276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [49]:
# Subtracting 500000 from each value in Salary column.

# nba['Salary'] - 500000     # Same as below

nba['Salary'].sub(500000)

0      7230337.0
1      6296117.0
2            NaN
3       648640.0
4      4500000.0
         ...    
453    1933333.0
454     400000.0
455    2400000.0
456     447276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [50]:
# Multiplying each value in 'Weight' column with 0.45, converting to kg.
# Assigning the output series to a new column.

# nba['Weight in kg'] = nba['Weight'] * 0.453592  # Same as below

nba['Weight in kg'] = nba['Weight'].mul(0.453592)
nba.head(3)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
0,Avery Bradley,Boston Celtics,0.0,NBA,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,81.64656
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,106.59412
2,John Holland,Boston Celtics,30.0,NBA,SG,27.0,6-5,205.0,Boston University,,Basketball,92.98636


In [51]:
# Dividing each value in 'Salary' column with 1m.

# nba['Salary'] / 1000000     # Same as below

nba['Salary'].div(1000000)

0      7.730337
1      6.796117
2           NaN
3      1.148640
4      5.000000
         ...   
453    2.433333
454    0.900000
455    2.900000
456    0.947276
457         NaN
Name: Salary, Length: 458, dtype: float64

# # Dealing with NaN (Null values).

In [54]:
# the DataFrame has a Null row in the bottom.

nba.tail(3)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
455,Tibor Pleiss,Utah Jazz,21.0,NBA,C,26.0,7-3,256.0,,2900000.0,Basketball,116.119552
456,Jeff Withey,Utah Jazz,24.0,NBA,C,26.0,7-0,231.0,Kansas,947276.0,Basketball,104.779752
457,,,,NBA,,,,,,,Basketball,


## # df.dropna() method.

#### axis kwarg takes the axis on which to look for NaN values and drop, 0 for vertical & 1 for horizontal. One can also use 'columns'/'rows'.
#### how kwarg takes the condition to look for, 'any' to drop if even 1 value is NaN, 'all' to drop if all values are NaN.
#### thresh kwarg takes the minimum number of real values to keep the axes, if less, drop axes.
#### subset kwarg takes a list of other axis-headers to consider, ie. if value on that header is NaN, drop axes.
#### inplace kwarg takes a boolean, If set True makes permanent changes to the applied DataFrame.

In [57]:
nba.dropna(axis=0, how='any', thresh=None, subset=['Salary'], inplace=False)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
0,Avery Bradley,Boston Celtics,0.0,NBA,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,81.646560
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,106.594120
3,R.J. Hunter,Boston Celtics,28.0,NBA,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,83.914520
4,Jonas Jerebko,Boston Celtics,8.0,NBA,PF,29.0,6-10,231.0,,5000000.0,Basketball,104.779752
5,Amir Johnson,Boston Celtics,90.0,NBA,PF,29.0,6-9,240.0,,12000000.0,Basketball,108.862080
...,...,...,...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,NBA,PF,20.0,6-10,234.0,Kentucky,2239800.0,Basketball,106.140528
453,Shelvin Mack,Utah Jazz,8.0,NBA,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball,92.079176
454,Raul Neto,Utah Jazz,25.0,NBA,PG,24.0,6-1,179.0,,900000.0,Basketball,81.192968
455,Tibor Pleiss,Utah Jazz,21.0,NBA,C,26.0,7-3,256.0,,2900000.0,Basketball,116.119552


## # df.fillna() method.

#### Directly applying df.fillna() on a dataframe is crude, if you pass a value it will be applied on all the NaN values across all the columns.
#### Apply df.fillna() method on the column extracted from the dataframe insted.

In [59]:
# Filling all the Null values with 0.0

nba.fillna(value=0.0)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
0,Avery Bradley,Boston Celtics,0.0,NBA,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,81.646560
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,106.594120
2,John Holland,Boston Celtics,30.0,NBA,SG,27.0,6-5,205.0,Boston University,0.0,Basketball,92.986360
3,R.J. Hunter,Boston Celtics,28.0,NBA,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,83.914520
4,Jonas Jerebko,Boston Celtics,8.0,NBA,PF,29.0,6-10,231.0,0,5000000.0,Basketball,104.779752
...,...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,NBA,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball,92.079176
454,Raul Neto,Utah Jazz,25.0,NBA,PG,24.0,6-1,179.0,0,900000.0,Basketball,81.192968
455,Tibor Pleiss,Utah Jazz,21.0,NBA,C,26.0,7-3,256.0,0,2900000.0,Basketball,116.119552
456,Jeff Withey,Utah Jazz,24.0,NBA,C,26.0,7-0,231.0,Kansas,947276.0,Basketball,104.779752


In [60]:
# Extracting the column.

nba['Salary']

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [61]:
# applying .fillna() to the extracted series.

nba['Salary'].fillna(value=0.0, inplace=True)

In [62]:
nba['Salary']

0      7730337.0
1      6796117.0
2            0.0
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          0.0
Name: Salary, Length: 458, dtype: float64

In [64]:
nba['College']

0                  Texas
1              Marquette
2      Boston University
3          Georgia State
4                    NaN
             ...        
453               Butler
454                  NaN
455                  NaN
456               Kansas
457                  NaN
Name: College, Length: 458, dtype: object

In [65]:
# Filling all NaN values in 'College' column with 'No College'

nba['College'].fillna(value='No College', inplace=True)

In [66]:
nba['College']

0                  Texas
1              Marquette
2      Boston University
3          Georgia State
4             No College
             ...        
453               Butler
454           No College
455           No College
456               Kansas
457           No College
Name: College, Length: 458, dtype: object

# # df.astype() method.

#### df.astype() sets the format of the values in the columns.
#### df.astype() requires all the values to be non-Null.

In [69]:


nba['Salary'] = nba['Salary'].astype('int32')

In [71]:
nba['Age'] = nba['Age'].fillna(0)

In [72]:
nba['Age'] = nba['Age'].astype('int32')

In [74]:
nba['Team'].fillna('No Team', inplace=True)

#### # A "category" type can be used when there are a lot of duplicate values in a column, eg. Male or Female in a gender column.

In [76]:
nba['Team'] = nba['Team'].astype('category')

In [77]:
nba.info()

# Here, Team column is converted to category type, significantly reducing size.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 12 columns):
Name            457 non-null object
Team            458 non-null category
Number          457 non-null float64
League          458 non-null object
Position        457 non-null object
Age             458 non-null int32
Height          457 non-null object
Weight          457 non-null float64
College         458 non-null object
Salary          458 non-null int32
Sport           458 non-null object
Weight in kg    457 non-null float64
dtypes: category(1), float64(3), int32(2), object(6)
memory usage: 37.8+ KB


# # Sorting a DataFrame.

## # Sorting by single column.

#### By kwarg takes a column-name or a list of column-names to sort by.

In [80]:
# sorting the DataFrame by names.

nba.sort_values(by='Name')

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
152,Aaron Brooks,Chicago Bulls,0.0,NBA,PG,31,6-0,161.0,Oregon,2250000,Basketball,73.028312
356,Aaron Gordon,Orlando Magic,0.0,NBA,PF,20,6-9,220.0,Arizona,4171680,Basketball,99.790240
328,Aaron Harrison,Charlotte Hornets,9.0,NBA,SG,21,6-6,210.0,Kentucky,525093,Basketball,95.254320
404,Adreian Payne,Minnesota Timberwolves,33.0,NBA,PF,25,6-10,237.0,Michigan State,1938840,Basketball,107.501304
312,Al Horford,Atlanta Hawks,15.0,NBA,C,30,6-10,245.0,Florida,12000000,Basketball,111.130040
...,...,...,...,...,...,...,...,...,...,...,...,...
270,Xavier Munford,Memphis Grizzlies,14.0,NBA,PG,24,6-3,180.0,Rhode Island,0,Basketball,81.646560
402,Zach LaVine,Minnesota Timberwolves,8.0,NBA,PG,21,6-5,189.0,UCLA,2148360,Basketball,85.728888
271,Zach Randolph,Memphis Grizzlies,50.0,NBA,PF,34,6-9,260.0,Michigan State,9638555,Basketball,117.933920
237,Zaza Pachulia,Dallas Mavericks,27.0,NBA,C,32,6-11,275.0,No College,5200000,Basketball,124.737800


## # Sorting by multiple columns

#### While sorting by multiple columns pandas will sort the rows with the first column and then the second column.
#### Passing a list of booleans to "ascending" kwarg will apply the first boolean to first column in "by" kwarg.

In [81]:
# Sorting teams in ascending order & Salary from highest to lowest.

nba.sort_values(by=['Team', 'Salary'], ascending=[True, False])

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
315,Paul Millsap,Atlanta Hawks,4.0,NBA,PF,31,6-8,246.0,Louisiana Tech,18671659,Basketball,111.583632
312,Al Horford,Atlanta Hawks,15.0,NBA,C,30,6-10,245.0,Florida,12000000,Basketball,111.130040
321,Tiago Splitter,Atlanta Hawks,11.0,NBA,C,31,6-11,245.0,No College,9756250,Basketball,111.130040
323,Jeff Teague,Atlanta Hawks,0.0,NBA,PG,27,6-2,186.0,Wake Forest,8000000,Basketball,84.368112
314,Kyle Korver,Atlanta Hawks,26.0,NBA,SG,35,6-7,212.0,Creighton,5746479,Basketball,96.161504
...,...,...,...,...,...,...,...,...,...,...,...,...
377,Kelly Oubre Jr.,Washington Wizards,12.0,NBA,SF,20,6-7,205.0,Kansas,1920240,Basketball,92.986360
380,Garrett Temple,Washington Wizards,17.0,NBA,SG,30,6-6,195.0,LSU,1100602,Basketball,88.450440
371,Jarell Eddie,Washington Wizards,8.0,NBA,SG,24,6-7,218.0,Virginia Tech,561716,Basketball,98.883056
374,JJ Hickson,Washington Wizards,21.0,NBA,C,27,6-9,242.0,North Carolina State,273038,Basketball,109.769264


## # df.Sort_index() method

In [82]:
nba.sort_index(ascending=False)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg
457,,No Team,,NBA,,0,,,No College,0,Basketball,
456,Jeff Withey,Utah Jazz,24.0,NBA,C,26,7-0,231.0,Kansas,947276,Basketball,104.779752
455,Tibor Pleiss,Utah Jazz,21.0,NBA,C,26,7-3,256.0,No College,2900000,Basketball,116.119552
454,Raul Neto,Utah Jazz,25.0,NBA,PG,24,6-1,179.0,No College,900000,Basketball,81.192968
453,Shelvin Mack,Utah Jazz,8.0,NBA,PG,26,6-3,203.0,Butler,2433333,Basketball,92.079176
...,...,...,...,...,...,...,...,...,...,...,...,...
4,Jonas Jerebko,Boston Celtics,8.0,NBA,PF,29,6-10,231.0,No College,5000000,Basketball,104.779752
3,R.J. Hunter,Boston Celtics,28.0,NBA,SG,22,6-5,185.0,Georgia State,1148640,Basketball,83.914520
2,John Holland,Boston Celtics,30.0,NBA,SG,27,6-5,205.0,Boston University,0,Basketball,92.986360
1,Jae Crowder,Boston Celtics,99.0,NBA,SF,25,6-6,235.0,Marquette,6796117,Basketball,106.594120


# # df.column_name.rank() method.

#### It returns a Series consisting of ranks of values in the column.
#### .rank() method by default ranks the values in reverse ie. the lowest values are given a low rank(1st, 2nd, etc) wherelse a high number is given a high rank.   
#### If you want to rank low for high values eg. rank 1 for highest Salary set "ascending=False".

In [89]:
 # Ranking the salaries, high salaries get low rank.
 # assigning the resulting series to a new column in DataFrame. 
 
 nba['Salary Rank'] = nba['Salary'].rank(ascending=False).astype('int32')

In [94]:
nba.sort_values('Salary', ascending=False).head(15)

Unnamed: 0,Name,Team,Number,League,Position,Age,Height,Weight,College,Salary,Sport,Weight in kg,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,NBA,SF,37,6-6,212.0,No College,25000000,Basketball,96.161504,1
169,LeBron James,Cleveland Cavaliers,23.0,NBA,SF,31,6-8,250.0,No College,22970500,Basketball,113.398,2
33,Carmelo Anthony,New York Knicks,7.0,NBA,SF,32,6-8,240.0,Syracuse,22875000,Basketball,108.86208,3
251,Dwight Howard,Houston Rockets,12.0,NBA,C,30,6-11,265.0,No College,22359364,Basketball,120.20188,4
339,Chris Bosh,Miami Heat,1.0,NBA,PF,32,6-11,235.0,Georgia Tech,22192730,Basketball,106.59412,5
100,Chris Paul,Los Angeles Clippers,3.0,NBA,PG,31,6-0,175.0,Wake Forest,21468695,Basketball,79.3786,6
414,Kevin Durant,Oklahoma City Thunder,35.0,NBA,SF,27,6-9,240.0,Texas,20158622,Basketball,108.86208,7
164,Derrick Rose,Chicago Bulls,1.0,NBA,PG,27,6-3,190.0,Memphis,20093064,Basketball,86.18248,8
349,Dwyane Wade,Miami Heat,3.0,NBA,SG,34,6-4,220.0,Marquette,20000000,Basketball,99.79024,9
23,Brook Lopez,Brooklyn Nets,11.0,NBA,C,28,7-0,275.0,Stanford,19689000,Basketball,124.7378,11
