#### Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
city_data = {
"City": ["New York City", "Paris", "Barcelona", "Rome"],
"Country": ["United States", "France", "Spain", "Italy"],
"Population": [8600000, 2141000, 5515000, 2873000]
}

In [3]:
# create a DataFrame from a dictionary
city_df = pd.DataFrame(city_data)
city_df

Unnamed: 0,City,Country,Population
0,New York City,United States,8600000
1,Paris,France,2141000
2,Barcelona,Spain,5515000
3,Rome,Italy,2873000


**Transpose data**

In [4]:
city_df.transpose()

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


In [5]:
# the same result using the .T attribute
city_df.T

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


#### Creating a DataFrame from a NumPy ndarray

In [6]:
random_data = np.random.randint(1, 101, size=(3, 5))
random_data

array([[87, 98, 11, 96, 43],
       [48, 33, 89, 53, 21],
       [76, 52, 78, 11, 71]])

In [7]:
pd.DataFrame(data=random_data)

Unnamed: 0,0,1,2,3,4
0,87,98,11,96,43
1,48,33,89,53,21
2,76,52,78,11,71


**Create a dataframe manualy**

In [8]:
row_labels = ["Morning", "Afternoon", "Evening"]
temperatures = pd.DataFrame(data = random_data, index = row_labels)
temperatures

Unnamed: 0,0,1,2,3,4
Morning,87,98,11,96,43
Afternoon,48,33,89,53,21
Evening,76,52,78,11,71


In [9]:
row_labels = ["Morning", "Afternoon", "Evening"]
column_labels = (
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
)

In [10]:
pd.DataFrame(data=random_data, index = row_labels,columns = column_labels)

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday
Morning,87,98,11,96,43
Afternoon,48,33,89,53,21
Evening,76,52,78,11,71


**The Dataframe allows to have duplicates in the index and rows**

In [11]:
# Morning and Tuesday are duplicated
row_labels = ["Morning", "Afternoon", "Morning"]
column_lables_1 = [
    "Monday",
    "Tuesday",
    "Tuesday",
    "Tuesday",
    "Friday"
]

df_Tuesday = pd.DataFrame(
    data = random_data,
    index = row_labels,
    columns = column_lables_1
)

**Bad practise**

In [12]:
df_Tuesday['Tuesday']

Unnamed: 0,Tuesday,Tuesday.1,Tuesday.2
Morning,98,11,96
Afternoon,33,89,53
Morning,52,78,11


#### Import data with CSV

In [13]:
nba_path = '/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 4 Dataframe/Data/nba.csv'

In [16]:
df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'])

  df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'])


**Alternative way to parse date in the Dataframe**

In [15]:
df_nba = pd.read_csv(nba_path)
df_nba['Birthday'] = pd.to_datetime(df_nba['Birthday'], format='%m/%d/%y')
# change the date format to 'YYYY-MM-DD'
df_nba['Birthday'] = df_nba['Birthday'].dt.strftime('%Y-%m-%d')

In [17]:
df_nba.shape

(450, 5)

In [18]:
type(df_nba)

pandas.core.frame.DataFrame

In [19]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


#### Shared and Exclusive Attributes between Series and DataFrames

In [20]:
# check the data types of each column
df_nba.dtypes

Name                object
Team                object
Position            object
Birthday    datetime64[ns]
Salary               int64
dtype: object

In [21]:
# Count values for each data type
df_nba.dtypes.value_counts()

object            3
datetime64[ns]    1
int64             1
Name: count, dtype: int64

In [22]:
# View the index
df_nba.index

RangeIndex(start=0, stop=450, step=1)

In [23]:
# View the columns
df_nba.columns

Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [24]:
# method ndim shows the number of dimensions of the DataFrame = 2 and Series = 1
df_nba.ndim

2

In [25]:
# method shape shows the number of rows and columns
df_nba.shape

(450, 5)

In [26]:
# method size shows the number of elements in the DataFrame
df_nba.size

2250

In [27]:
# method count shows the number of non-null values in each column
df_nba.count()

Name        450
Team        450
Position    450
Birthday    450
Salary      450
dtype: int64

#### Shared Methods between Series and DataFrames

In [28]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [29]:
df_nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [30]:
# method sample shows random rows from the DataFrame
df_nba.sample(5)

Unnamed: 0,Name,Team,Position,Birthday,Salary
54,Patrick McCaw,Toronto Raptors,SF,1995-10-25,4000000
370,Bol Bol,Denver Nuggets,C,1999-11-16,79568
22,George Hill,Milwaukee Bucks,PG,1986-05-04,10133907
9,Torrey Craig,Denver Nuggets,SF,1990-12-19,2000000
311,Cody Martin,Charlotte Hornets,SF,1995-09-28,1173310


In [31]:
# method nunique shows the number of unique values in each column
df_nba.nunique()

Name        450
Team         30
Position      9
Birthday    430
Salary      269
dtype: int64

In [32]:
# method max shows the maximum value in each column. It's not one row with maximum values
df_nba.max()

Name             Zylan Cheatham
Team         Washington Wizards
Position                     SG
Birthday    2000-12-23 00:00:00
Salary                 40231758
dtype: object

In [33]:
df_nba[df_nba['Name'] == 'Zylan Cheatham']

Unnamed: 0,Name,Team,Position,Birthday,Salary
248,Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568


In [34]:
# method min shows the minimum value in each column. It's not one row with minimum values
df_nba.min()

Name               Aaron Gordon
Team              Atlanta Hawks
Position                      C
Birthday    1977-01-26 00:00:00
Salary                    79568
dtype: object

In [35]:
# method nlargest shows the n largest values in a column
df_nba.nlargest(n=4, columns='Salary')

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000


In [36]:
# method nsmallest shows the n smallest values in a column
df_nba.nsmallest(n=4, columns='Birthday')

Unnamed: 0,Name,Team,Position,Birthday,Salary
98,Vince Carter,Atlanta Hawks,PF,1977-01-26,2564753
196,Udonis Haslem,Miami Heat,C,1980-06-09,2564753
262,Kyle Korver,Milwaukee Bucks,PF,1981-03-17,6004753
149,Tyson Chandler,Houston Rockets,C,1982-10-02,2564753


**Find the salary sum**

In [37]:
# numeric_only parameter returns only numeric columns
df_nba.sum(numeric_only=True)

Salary    3444112694
dtype: int64

**Average salary in the NBA**

In [38]:
# method mean shows the mean value in each numeric column
df_nba.mean(numeric_only=True).round(2)

Salary    7653583.76
dtype: float64

In [39]:
# method median shows the median value in each numeric column
df_nba.median(numeric_only=True).round(2)

Salary    3303074.5
dtype: float64

In [40]:
# method mode shows the most frequently occurring value in each column
df_nba.mode(numeric_only=True)

Unnamed: 0,Salary
0,79568


In [41]:
# method std shows the standard deviation in each numeric column
df_nba.std(numeric_only=True).round(2)

Salary    9288810.3
dtype: float64

#### Sorting a DataFrame

In [42]:
# Sorting by a single column
df_nba.sort_values(by='Salary', ascending=False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000
264,James Harden,Houston Rockets,PG,1989-08-26,38199000


In [44]:
# Same result using the .sort_index() method    
df_nba.sort_values('Name').head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
52,Aaron Gordon,Orlando Magic,PF,1995-09-16,19863636
101,Aaron Holiday,Indiana Pacers,PG,1996-09-30,2239200
437,Abdel Nader,Oklahoma City Thunder,SF,1993-09-25,1618520
81,Adam Mokoka,Chicago Bulls,G,1998-07-18,79568
399,Admiral Schofield,Washington Wizards,SF,1997-03-30,1000000


**Sorting by few columns**

In [None]:
# using different sorting orders for multiple columns ascending=[True, False]
df_nba.sort_values(by=['Team', 'Salary'], ascending=[True, False]).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
111,Chandler Parsons,Atlanta Hawks,SF,1988-10-25,25102512
28,Evan Turner,Atlanta Hawks,PG,1988-10-27,18606556
167,Allen Crabbe,Atlanta Hawks,SG,1992-04-09,18500000
213,De'Andre Hunter,Atlanta Hawks,SF,1997-12-02,7068360
339,Jabari Parker,Atlanta Hawks,PF,1995-03-15,6500000


**Sorting by index**

In [46]:
df_nba.sort_index(ascending=True).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [None]:
# Last row in the DataFrame will be the first one
df_nba.sort_index(ascending=False).head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310


**Sorting by Column Index**

In [None]:

df_nba.sort_index(axis='columns').head()

Unnamed: 0,Birthday,Name,Position,Salary,Team
0,1996-09-26,Shake Milton,SG,1445697,Philadelphia 76ers
1,1995-09-27,Christian Wood,PF,1645357,Detroit Pistons
2,1998-08-23,PJ Washington,PF,3831840,Charlotte Hornets
3,1988-10-04,Derrick Rose,PG,7317074,Detroit Pistons
4,1995-07-26,Marial Shayok,G,79568,Philadelphia 76ers


In [51]:
df_nba.sort_index(axis=1).head()

Unnamed: 0,Birthday,Name,Position,Salary,Team
0,1996-09-26,Shake Milton,SG,1445697,Philadelphia 76ers
1,1995-09-27,Christian Wood,PF,1645357,Detroit Pistons
2,1998-08-23,PJ Washington,PF,3831840,Charlotte Hornets
3,1988-10-04,Derrick Rose,PG,7317074,Detroit Pistons
4,1995-07-26,Marial Shayok,G,79568,Philadelphia 76ers


In [None]:
# change the order of columns in the DataFrame using column index
df_nba.sort_index(axis='columns', ascending=False).head()

Unnamed: 0,Team,Salary,Position,Name,Birthday
0,Philadelphia 76ers,1445697,SG,Shake Milton,1996-09-26
1,Detroit Pistons,1645357,PF,Christian Wood,1995-09-27
2,Charlotte Hornets,3831840,PF,PJ Washington,1998-08-23
3,Detroit Pistons,7317074,PG,Derrick Rose,1988-10-04
4,Philadelphia 76ers,79568,G,Marial Shayok,1995-07-26
