#### Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
city_data = {
"City": ["New York City", "Paris", "Barcelona", "Rome"],
"Country": ["United States", "France", "Spain", "Italy"],
"Population": [8600000, 2141000, 5515000, 2873000]
}

In [7]:
# create a DataFrame from a dictionary
city_df = pd.DataFrame(city_data)
city_df

Unnamed: 0,City,Country,Population
0,New York City,United States,8600000
1,Paris,France,2141000
2,Barcelona,Spain,5515000
3,Rome,Italy,2873000


**Transpose data**

In [8]:
city_df.transpose()

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


In [9]:
# the same result using the .T attribute
city_df.T

Unnamed: 0,0,1,2,3
City,New York City,Paris,Barcelona,Rome
Country,United States,France,Spain,Italy
Population,8600000,2141000,5515000,2873000


#### Creating a DataFrame from a NumPy ndarray

In [20]:
random_data = np.random.randint(1, 101, size=(3, 5))
random_data

array([[67, 57, 27, 14, 28],
       [35,  3, 46, 75, 63],
       [30, 22, 92,  1, 86]])

In [34]:
pd.DataFrame(data=random_data)

Unnamed: 0,0,1,2,3,4
0,38,21,50,70,14
1,62,68,66,6,82
2,80,36,14,36,18


**Create a dataframe manualy**

In [35]:
row_labels = ["Morning", "Afternoon", "Evening"]
temperatures = pd.DataFrame(data = random_data, index = row_labels)
temperatures

Unnamed: 0,0,1,2,3,4
Morning,38,21,50,70,14
Afternoon,62,68,66,6,82
Evening,80,36,14,36,18


In [36]:
row_labels = ["Morning", "Afternoon", "Evening"]
column_labels = (
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
)

In [37]:
pd.DataFrame(data=random_data, index = row_labels,columns = column_labels)

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday
Morning,38,21,50,70,14
Afternoon,62,68,66,6,82
Evening,80,36,14,36,18


**The Dataframe allows to have duplicates in the index and rows**

In [43]:
# Morning and Tuesday are duplicated
row_labels = ["Morning", "Afternoon", "Morning"]
column_lables_1 = [
    "Monday",
    "Tuesday",
    "Tuesday",
    "Tuesday",
    "Friday"
]

df_Tuesday = pd.DataFrame(
    data = random_data,
    index = row_labels,
    columns = column_lables_1
)

**Bad practise**

In [44]:
df_Tuesday['Tuesday']

Unnamed: 0,Tuesday,Tuesday.1,Tuesday.2
Morning,21,50,70
Afternoon,68,66,6
Morning,36,14,36


#### Import data with CSV

In [54]:
nba_path = '/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 4 Dataframe/Data/nba.csv'

In [70]:
df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'])

  df_nba = pd.read_csv(nba_path, parse_dates=['Birthday'])


**Alternative way to parse date in the Dataframe**

In [None]:
df_nba = pd.read_csv(nba_path)
df_nba['Birthday'] = pd.to_datetime(df_nba['Birthday'], format='%m/%d/%y')
# change the date format to 'YYYY-MM-DD'
df_nba['Birthday'] = df_nba['Birthday'].dt.strftime('%Y-%m-%d')

In [66]:
df_nba.shape

(450, 5)

In [67]:
type(df_nba)

pandas.core.frame.DataFrame

In [68]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


#### Shared and Exclusive Attributes between Series and DataFrames

In [71]:
# check the data types of each column
df_nba.dtypes

Name                object
Team                object
Position            object
Birthday    datetime64[ns]
Salary               int64
dtype: object

In [72]:
# Count values for each data type
df_nba.dtypes.value_counts()

object            3
datetime64[ns]    1
int64             1
Name: count, dtype: int64

In [73]:
# View the index
df_nba.index

RangeIndex(start=0, stop=450, step=1)

In [74]:
# View the columns
df_nba.columns

Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')

In [None]:
# method ndim shows the number of dimensions of the DataFrame = 2 and Series = 1
df_nba.ndim

2

In [76]:
# method shape shows the number of rows and columns
df_nba.shape

(450, 5)

In [77]:
# method size shows the number of elements in the DataFrame
df_nba.size

2250

In [78]:
# method count shows the number of non-null values in each column
df_nba.count()

Name        450
Team        450
Position    450
Birthday    450
Salary      450
dtype: int64

#### Shared Methods between Series and DataFrames

In [79]:
df_nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,1996-09-26,1445697
1,Christian Wood,Detroit Pistons,PF,1995-09-27,1645357
2,PJ Washington,Charlotte Hornets,PF,1998-08-23,3831840
3,Derrick Rose,Detroit Pistons,PG,1988-10-04,7317074
4,Marial Shayok,Philadelphia 76ers,G,1995-07-26,79568


In [80]:
df_nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
445,Austin Rivers,Houston Rockets,PG,1992-08-01,2174310
446,Harry Giles,Sacramento Kings,PF,1998-04-22,2578800
447,Robin Lopez,Milwaukee Bucks,C,1988-04-01,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,1999-01-04,4764960
449,Ricky Rubio,Phoenix Suns,PG,1990-10-21,16200000


In [81]:
# method sample shows random rows from the DataFrame
df_nba.sample(5)

Unnamed: 0,Name,Team,Position,Birthday,Salary
25,Tyler Cook,Cleveland Cavaliers,PF,1997-09-23,79568
44,Darius Bazley,Oklahoma City Thunder,PF,2000-06-12,2284800
242,Fred VanVleet,Toronto Raptors,PG,1994-02-25,9346153
281,James Ennis,Philadelphia 76ers,SF,1990-07-01,1882867
411,Chris Clemons,Houston Rockets,SG,1997-07-23,79568


In [82]:
# method nunique shows the number of unique values in each column
df_nba.nunique()

Name        450
Team         30
Position      9
Birthday    430
Salary      269
dtype: int64

In [None]:
# method max shows the maximum value in each column. It's not one row with maximum values
df_nba.max()

Name             Zylan Cheatham
Team         Washington Wizards
Position                     SG
Birthday    2000-12-23 00:00:00
Salary                 40231758
dtype: object

In [84]:
df_nba[df_nba['Name'] == 'Zylan Cheatham']

Unnamed: 0,Name,Team,Position,Birthday,Salary
248,Zylan Cheatham,New Orleans Pelicans,SF,1995-11-17,79568


In [85]:
# method min shows the minimum value in each column. It's not one row with minimum values
df_nba.min()

Name               Aaron Gordon
Team              Atlanta Hawks
Position                      C
Birthday    1977-01-26 00:00:00
Salary                    79568
dtype: object

In [88]:
# method nlargest shows the n largest values in a column
df_nba.nlargest(n=4, columns='Salary')

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,1988-03-14,40231758
38,Chris Paul,Oklahoma City Thunder,PG,1985-05-06,38506482
219,Russell Westbrook,Houston Rockets,PG,1988-11-12,38506482
251,John Wall,Washington Wizards,PG,1990-09-06,38199000


In [89]:
# method nsmallest shows the n smallest values in a column
df_nba.nsmallest(n=4, columns='Birthday')

Unnamed: 0,Name,Team,Position,Birthday,Salary
98,Vince Carter,Atlanta Hawks,PF,1977-01-26,2564753
196,Udonis Haslem,Miami Heat,C,1980-06-09,2564753
262,Kyle Korver,Milwaukee Bucks,PF,1981-03-17,6004753
149,Tyson Chandler,Houston Rockets,C,1982-10-02,2564753


**Find the salary sum**

In [None]:
# numeric_only parameter returns only numeric columns
df_nba.sum(numeric_only=True)

Salary    3444112694
dtype: int64