In [2]:
import pandas as pd      #Used to import the Pandas library and assign it the alias "pd" for more convenient usage.

In [3]:
df = pd.read_csv("healthexp.csv")    #Reads data from a CSV file and stores it in a Pandas DataFrame named "df," making the data easily accessible.

In [4]:
df.head()    #Used to display the first few rows of a Pandas DataFrame/df. By default, it shows the first 5 rows.

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [5]:
df['Country'].dtype    #Used to check the data type of the column named "Country" within the DataFrame.

dtype('O')

## Attributes

In [6]:
df.dtypes     #Used to display the data types of all the columns in df.

Year                 int64
Country             object
Spending_USD       float64
Life_Expectancy    float64
dtype: object

In [7]:
df.columns   #Returns the names of all the columns in the Pandas DataFrame.

Index(['Year', 'Country', 'Spending_USD', 'Life_Expectancy'], dtype='object')

In [8]:
df.axes      #Returns a list of axes for the df, including both the row labels (the index) and column labels.

[RangeIndex(start=0, stop=274, step=1),
 Index(['Year', 'Country', 'Spending_USD', 'Life_Expectancy'], dtype='object')]

In [9]:
df.ndim     #Returns the number of dimensions (usually 2 for rows and columns) in a df.

2

In [10]:
df.size  #Used to determine the total number of elements (cells) in df.

1096

In [11]:
df.shape   #Used to retrieve the dimensions (number of rows and columns) of a df.

(274, 4)

In [12]:
df.values   #Used to return the underlying NumPy array containing the data in a df.

array([[1970, 'Germany', 252.311, 70.6],
       [1970, 'France', 192.143, 72.2],
       [1970, 'Great Britain', 123.993, 71.9],
       ...,
       [2020, 'Great Britain', 5018.7, 80.4],
       [2020, 'Japan', 4665.641, 84.7],
       [2020, 'USA', 11859.179, 77.0]], dtype=object)

## Methods

In [13]:
#Method is a function that is associated with an object. Methods are used to perform actions or operations on an object.

In [14]:
df.describe()   #is a Pandas df method that provides summary statistics (count, mean, min, max etc.) for the numerical columns in df.

Unnamed: 0,Year,Spending_USD,Life_Expectancy
count,274.0,274.0,274.0
mean,1996.992701,2789.338905,77.909489
std,14.180933,2194.939785,3.276263
min,1970.0,123.993,70.6
25%,1985.25,1038.357,75.525
50%,1998.0,2295.578,78.1
75%,2009.0,4055.61,80.575
max,2020.0,11859.179,84.7


In [15]:
df.max()    #method in Pandas is used to find the maximum value in each numerical column of a DataFrame. 

Year                    2020
Country                  USA
Spending_USD       11859.179
Life_Expectancy         84.7
dtype: object

In [16]:
df.min()    #method in Pandas is used to find the minimum value in each numerical column of a DataFrame. 

Year                  1970
Country             Canada
Spending_USD       123.993
Life_Expectancy       70.6
dtype: object

In [17]:
df.mean()    # this method calculates the average of each numerical column in a DataFrame. 

  df.mean()    # this method calculates the average of each numerical column in a DataFrame.


Year               1996.992701
Spending_USD       2789.338905
Life_Expectancy      77.909489
dtype: float64

In [18]:
df.median()  #method that calculates the middle value for each numerical column, providing a measure of the central tendency for the data.

  df.median()  #method that calculates the middle value for each numerical column, providing a measure of the central tendency for the data.


Year               1998.000
Spending_USD       2295.578
Life_Expectancy      78.100
dtype: float64

In [19]:
df.std()  #method in Pandas is used to calculate the standard deviation of each numerical column in a DF.

  df.std()  #method in Pandas is used to calculate the standard deviation of each numerical column in a DF.


Year                 14.180933
Spending_USD       2194.939785
Life_Expectancy       3.276263
dtype: float64

In [20]:
df.sample()   #method is used to randomly select a specified number of rows (or a fraction of rows) from a DataFrame. 

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
106,1993,Canada,1930.889,77.8


In [21]:
df.dropna() #method in Pandas is used to remove rows with missing/NaN values. 

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
...,...,...,...,...
269,2020,Germany,6938.983,81.1
270,2020,France,5468.418,82.3
271,2020,Great Britain,5018.700,80.4
272,2020,Japan,4665.641,84.7


In [22]:
df[["Life_Expectancy"]]    #used to select and display a single column.

Unnamed: 0,Life_Expectancy
0,70.6
1,72.2
2,71.9
3,72.0
4,70.9
...,...
269,81.1
270,82.3
271,80.4
272,84.7


In [23]:
df.Life_Expectancy    #another way to select and display a single column.

0      70.6
1      72.2
2      71.9
3      72.0
4      70.9
       ... 
269    81.1
270    82.3
271    80.4
272    84.7
273    77.0
Name: Life_Expectancy, Length: 274, dtype: float64

In [24]:
df.Spending_USD

0        252.311
1        192.143
2        123.993
3        150.437
4        326.961
         ...    
269     6938.983
270     5468.418
271     5018.700
272     4665.641
273    11859.179
Name: Spending_USD, Length: 274, dtype: float64

In [25]:
df[['Spending_USD','Life_Expectancy']]    #used to select and display specific columns.

Unnamed: 0,Spending_USD,Life_Expectancy
0,252.311,70.6
1,192.143,72.2
2,123.993,71.9
3,150.437,72.0
4,326.961,70.9
...,...,...
269,6938.983,81.1
270,5468.418,82.3
271,5018.700,80.4
272,4665.641,84.7


In [26]:
df_exp = df.groupby(['Life_Expectancy'])  #Used to group the rows in a df, by the unique values in the "Life_Expectancy" column.This code creates a new object, df_exp(Pandas GroupBy object). 

In [27]:
df_exp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CAAFC56FD0>

In [28]:
df_exp.mean()

  df_exp.mean()


Unnamed: 0_level_0,Year,Spending_USD
Life_Expectancy,Unnamed: 1_level_1,Unnamed: 2_level_1
70.6,1970.0,252.3110
70.8,1971.0,298.2510
70.9,1970.0,326.9610
71.0,1972.0,337.3640
71.2,1971.5,377.5425
...,...,...
84.1,2016.0,4295.8580
84.2,2017.0,4412.8520
84.3,2018.0,4554.2760
84.4,2019.0,4610.7940


In [29]:
df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [30]:
df.iloc[[0]]   # iloc(integer-location) used to select a specific row from a df (within index 0)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6


In [31]:
df.iloc[[4]]

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
4,1970,USA,326.961,70.9


In [32]:
df.iloc[[1]]

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
1,1970,France,192.143,72.2


In [33]:
df.iloc[[-1]]     #will return a new DataFrame with the last row / selecting the last row of a df.

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
273,2020,USA,11859.179,77.0


In [34]:
df.iloc[:,0]     #select all rows, and , 0 selects the first column (column at index 0).

0      1970
1      1970
2      1970
3      1970
4      1970
       ... 
269    2020
270    2020
271    2020
272    2020
273    2020
Name: Year, Length: 274, dtype: int64

In [35]:
df.iloc[:,-1]    #select all rows from the last column of a df.

0      70.6
1      72.2
2      71.9
3      72.0
4      70.9
       ... 
269    81.1
270    82.3
271    80.4
272    84.7
273    77.0
Name: Life_Expectancy, Length: 274, dtype: float64

In [37]:
df.iloc[0:5,:]  #used to select rows from index 0 to 4 (the first five rows) of all columns in a df.

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [38]:
df.iloc[2:10,0:2]  #select a subset of rows and columns from a df.

Unnamed: 0,Year,Country
2,1970,Great Britain
3,1970,Japan
4,1970,USA
5,1971,Canada
6,1971,Germany
7,1971,Great Britain
8,1971,Japan
9,1971,USA


In [39]:
df.head(1)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6


In [40]:
df.iloc[:,3:-1] #used to select a subset of columns from a df.

0
1
2
3
4
...
269
270
271
272
273


In [41]:
df.iloc[[1,5],[0, 3]]  #select specific rows and columns from a df.

Unnamed: 0,Year,Life_Expectancy
1,1970,72.2
5,1971,72.8


In [42]:
df[['Spending_USD','Life_Expectancy']].agg(['min','mean','max'])  #used to calculate specific summary statistics (min, mean & max) for the both columns.

Unnamed: 0,Spending_USD,Life_Expectancy
min,123.993,70.6
mean,2789.338905,77.909489
max,11859.179,84.7
