## Installing Pandas Library

In [2]:
pip install pandas





[notice] A new release of pip available: 22.3 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import the module

In [3]:
import pandas as pd

print(pd.__version__)

2.0.2


#  Series

##  Series Creation

In [4]:
import pandas as pd
series1=pd.Series([35.467,63.951,80.940,60.665,127.067,64.511,318.523])
series1

0     35.467
1     63.951
2     80.940
3     60.665
4    127.067
5     64.511
6    318.523
dtype: float64

In [5]:
l1=[10,"hi",30.5,True,50]
new_series1=pd.Series(l1)
new_series1

0      10
1      hi
2    30.5
3    True
4      50
dtype: object

In [6]:
dict_series=pd.Series({'Canada':35.4,'France':63.9,'Italy':53.9})
dict_series

Canada    35.4
France    63.9
Italy     53.9
dtype: float64

### Index

In [7]:
series2=pd.Series([35.467,63.951,80.940,60.665,127.067,64.511,318.523],index=["Canada","Italy","USA","UK","France","Germany","Japan"])
series2

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
dtype: float64

In [8]:
data=[10,20,30,40,50]
index_labels=['A','B','C','D','E']
series3=pd.Series(data,index=index_labels)
series3

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [9]:
series3.index=[1,2,3,4,5]
series3

1    10
2    20
3    30
4    40
5    50
dtype: int64

### Name

In [10]:
series2.name="Pandas Series"
series2

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64

## Indexing and Slicing

### Accessing a single element by label or index:

In [11]:
series2

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64

In [12]:
series2[1]

63.951

In [13]:
series2['Canada']

35.467

### Slicing multiple elements by label or index:

In [14]:
print(series2[1:4])  #Exclusive

Italy    63.951
USA      80.940
UK       60.665
Name: Pandas Series, dtype: float64


In [15]:
print(series2['UK':'Japan']) #Inclusive

UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64


### Boolean indexing using a conditional expression: OR Conditional Filtering

In [16]:
print(series2[series2>50])

Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64


### Using loc -  ( Label based indexing )

In [17]:
print(series2.loc['USA']) #Access single element by label

80.94


In [18]:
print(series2.loc[['USA','Japan']]) #Access multiple elements by label

USA       80.940
Japan    318.523
Name: Pandas Series, dtype: float64


In [19]:
print(series2.loc['USA':'Germany']) #Slice elements by label (Inclusive)

USA         80.940
UK          60.665
France     127.067
Germany     64.511
Name: Pandas Series, dtype: float64


### Using iloc -  (Integer based indexing)

In [20]:
print(series2.iloc[2]) #Access single element by index

80.94


In [21]:
print(series2.iloc[[0,2]]) #Access multiple element by indices

Canada    35.467
USA       80.940
Name: Pandas Series, dtype: float64


In [22]:
print(series2.iloc[1:4])   #Slice elements by indices (Exclusive)

Italy    63.951
USA      80.940
UK       60.665
Name: Pandas Series, dtype: float64


## Mathematical Operations with Series

In [23]:
import pandas as pd
math_series1=pd.Series([10,20,30])
math_series2=pd.Series([1,2,3])

In [24]:
#Addition
print(math_series1+math_series2)

0    11
1    22
2    33
dtype: int64


In [25]:
#Subtraction
print(math_series1-math_series2)

0     9
1    18
2    27
dtype: int64


In [26]:
#Multiplication
print(math_series1*math_series2)

0    10
1    40
2    90
dtype: int64


In [27]:
#Division
print(math_series1/math_series2)

0    10.0
1    10.0
2    10.0
dtype: float64


In [28]:
#Exponentiation
print(math_series1**2)

0    100
1    400
2    900
dtype: int64


## Aggregate Functions

In [29]:
series2

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64

In [30]:
#Sum
print(series2.sum())

751.124


In [31]:
#Mean
print(series2.mean())

107.30342857142857


In [32]:
#Median
print(series2.median())

64.511


In [33]:
#Max
print(series2.max())

318.523


In [34]:
#Min
print(series2.min())

35.467


In [35]:
#Standard Deviation
print(series2.std())

97.25017306900992


In [36]:
#Variance
print(series2.var())

9457.596161952382


In [37]:
series2>70

Canada     False
Italy      False
USA         True
UK         False
France      True
Germany    False
Japan       True
Name: Pandas Series, dtype: bool

In [38]:
#Conditional Filtering
series2[series2>70]

USA        80.940
France    127.067
Japan     318.523
Name: Pandas Series, dtype: float64

## Modification

In [39]:
mod_series1=pd.Series([35.467,63.951,80.940,60.665,127.067,64.511,318.523],index=["Canada","Italy","USA","UK","France","Germany","Japan"])
mod_series1

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
dtype: float64

In [40]:
#Using index
mod_series1['canada']=55.6
mod_series1

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
canada      55.600
dtype: float64

In [41]:
#Using boolean indexing
mod_series2=pd.Series([35.467,63.951,80.940,60.665,127.067,64.511,318.523],index=["Canada","Italy","USA","UK","France","Germany","Japan"])
mod_series2

Canada      35.467
Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
dtype: float64

In [42]:
mod_series2[mod_series2>70]=0
mod_series2

Canada     35.467
Italy      63.951
USA         0.000
UK         60.665
France      0.000
Germany    64.511
Japan       0.000
dtype: float64

In [43]:
#Using a function
mod_series3=pd.Series([10,20,30,40,50])
mod_series3

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [44]:
mod_series3=mod_series3.apply(lambda x:x*2)
mod_series3

0     20
1     40
2     60
3     80
4    100
dtype: int64

In [45]:
#Using replace()

In [46]:
mod_series4=pd.Series([10,20,30,40,50])
mod_series4

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [47]:
mod_series4=mod_series4.replace({20:25,30:35})
mod_series4

0    10
1    25
2    35
3    40
4    50
dtype: int64

## Logical operations

In [48]:
lg_series=pd.Series([10,20,30,40,50])
lg_series

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [49]:
#AND
result_and=lg_series[(lg_series>20)&(lg_series<40)]
result_and

2    30
dtype: int64

In [50]:
#OR
result_or=lg_series[(lg_series<20)|(lg_series>40)]
result_or

0    10
4    50
dtype: int64

In [51]:
#NOT
result_not=lg_series[~(lg_series>30)]
result_not

0    10
1    20
2    30
dtype: int64

In [52]:
#Evaluating an expression
print(series2[(series2>series2.mean()-series2.std()/2)|(series2>series2.mean()+series2.std()/2)])

Italy       63.951
USA         80.940
UK          60.665
France     127.067
Germany     64.511
Japan      318.523
Name: Pandas Series, dtype: float64


# Data Frame

## Creating a Data Frame

In [53]:
#Creating a data frame using dictionary
import pandas as pd
data1={
    'Name':['John','Jane','Mike','Sara'],
    'Age':[25,30,35,28],
    'City':['New York','London','Paris','Sidney']
}
df1=pd.DataFrame(data1)
df1

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Sara,28,Sidney


In [54]:
#Creating a data frame from a list of dictionaries
data2=[{'Name':'John','Age':30},{'Name':'Jane','Age':35},{'Name':'Mike','Age':28}]
df2=pd.DataFrame(data2)
df2

Unnamed: 0,Name,Age
0,John,30
1,Jane,35
2,Mike,28


In [55]:
#Creating a data frame from a numpy array
import pandas as pd
import numpy as np
data3=np.array([[1,2,3],[4,5,6],[7,8,9]])
columns=['A','B','C']
df3=pd.DataFrame(data3,columns=columns)
df3

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [56]:
# Giving indices to dataframe
import pandas as pd
import numpy as np
df4=pd.DataFrame(np.array([[1,2,3],[4,5,6],[7,8,9]]),index=['I','II','III'],columns=['A','B','C'])
df4

Unnamed: 0,A,B,C
I,1,2,3
II,4,5,6
III,7,8,9


In [57]:
import pandas as pd
df5 = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94 , 60.665, 127.061, 64.511, 318.523],
    'GDP': [1785387,2833687,3874437,2167744,4602367,2950039,17348075],
    'Surface Area': [9984670,640679,357114,301336,377930,242495,9525067],
    'HDI': [0.913,0.888,0.916,0.873,0.891,0.907,0.915],
    'Continent': ['America','Europe','Europe','Europe','Asia','Europe','America']
}) 
    
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [58]:
df5.index = ['Canada','France','Germany','Italy','Japan','United Kingdom','United States',]

In [59]:
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


## Indexing and Slicing

In [60]:
#Accessing row using label
df5.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [61]:
df5.loc['Italy']

Population       60.665
GDP             2167744
Surface Area     301336
HDI               0.873
Continent        Europe
Name: Italy, dtype: object

In [62]:
#Accessing row using index
df5.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [63]:
df5.loc[-1]

KeyError: -1

In [64]:
df5.iloc['Canada']

TypeError: Cannot index by location index with a non-integer key

In [66]:
df5['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [67]:
#Accessing multiple columns
df5[['Population','GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [68]:
#Slicing
df5[1:3]  #OR df5.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [69]:
df5['France':'Italy']  #Or df5.loc['France':'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [70]:
df5.loc['France':'Italy','Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [71]:
df5.loc['France':'Italy',['Population','GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [72]:
df5.iloc[[0,1,-1,-2]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America
United Kingdom,64.511,2950039,242495,0.907,Europe


## Conditional Selection (Boolean array)

In [73]:
df5['Population']>70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [74]:
df5[df5['Population']>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [75]:
df5.loc[df5['Population']>70,'Population']

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

In [76]:
df5.loc[df5['Population']>70,['Population','GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


# Data Exploration and Cleaning

## Examining Dataframe structure

In [77]:
#To display column names
df5.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [78]:
#To display index
df5.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [79]:
#To print summary of data frame
df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 636.0+ bytes


In [80]:
#Example of data frame containing null values
data3=[{'Name':'John','Age':30},{'Name':'Jane','Age':35},{'name':'Mike','Age':28}]
df6=pd.DataFrame(data3)
df6

Unnamed: 0,Name,Age,name
0,John,30,
1,Jane,35,
2,,28,Mike


In [81]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    2 non-null      object
 1   Age     3 non-null      int64 
 2   name    1 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [82]:
#Example of data frame containing null values
import numpy as np
df7=pd.DataFrame([[1,np.nan,2],[np.nan,5,6]])
df7

Unnamed: 0,0,1,2
0,1.0,,2
1,,5.0,6


In [83]:
#To print dimension
df5.shape

(7, 5)

In [84]:
df5.size

35

In [85]:
df5.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [86]:
#Explore basic statistical summaries
df5.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [87]:
df5.describe(include='all')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
count,7.0,7.0,7.0,7.0,7
unique,,,,,3
top,,,,,Europe
freq,,,,,4
mean,107.302571,5080248.0,3061327.0,0.900429,
std,97.24997,5494020.0,4576187.0,0.016592,
min,35.467,1785387.0,242495.0,0.873,
25%,62.308,2500716.0,329225.0,0.8895,
50%,64.511,2950039.0,377930.0,0.907,
75%,104.0005,4238402.0,5082873.0,0.914,


In [88]:
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [89]:
df5.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia


In [90]:
df5.head(3)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [91]:
df5.tail()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [92]:
df5.tail(2)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [93]:
df5['Continent'].value_counts()

Continent
Europe     4
America    2
Asia       1
Name: count, dtype: int64

In [94]:
df5['Population'].value_counts()

Population
35.467     1
63.951     1
80.940     1
60.665     1
127.061    1
64.511     1
318.523    1
Name: count, dtype: int64

In [95]:
#Sorting
df5.sort_values(by='Population') #Ascending order

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
Italy,60.665,2167744,301336,0.873,Europe
France,63.951,2833687,640679,0.888,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [96]:
df5.sort_values(by='Population',ascending=False) #Descending order

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
United States,318.523,17348075,9525067,0.915,America
Japan,127.061,4602367,377930,0.891,Asia
Germany,80.94,3874437,357114,0.916,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
France,63.951,2833687,640679,0.888,Europe
Italy,60.665,2167744,301336,0.873,Europe
Canada,35.467,1785387,9984670,0.913,America


In [97]:
#Dropping single row
df5.drop('Canada') #Won't reflect in df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [98]:
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [99]:
drop_df5=df5.drop('Canada')

In [100]:
drop_df5 #Change reflected

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [101]:
#inplace
df5.drop('Canada',inplace=True)

In [102]:
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [103]:
#Dropping multiple rows
df5.drop(['Italy','Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [104]:
df5.drop(['Italy','Japan'],axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [105]:
#Dropping single column
df5.drop(columns='Population')

Unnamed: 0,GDP,Surface Area,HDI,Continent
France,2833687,640679,0.888,Europe
Germany,3874437,357114,0.916,Europe
Italy,2167744,301336,0.873,Europe
Japan,4602367,377930,0.891,Asia
United Kingdom,2950039,242495,0.907,Europe
United States,17348075,9525067,0.915,America


In [106]:
#Dropping multiple column
df5.drop(columns=['Population','GDP'])

Unnamed: 0,Surface Area,HDI,Continent
France,640679,0.888,Europe
Germany,357114,0.916,Europe
Italy,301336,0.873,Europe
Japan,377930,0.891,Asia
United Kingdom,242495,0.907,Europe
United States,9525067,0.915,America


In [107]:
df5.drop(columns=['Population','HDI'],axis=1)

Unnamed: 0,GDP,Surface Area,Continent
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [108]:
df5.drop(columns=['Population','HDI'],axis='columns')

Unnamed: 0,GDP,Surface Area,Continent
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


## Handling missing data

In [109]:
df7=pd.DataFrame(np.array([['John',25, 5000],['Alice',None,7000],['Bob',18,None],[None,40,6000]]),columns=['Name','Age','Salary'])
df7

Unnamed: 0,Name,Age,Salary
0,John,25.0,5000.0
1,Alice,,7000.0
2,Bob,18.0,
3,,40.0,6000.0


In [110]:
#isnull()
print(df7.isnull())  

    Name    Age  Salary
0  False  False   False
1  False   True   False
2  False  False    True
3   True  False   False


In [111]:
#notnull()
print(df7.notnull()) 

    Name    Age  Salary
0   True   True    True
1   True  False    True
2   True   True   False
3  False   True    True


In [112]:
#drop null
df7.dropna() 

Unnamed: 0,Name,Age,Salary
0,John,25,5000


In [113]:
#fill null
df7.fillna('Unknown')

Unnamed: 0,Name,Age,Salary
0,John,25,5000
1,Alice,Unknown,7000
2,Bob,18,Unknown
3,Unknown,40,6000


In [114]:
df7.fillna(value={'Name':'X','Age':20,'Salary':5000})

Unnamed: 0,Name,Age,Salary
0,John,25,5000
1,Alice,20,7000
2,Bob,18,5000
3,X,40,6000


### Creating custom columns based on existing columns
Altering a dataframe often involves combining different columns into another. 

In [115]:
df5[['Population','GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [116]:
df5[['Population','GDP']]/100 #Each value divided by 100

Unnamed: 0,Population,GDP
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [117]:
#Creating a new series 
crisis=pd.Series([-1_000_000,-3],index=['GDP','HDI'])
crisis

GDP   -1000000
HDI         -3
dtype: int64

In [118]:
df5[['GDP','HDI']]

Unnamed: 0,GDP,HDI
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [119]:
#Values of GDP and HDI are updated with new values
df5[['GDP','HDI']]+crisis

Unnamed: 0,GDP,HDI
France,1833687,-2.112
Germany,2874437,-2.084
Italy,1167744,-2.127
Japan,3602367,-2.109
United Kingdom,1950039,-2.093
United States,16348075,-2.085


In [135]:
df5[['Population','GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687.0
Germany,80.94,3874437.0
Italy,60.665,2167744.0
Japan,127.061,4602367.0
United Kingdom,64.511,2950039.0
United States,318.523,17348075.0
china,1400000000.0,


In [138]:
#Calculating GDP per capita=GDP/Population
df5['Population']/df5['GDP']

France            0.000023
Germany           0.000021
Italy             0.000028
Japan             0.000028
United Kingdom    0.000022
United States     0.000018
china                  NaN
dtype: float64

### Adding new column

In [142]:
#Adding new column - GDP per capita
df5['GDP per Capita']=df5['GDP']/df5['Population']
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capita
France,63.951,2833687.0,640679.0,0.888,Europe,English,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,English,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,English,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,English,36221.712406
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,English,45729.239975
United States,318.523,17348075.0,9525067.0,0.915,America,English,54464.12033
china,1400000000.0,,,,Asia,,


In [120]:
langs=pd.Series(['French','German','Italian'],
               index=['France','Germany','Italy'],
               name='Language')
langs

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [121]:
#Adding new column
df5['Language']=langs
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


### Replacing values per column

In [122]:
df5['Language']=df5['Language'].replace(['French'],'English')
#occurences of french replaced to english
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [123]:
df5['Language']='English'
#all values are replaced as english
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


### Renaming columns

In [124]:
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


In [125]:
df5.rename(columns={'HDI':'Human Development Index','Anual Popcorn Consumption':'APC'},
          index={'United States':'USA','United Kingdom':'UK','Argentina':'AR'})

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [126]:
#To convert indices to uppercase
df5.rename(index=str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


In [127]:
df5.rename(index=str.lower)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
united kingdom,64.511,2950039,242495,0.907,Europe,English
united states,318.523,17348075,9525067,0.915,America,English


In [128]:
df5.rename(columns=lambda x:x.lower())

Unnamed: 0,population,gdp,surface area,hdi,continent,language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


### Updating data in dataframe

In [130]:
df5.loc['china']=[3,5,0,0,0,0]
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English
china,3.0,5,0,0.0,0,0


In [133]:
#Updating china
df5.loc['china']=pd.Series({'Population':1_400_000_000,'Continent':'Asia'})
df5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
France,63.951,2833687.0,640679.0,0.888,Europe,English
Germany,80.94,3874437.0,357114.0,0.916,Europe,English
Italy,60.665,2167744.0,301336.0,0.873,Europe,English
Japan,127.061,4602367.0,377930.0,0.891,Asia,English
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,English
United States,318.523,17348075.0,9525067.0,0.915,America,English
china,1400000000.0,,,,Asia,


## Statistical Info for more data analysis
You've already seen the describe method, which gives you a good "summary" of the DataFrame. Let's explore other methods in more detail: 

In [144]:
population=df5['Population']

In [145]:
population.min(),population.max()

(60.665, 1400000000.0)

In [146]:
population.sum()

1400000715.651

In [147]:
population.sum()/len(population)

200000102.23585716

In [148]:
population.mean()

200000102.23585716

In [149]:
population.median()

80.94

In [150]:
population.mode()

0    6.066500e+01
1    6.395100e+01
2    6.451100e+01
3    8.094000e+01
4    1.270610e+02
5    3.185230e+02
6    1.400000e+09
Name: Population, dtype: float64

In [151]:
population.std()

529150217.13115054

In [152]:
population.var()

2.7999995228994378e+17

In [153]:
(2.7999995228994378e+17)**.5 #std=√var

529150217.13115054

In [154]:
population.quantile(.25)

64.231

In [156]:
population.quantile([.2,.4,.6,.8,1])

0.2    6.406300e+01
0.4    7.108260e+01
0.6    1.086126e+02
0.8    2.802306e+02
1.0    1.400000e+09
Name: Population, dtype: float64

## Handling different file formats
In pandas, you can convert data between different formats using various methods and functions provided by the library.

In [157]:
#DataFrame to csv format
df5.to_csv('sample_file.csv')

In [158]:
s1=pd.read_csv('sample_file.csv')

In [159]:
s1.head()

Unnamed: 0.1,Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capita
0,France,63.951,2833687.0,640679.0,0.888,Europe,English,44310.284437
1,Germany,80.94,3874437.0,357114.0,0.916,Europe,English,47868.013343
2,Italy,60.665,2167744.0,301336.0,0.873,Europe,English,35733.025633
3,Japan,127.061,4602367.0,377930.0,0.891,Asia,English,36221.712406
4,United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,English,45729.239975
