In [1]:
#https://github.com/Nyandwi/machine_learning_complete/blob/main/2_data_manipulation_with_pandas/2_data_manipulation_with_pandas.ipynb

In [2]:
"""
Both Series and DataFrames are Pandas data structures.

A Series is like a one-dimensional NumPy array with axis labels.

A DataFrame is like a two-dimensional NumPy array with labels on rows and columns.

Working with NumPy, we saw that it supports numeric data. Pandas, on the other hand, supports a whole range of data types, from numeric to strings, etc.

Since we are using a Python notebook, we do not need to install Pandas. We only have to import it.
"""

'\nBoth Series and DataFrames are Pandas data structures.\n\nA Series is like a one-dimensional NumPy array with axis labels.\n\nA DataFrame is like a two-dimensional NumPy array with labels on rows and columns.\n\nWorking with NumPy, we saw that it supports numeric data. Pandas, on the other hand, supports a whole range of data types, from numeric to strings, etc.\n\nSince we are using a Python notebook, we do not need to install Pandas. We only have to import it.\n'

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Creating series from python list
num_list = [1,2,3,4,5]
pd.Series(num_list)

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
week_days = ['Mon','Tues','Wed','Thurs','Fri']
pd.Series(week_days, index=['a','b','c','d','e'])

a      Mon
b     Tues
c      Wed
d    Thurs
e      Fri
dtype: object

In [6]:
#Creating the Series from dictionary
countries_code = { 1:'US', 91:'India', 86:'China'}
pd.Series(countries_code)

1        US
91    India
86    China
dtype: object

In [7]:
d = {1:'a',2:'d',3:'c'}
pd.Series(d)

1    a
2    d
3    c
dtype: object

In [8]:
# Creating the Series from NumPy array
# An index can be provided as a list of labels (see the next cell)
# If we don't provide one, the default index is the integers 0, 1, 2, ...
arr = np.array([1,2,3,4,5])
pd.Series(arr)

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [9]:
pd.Series(arr, index=['a','b','c','d','e'])

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [10]:
"""
Creating DataFrames
DataFrames are the most used Pandas data structure. It can be created from a dictionary, 2D array, and Series.
"""

'\nCreating DataFrames\nDataFrames are the most used Pandas data structure. It can be created from a dictionary, 2D array, and Series.\n'

In [11]:
# Creating DataFrame from a dictionary
# Each key becomes a column name and each list holds that column's values.
# NOTE(review): CHINA is paired with 36 here, while the Series example earlier
# in this notebook maps 86 to 'China' — confirm which value is intended.

countries = {'Name':['USA','INDIA','CHINA'],
            'Codes':[1,91,36]}
pd.DataFrame(countries)

Unnamed: 0,Name,Codes
0,USA,1
1,INDIA,91
2,CHINA,36


In [12]:
# Creating a dataframe from a 2D array
# You pass the list of columns

array_2d = np.array([[1,2,3], [4,5,6], [7,8,9]])
pd.DataFrame(array_2d, columns = ['cols_1','cols_2','cols_3'])

Unnamed: 0,cols_1,cols_2,cols_3
0,1,2,3
1,4,5,6
2,7,8,9


In [13]:
# Creating a dataframe from a Pandas Series
# The Series values become the single column and its labels (1, 91, 86)
# become the row index. The column name is passed in a list.
countries_code = { 1:'US', 91:'India', 86:'China'}
pd_series = pd.Series(countries_code)

df = pd.DataFrame(pd_series, columns=['Codes'])
df

Unnamed: 0,Codes
1,US
91,India
86,China


In [14]:
# Adding a column
# Number in population are pretty random

df['Population'] = [100, 1000, 1250]
df

Unnamed: 0,Codes,Population
1,US,100
91,India,1000
86,China,1250


In [15]:
# Dropping a column by name.
# drop() hands back a new DataFrame; the result is not assigned here,
# so df itself still contains 'Population' afterwards.
df.drop(columns='Population')

Unnamed: 0,Codes
1,US
91,India
86,China


In [16]:
df.columns

Index(['Codes', 'Population'], dtype='object')

In [17]:
# keys() has to be called; on a DataFrame it returns the column labels,
# i.e. the same Index as df.columns.
df.keys()

Index(['Codes', 'Population'], dtype='object')

In [18]:
df.index

Int64Index([1, 91, 86], dtype='int64')

In [19]:
"""
B. Data Indexing and Selection
Indexing and selection works in both Series and Dataframe.

Because DataFrame is made of Series, let's focus on how to select data in DataFrame.
"""

"\nB. Data Indexing and Selection\nIndexing and selection works in both Series and Dataframe.\n\nBecause DataFrame is made of Series, let's focus on how to select data in DataFrame.\n"

In [20]:
# Build the sample frame used by the selection examples:
# a 'Name' column, a 'Codes' column, and rows labelled 'a' to 'd'

countries = dict(
    Name=['USA', 'INDIA', 'Germany', 'Rwanda'],
    Codes=[1, 91, 49, 250],
)
row_labels = list('abcd')
df = pd.DataFrame(countries, index=row_labels)
df

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91
c,Germany,49
d,Rwanda,250


In [21]:
df['Name']

a        USA
b      INDIA
c    Germany
d     Rwanda
Name: Name, dtype: object

In [22]:
df.Name

a        USA
b      INDIA
c    Germany
d     Rwanda
Name: Name, dtype: object

In [23]:
df['Codes']

a      1
b     91
c     49
d    250
Name: Codes, dtype: int64

In [24]:
## To select several columns at once, pass a list of column names

df[['Name','Codes']]

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91
c,Germany,49
d,Rwanda,250


In [25]:
# This will return the first two rows
df [0:2]

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91


In [26]:
'You can also use loc to select data by the label indexes and iloc to select by default integer index (or by the position of the row)'

'You can also use loc to select data by the label indexes and iloc to select by default integer index (or by the position of the row)'

In [27]:
df.loc['a']

Name     USA
Codes      1
Name: a, dtype: object

In [28]:
df.loc['b':'d']

Unnamed: 0,Name,Codes
b,INDIA,91
c,Germany,49
d,Rwanda,250


In [29]:
df[:'b']

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91


In [30]:
df.iloc[2]

Name     Germany
Codes         49
Name: c, dtype: object

In [31]:
df.iloc[1:3]

Unnamed: 0,Name,Codes
b,INDIA,91
c,Germany,49


In [32]:
df.iloc[2:]

Unnamed: 0,Name,Codes
c,Germany,49
d,Rwanda,250


In [33]:
#Conditional Selection

In [34]:
df

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91
c,Germany,49
d,Rwanda,250


In [35]:
#Let's select a country with code 91

df[df["Codes"] == 91]

Unnamed: 0,Name,Codes
b,INDIA,91


In [36]:
df[df['Codes'] < 250]

Unnamed: 0,Name,Codes
a,USA,1
b,INDIA,91
c,Germany,49


In [37]:
df[df['Name'] == 'USA']

Unnamed: 0,Name,Codes
a,USA,1


In [38]:
# You can combine more than one condition with and (&) or or (|)
# df[(condition 1) & (condition 2)]

df[(df['Codes'] == 91) & 
   (df['Name'] == 'INDIA')]

Unnamed: 0,Name,Codes
b,INDIA,91


In [39]:
'You can also use isin() and where() to select data in a series or dataframe.'

'You can also use isin() and where() to select data in a series or dataframe.'

In [40]:
# isin() returns True wherever a cell's value is among the provided values, and False otherwise

sample_codes_names=[1,3,250,'USA','INDIA','India']
df.isin(sample_codes_names)

Unnamed: 0,Name,Codes
a,True,True
b,True,False
c,False,False
d,False,True


In [41]:
'As you can see, it returned True wherever a country code or name was found. Otherwise, False. You can use a dictionary to match the search by columns. A key must be a column and values are passed in a list.'

'As you can see, it returned True wherever a country code or name was found. Otherwise, False. You can use a dictionary to match the search by columns. A key must be a column and values are passed in a list.'

In [42]:
sample_codes_name={'Codes':[1,3,250],
                  'Name':['USA','INDIA','India']}
df.isin(sample_codes_name)

Unnamed: 0,Name,Codes
a,True,True
b,True,False
c,False,False
d,False,True


In [43]:
# 3x3 integer frame holding 1..9, laid out row by row
grid = np.arange(1, 10).reshape(3, 3)
df2 = pd.DataFrame(grid, columns=['cols_1', 'cols_2', 'cols_3'])

df2

Unnamed: 0,cols_1,cols_2,cols_3
0,1,2,3
1,4,5,6
2,7,8,9


In [44]:
df2.isin([0,3,4,5,7])

Unnamed: 0,cols_1,cols_2,cols_3
0,False,False,True
1,True,True,False
2,True,False,False


In [45]:
df2[df2>4]

Unnamed: 0,cols_1,cols_2,cols_3
0,,,
1,,5.0,6.0
2,7.0,8.0,9.0


In [46]:
df2.where(df2>4)

Unnamed: 0,cols_1,cols_2,cols_3
0,,,
1,,5.0,6.0
2,7.0,8.0,9.0


In [47]:
'Where the condition is false, where() allows you to replace values. In this case, all values that are not greater than 4 (i.e. less than or equal to 4) will be 0.'

'Where the condition is false, where() allows you to replace values. In this case, all values that are not greater than 4 (i.e. less than or equal to 4) will be 0.'

In [48]:
df2.where(df2>4,0)

Unnamed: 0,cols_1,cols_2,cols_3
0,0,0,0
1,0,5,6
2,7,8,9


In [49]:
df2[df2>4]=0
df2

Unnamed: 0,cols_1,cols_2,cols_3
0,1,2,3
1,4,0,0
2,0,0,0


In [50]:
# DataFrame.items()      Iterate over (column name, Series) pairs.
# DataFrame.iteritems()  Iterate over (column name, Series) pairs (older alias of items(); removed in pandas 2.0).
# DataFrame.iterrows()   Iterate over DataFrame rows as (index, Series) pairs.
# DataFrame.itertuples(index=True, name='Pandas')  Iterate over DataFrame rows as namedtuples.

In [52]:
# Iterate over (column name, Series) pairs.

for col_name, content in df2.items():
    print(col_name)
    print(content)

cols_1
0    1
1    4
2    0
Name: cols_1, dtype: int64
cols_2
0    2
1    0
2    0
Name: cols_2, dtype: int64
cols_3
0    3
1    0
2    0
Name: cols_3, dtype: int64


In [54]:
# Iterate over (column name, Series) pairs.
# DataFrame.iteritems() did the same thing but was deprecated in pandas 1.5
# and removed in pandas 2.0, so items() is used here; it yields identical pairs.

for col_name, content in df2.items():
    print(col_name)
    print(content)

cols_1
0    1
1    4
2    0
Name: cols_1, dtype: int64
cols_2
0    2
1    0
2    0
Name: cols_2, dtype: int64
cols_3
0    3
1    0
2    0
Name: cols_3, dtype: int64


In [55]:
# Iterate over DataFrame rows as (index, Series) pairs

for row in df2.iterrows():
    print(row)

(0, cols_1    1
cols_2    2
cols_3    3
Name: 0, dtype: int64)
(1, cols_1    4
cols_2    0
cols_3    0
Name: 1, dtype: int64)
(2, cols_1    0
cols_2    0
cols_3    0
Name: 2, dtype: int64)


In [56]:
# Iterate over DataFrame rows as namedtuples

for row in df2.itertuples():
    print(row)

Pandas(Index=0, cols_1=1, cols_2=2, cols_3=3)
Pandas(Index=1, cols_1=4, cols_2=0, cols_3=0)
Pandas(Index=2, cols_1=0, cols_2=0, cols_3=0)


C. Dealing with Missing data
Real world datasets are messy, often with missing values. By default, Pandas represents missing values as NaN, which stands for "not a number".

Missing values can either be ignored, dropped or filled.

In [57]:
# Build a 3x3 frame that contains a few NaN entries to practise on

missing_demo_values = np.array([
    [1, 2, 3],
    [4, np.nan, 6],
    [7, np.nan, np.nan],
])
df3 = pd.DataFrame(missing_demo_values, columns=['cols_1', 'cols_2', 'cols_3'])

In [58]:
# Recognizing the missing values
df3.isnull()

Unnamed: 0,cols_1,cols_2,cols_3
0,False,False,False
1,False,True,False
2,False,True,True


In [61]:
# Calculating number of the missing values in each feature

df3.isnull().sum()

cols_1    0
cols_2    2
cols_3    1
dtype: int64

In [62]:
# Recognizing non-missing values

df3.notna()

Unnamed: 0,cols_1,cols_2,cols_3
0,True,True,True
1,True,False,True
2,True,False,False


In [63]:
df3.notna().sum()

cols_1    3
cols_2    1
cols_3    2
dtype: int64

Removing the missing values

In [64]:
## Dropping missing values 
df3.dropna()

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0


Only the first row remains because dropna() removes every row that has at least one missing value.

In [65]:
# you can drop NaNs in specific column(s)

df3['cols_3'].dropna()

0    3.0
1    6.0
Name: cols_3, dtype: float64

In [66]:
# You can drop data by axis 
# Axis = 1...drop all columns with Nans
# df3.dropna(axis='columns')

df3.dropna(axis=1)

Unnamed: 0,cols_1
0,1.0
1,4.0
2,7.0


In [68]:
# axis = 0...drop all rows with Nans
# df3.dropna(axis='rows') is same 

df3.dropna(axis=0)

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0


Filling the missing values

In [69]:
# Filling Missing values
df3.fillna(10)

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0
1,4.0,10.0,6.0
2,7.0,10.0,10.0


In [70]:
df3.fillna('fillme')

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2,3
1,4.0,fillme,6
2,7.0,fillme,fillme


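With the default axis (rows), ffill fills each missing value with the last valid value above it in the same column, as in the next cell. When ffill is applied across the column axis instead (axis='columns'), missing values are filled by the value in the previous column in the same row.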

In [71]:
# Forward fill down each column (default axis): a NaN takes the last valid value above it.
# DataFrame.ffill() is used because the `method` argument of fillna() is deprecated (pandas 2.1+).
df3.ffill()

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0
1,4.0,2.0,6.0
2,7.0,2.0,6.0


In [72]:
# Backward fill copies the next valid value upward within each column.
# The NaNs here sit at the bottom of their columns with no valid value below
# them, so the result is unchanged.
# DataFrame.bfill() is used because the `method` argument of fillna() is deprecated (pandas 2.1+).

df3.bfill()

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0
1,4.0,,6.0
2,7.0,,


In [73]:
# With axis='columns' the backward fill runs within each row: the NaN at
# row index 1, cols_2 is filled with 6 from cols_3. The NaNs in the last row
# have no valid value to their right, so they stay NaN.
# DataFrame.bfill() is used because the `method` argument of fillna() is deprecated (pandas 2.1+).
df3.bfill(axis='columns')

Unnamed: 0,cols_1,cols_2,cols_3
0,1.0,2.0,3.0
1,4.0,6.0,6.0
2,7.0,,
