# A Simple Walk-through with Pandas for Data Science, Part 2

Complete tutorial accessible via: https://neuraspike.com/blog/a-simple-walk-through-with-pandas-for-data-science-part-2/

In [None]:
import pandas as pd
import numpy as np

### Operations with Pandas

In [None]:
# Two consecutive integer ranges: 0..4 and 5..9.
number_list_1 = np.arange(0, 5)
number_list_2 = np.arange(5, 10)

print(number_list_1, number_list_2, sep='\n')

# ------ output -------
# [0 1 2 3 4]
# [5 6 7 8 9]


# Each array becomes one named column of the frame.
data = pd.DataFrame({
    'col 1': number_list_1,
    'col 2': number_list_2,
})

# Show the frame's contents as a plain 2-D NumPy array (one row per record).
print(data.to_numpy())

# ------ output -------
# [[0 5]
#  [1 6]
#  [2 7]
#  [3 8]
#  [4 9]]

In [None]:
# Add the two existing columns element-wise and store the result as a new
# third column on the same frame.
data['col 3'] = data['col 1'].add(data['col 2'])
print(data)

# --------- output -----------
#    col 1  col 2  col 3
# 0      0      5      5
# 1      1      6      7
# 2      2      7      9
# 3      3      8     11
# 4      4      9     13

### Handling Missing Data

In [None]:
# Seven readings with two gaps: one marked with NaN and one with None.
# Because None is not a number the array holds generic Python objects,
# which is spelled out here via dtype=object.
data = np.array(
    [200, np.nan, 210, 215, None, 220, 225],
    dtype=object,
)
print(data)

# --------- output -----------
# [200 nan 210 215 None 220 225]

In [None]:
# Demonstrate that NumPy reductions break on an array that contains None.
# Assumes `data` is still the array holding None built in the previous cell.
#
# Each reduction is attempted separately: a bare `print(data.sum())` raises
# and aborts the cell, so the min/max demonstrations after it would never run.
for reduction in ('sum', 'min', 'max'):
    try:
        print(getattr(data, reduction)())
    except TypeError as error:
        # None supports neither `+` nor ordering comparisons with numbers.
        print(f'data.{reduction}() -> TypeError: {error}')

# --------- output -----------
# One line per reduction, for example:
# data.sum() -> TypeError: unsupported operand type(s) for +: ... 'NoneType'
#
# NOTE(review): the exact message text depends on the NumPy/Python version.
# min() and max() compare elements, so with None present they are expected
# to fail on a comparison operator rather than print a value - confirm.

In [None]:
# Wrap the raw array in a Series so the pandas missing-data helpers can be
# used on it; no index is given, so the labels are the positions 0..6.
data = pd.Series(data)

# Boolean mask that is True wherever a value is missing.
print(data.isna())

# --------- output -----------
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6    False
# dtype: bool

In [None]:
# Boolean-mask selection: keep only the entries that are not missing.
print(data.loc[data.notna()])


# --------- output -----------
# 0    200
# 2    210
# 3    215
# 5    220
# 6    225
# dtype: object

In [None]:
# dropna() returns the entries that are not missing; the original row labels
# are kept, which is why labels 1 and 4 are absent from the output below.
print(data.dropna())

# --------- output -----------
# 0    200
# 2    210
# 3    215
# 5    220
# 6    225
# dtype: object

In [None]:
# Three ways to fill the gaps instead of dropping them. Every result is
# printed explicitly: a bare expression is only displayed when it is the last
# statement of a notebook cell, so the first two results would otherwise be
# silently discarded.
#
# NOTE(review): the dtype/number formatting shown in the outputs below depends
# on the dtype of `data` and on the pandas version - confirm against your run.

# Strategy 1: replace every missing value with the mean of the present values
# ((200 + 210 + 215 + 220 + 225) / 5 = 214.0).
print(data.fillna(data.mean()))

# ------ output -------
# 0    200.0
# 1    214.0
# 2    210.0
# 3    215.0
# 4    214.0
# 5    220.0
# 6    225.0
# dtype: float64


# Strategy 2: forward fill - copy the last valid value into the gap.
# (fillna(method='ffill') is deprecated; ffill() is the supported spelling.)
print(data.ffill())

# ------ output -------
# 0    200.0
# 1    200.0
# 2    210.0
# 3    215.0
# 4    215.0
# 5    220.0
# 6    225.0
# dtype: float64


# Strategy 3: backward fill - copy the next valid value into the gap.
# (fillna(method='bfill') is deprecated; bfill() is the supported spelling.)
print(data.bfill())

# ------ output -------
# 0    200.0
# 1    210.0
# 2    210.0
# 3    215.0
# 4    220.0
# 5    220.0
# 6    225.0
# dtype: float64

### Combining Datasets

#### pd.concat

In [None]:
# Two 1x3 nested lists; NumPy treats each of them as a 2-D array.
a_list = [[2, 4, 6]]
b_list = [[1, 3, 5]]

# Joining along the first axis (the default) stacks them as rows -> 2x3.
print(np.concatenate((a_list, b_list)))

# ------ output -------
# [[2 4 6]
#  [1 3 5]]


# Joining along the second axis places them side by side -> 1x6.
print(np.concatenate((a_list, b_list), axis=1))

# ------ output -------
# [[2 4 6 1 3 5]]

In [None]:
# Two Series that share the default 0..3 index.
country = pd.Series(['Nigeria', 'Serbia', 'China', 'USA'])
population = pd.Series([195900000, 6964000, 1393000000, 328200000])


# Default axis (0): the second Series is appended below the first, so the
# index labels 0..3 appear twice in the result.
print(pd.concat([country, population]))

# ------ output -------
# 0       Nigeria
# 1        Serbia
# 2         China
# 3           USA
# 0     195900000
# 1       6964000
# 2    1393000000
# 3     328200000
# dtype: object

# Along the columns: the Series are aligned on their index and become the
# columns 0 and 1 of a DataFrame (shown as the cell's rich output).
pd.concat([country, population], axis='columns')

# ------ output -------
#          0           1
# 0  Nigeria   195900000
# 1   Serbia     6964000
# 2    China  1393000000
# 3      USA   328200000

#### pd.merge

In [None]:
# Two tables that describe the same four countries with different attributes.
df1 = pd.DataFrame({
    'country': ['Nigeria', 'Serbia', 'China', 'USA'],
    'continent': ['Africa', 'Europe', 'Asia', 'North America'],
})

df2 = pd.DataFrame({
    'country': ['Nigeria', 'Serbia', 'China', 'USA'],
    'language': ['English', 'Serbian', 'Mandarin', 'English'],
})

# Join the tables on the shared 'country' column. The key is named explicitly
# instead of relying on pandas to infer it from every column name the two
# frames have in common, so adding another shared column later cannot
# silently change which rows match.
df3 = pd.merge(df1, df2, on='country')
print(df3)

# ------------- output --------------
#    country      continent  language
# 0  Nigeria         Africa   English
# 1   Serbia         Europe   Serbian
# 2    China           Asia  Mandarin
# 3      USA  North America   English

### Grouping Function

In [None]:
# Small car catalogue, written one record per row: (brand, model, price).
data = pd.DataFrame(
    [
        ('BMW', 'M5', 102700),
        ('BMW', 'i8', 147500),
        ('Ferrari', '488 GTB', 262647),
        ('Ferrari', '488 Pista Spider', 350000),
        ('Lamborghini', 'Huracan', 203674),
        ('Lamborghini', 'Urus', 203995),
    ],
    columns=['cars', 'model', 'price'],
)

print(data)

# ------------- output --------------
#           cars             model   price
# 0          BMW                M5  102700
# 1          BMW                i8  147500
# 2      Ferrari           488 GTB  262647
# 3      Ferrari  488 Pista Spider  350000
# 4  Lamborghini           Huracan  203674
# 5  Lamborghini              Urus  203995

In [None]:
# Printing the result of groupby() shows only the repr of the grouping object
# (see the output below); per-group values appear once an aggregation, filter
# or transform is applied to it. The memory address differs between runs.
print(data.groupby('cars'))

# ------------- output --------------
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb249814470>

In [None]:
# .groups maps each distinct value of the 'cars' column to the row labels
# that belong to that group.
print(data.groupby('cars').groups)

# ------------- output --------------
# {'BMW': [0, 1], 'Ferrari': [2, 3], 'Lamborghini': [4, 5]}

#### Aggregate

In [None]:
# Average of every numeric column per car brand. numeric_only=True skips the
# text column 'model'; without it, recent pandas versions try to average the
# strings as well and raise a TypeError.
print(data.groupby('cars').mean(numeric_only=True))

# ------------- output --------------
#                 price
# cars
# BMW          125100.0
# Ferrari      306323.5
# Lamborghini  203834.5

In [None]:
# Restrict the grouped data to the 'price' column first, then reduce each
# brand's prices to their mean; the result is a Series indexed by brand.
print(data.groupby('cars')['price'].agg('mean'))

# ------------- output --------------
# cars
# BMW            125100.0
# Ferrari        306323.5
# Lamborghini    203834.5
# Name: price, dtype: float64

In [None]:
# Several statistics per brand in one call.
# * [['price']] limits the aggregation to the numeric column (median/mean of
#   the text column 'model' is undefined and raises on recent pandas) while
#   keeping the two-level "price / statistic" column header.
# * The statistics are given by name; passing np.mean / the builtin max as
#   callables is deprecated in recent pandas versions.
print(data.groupby('cars')[['price']].agg(['min', 'median', 'mean', 'max']))

# ---------------- output -------------------
#               price
#                 min    median      mean     max
# cars
# BMW          102700  125100.0  125100.0  147500
# Ferrari      262647  306323.5  306323.5  350000
# Lamborghini  203674  203834.5  203834.5  203995

#### Filter

In [None]:
def filter_by_price(x):
    """Tell whether the mean of ``x['price']`` is above 250000.

    Written to be used as the predicate of ``groupby(...).filter``, where
    ``x`` is the sub-frame of one group.
    """
    average_price = x['price'].mean()
    return average_price > 250000

# Keep only the rows of the brands for which filter_by_price returns True;
# the rows of every other brand are dropped (only Ferrari remains below).
print(data.groupby('cars').filter(filter_by_price))

# ------------- output --------------
#       cars             model   price
# 2  Ferrari           488 GTB  262647
# 3  Ferrari  488 Pista Spider  350000

#### Transform

In [None]:
# Subtract a flat 20000 from every price; the result has one row per input
# row (same index as `data`). [['price']] restricts the transform to the
# numeric column - applied to the whole frame the lambda would also receive
# the text column 'model', and "string - 20000" raises a TypeError.
print(data.groupby('cars')[['price']].transform(lambda x: x - 20000))

# ------------- output --------------
#     price
# 0   82700
# 1  127500
# 2  242647
# 3  330000
# 4  183674
# 5  183995

#### Apply

In [None]:
def norm_by_price(x):
    """Return a copy of ``x`` whose 'price' column is divided by its own sum.

    After the call the prices in the returned frame add up to 1, i.e. each
    value is that row's share of the frame's total price. Intended for use
    with ``groupby(...).apply``, where ``x`` is the sub-frame of one group.

    The input frame is left untouched: ``assign`` builds a new frame instead
    of writing into the group object handed over by ``apply``, which pandas
    advises against mutating.
    """
    return x.assign(price=x['price'] / x['price'].sum())


# Run norm_by_price once per brand and stitch the returned frames together.
# group_keys=False keeps the original 0..5 row index; recent pandas versions
# otherwise add the group key ('cars') as an extra index level to the result.
# NOTE(review): newer pandas releases may exclude the grouping column 'cars'
# from the frames passed to apply - confirm the columns against your version.
print(data.groupby('cars', group_keys=False).apply(norm_by_price))

# ------------- output --------------
#           cars             model     price
# 0          BMW                M5  0.410472
# 1          BMW                i8  0.589528
# 2      Ferrari           488 GTB  0.428709
# 3      Ferrari  488 Pista Spider  0.571291
# 4  Lamborghini           Huracan  0.499606
# 5  Lamborghini              Urus  0.500394