<h1>Pandas Reintro</h1>

<h3>Pandas Series</h3>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# The same three values held three ways: a plain list, a NumPy array, and a
# label -> value mapping.
labels = ['a', 'b', 'c']
mylist = [10, 20, 30]
arr = np.asarray(mylist)
# Pair each label with its value; insertion order (a, b, c) is preserved.
d = dict(zip(labels, mylist))

In [4]:
# A dict becomes a Series whose index is the dict's keys.
d_series = pd.Series(data=d)
d_series

a    10
b    20
c    30
dtype: int64

In [5]:
# First-quarter sales keyed by country; the mapping's keys become the index.
salesQ1 = pd.Series({
    'USA': 250,
    'China': 450,
    'India': 200,
    'Brazil': 150,
})
salesQ1

USA       250
China     450
India     200
Brazil    150
dtype: int64

In [6]:
# Second-quarter sales; note Japan appears here where Q1 had Brazil.
salesQ2 = pd.Series({
    'USA': 210,
    'China': 480,
    'India': 220,
    'Japan': 130,
})
salesQ2

USA      210
China    480
India    220
Japan    130
dtype: int64

In [7]:
# Look up a single value by its index label.
salesQ2['China']

480

In [8]:
# Addition aligns on index labels: USA, China and India are summed, while
# Brazil and Japan (each present in only one Series) come out as NaN, which
# is why the result is float64.
salesQ1 + salesQ2

Brazil      NaN
China     930.0
India     420.0
Japan       NaN
USA       460.0
dtype: float64

<h3>Pandas Dataframe</h3>

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Column labels for the demo DataFrame.
columns = ['W', 'X', 'Y', 'Z']

In [12]:
# Row labels for the demo DataFrame.
index = ['A', 'B', 'C', 'D', 'E']

In [13]:
from numpy.random import randint

In [14]:
# Seed NumPy's global generator so the sample below is the same on every run.
np.random.seed(42)
# 5 rows x 4 columns of integers drawn from [-100, 100).
data = randint(low=-100, high=100, size=(5, 4))
data

array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])

In [15]:
# Label the raw array: rows A-E, columns W-Z.
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [16]:
# Selecting one column by name returns a Series.
df['W']

A     2
B     6
C     2
D    16
E    30
Name: W, dtype: int64

In [17]:
# Passing a list of column names returns a DataFrame with just those columns.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2,-86
B,6,-80
C,2,-13
D,16,51
E,30,-99


In [18]:
# Derive a new column from two existing ones (element-wise sum of W and Y).
df['K'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,K
A,2,79,-8,-86,-6
B,6,-29,88,-80,94
C,2,21,-26,-13,-24
D,16,-1,3,51,19
E,30,49,-48,-99,-18


In [21]:
# Preview the frame without column K. Nothing is assigned, so df still has K.
df.drop(columns='K')

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [22]:
# Remove column K from df by rebinding the name to the result rather than
# mutating the frame with inplace=True. errors='ignore' keeps this cell safe
# to re-run once K is already gone (the in-place form raised KeyError then).
df = df.drop(columns='K', errors='ignore')

In [23]:
# Show df after the drop above: column K is gone from the stored frame.
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [25]:
# .loc looks rows up by index label; a single label returns that row as a Series.
df.loc['A']

W     2
X    79
Y    -8
Z   -86
Name: A, dtype: int64

In [26]:
# A list of labels returns a DataFrame, with rows in the order requested (C, then A).
df.loc[['C', 'A']]

Unnamed: 0,W,X,Y,Z
C,2,21,-26,-13
A,2,79,-8,-86


In [4]:
import numpy as np
import pandas as pd

In [5]:
# Rebuild the demo frame from scratch so this section does not depend on
# state left over from earlier cells.
from numpy.random import randint

columns = ['W', 'X', 'Y', 'Z']
index = ['A', 'B', 'C', 'D', 'E']

# Fixed seed -> the same 5x4 integer sample on every run.
np.random.seed(42)
data = randint(-100, 100, (5, 4))

# Keyword arguments make it explicit which list labels rows and which labels
# columns.
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [6]:
# Element-wise comparison: yields a DataFrame of booleans with the same shape.
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,True,False,True,False
C,True,True,False,False
D,True,False,True,True
E,True,True,False,False


In [8]:
# Keep the positive entries, blank out the rest, then put 0 in the blanks.
# Columns that had values masked come back as floats; W (all positive) stays int.
df.where(df > 0).fillna(0)

Unnamed: 0,W,X,Y,Z
A,2,79.0,0.0,0.0
B,6,0.0,88.0,0.0
C,2,21.0,0.0,0.0
D,16,0.0,3.0,51.0
E,30,49.0,0.0,0.0


In [9]:
# df itself still holds the negative values; the cell above did not modify it.
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [10]:
# Boolean Series: True for the rows whose X value is positive.
df['X'] > 0

A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [15]:
# Filter to the rows with positive X, then take the first of them by position.
df.loc[df['X'] > 0].iloc[0]

W     2
X    79
Y    -8
Z   -86
Name: A, dtype: int64

In [17]:
# Combine two row conditions with & (each side parenthesised):
# W positive AND Y greater than 1.
df.loc[(df['W'] > 0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
B,6,-29,88,-80
D,16,-1,3,51


In [19]:
# Show the frame with a default 0..4 integer index; drop=True discards the
# old A-E labels instead of keeping them as a column.
df.reset_index(drop=True)

Unnamed: 0,W,X,Y,Z
0,2,79,-8,-86
1,6,-29,88,-80
2,2,21,-26,-13
3,16,-1,3,51
4,30,49,-48,-99


In [20]:
# One state code per row of df, in row order.
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [21]:
# Add the codes as a new column; a plain list is matched to rows by position.
df['States'] = new_ind
df

Unnamed: 0,W,X,Y,Z,States
A,2,79,-8,-86,CA
B,6,-29,88,-80,NY
C,2,21,-26,-13,WY
D,16,-1,3,51,OR
E,30,49,-48,-99,CO


In [25]:
# Show the frame keyed by state code. The result is not assigned, so df keeps
# its A-E index and its States column.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2,79,-8,-86
NY,6,-29,88,-80
WY,2,21,-26,-13
OR,16,-1,3,51
CO,30,49,-48,-99


In [26]:
# Summary statistics; only the numeric columns (W-Z) appear, States is left out.
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,11.2,23.8,1.8,-45.4
std,11.96662,42.109381,51.915316,63.366395
min,2.0,-29.0,-48.0,-99.0
25%,2.0,-1.0,-26.0,-86.0
50%,6.0,21.0,-8.0,-80.0
75%,16.0,49.0,3.0,-13.0
max,30.0,79.0,88.0,51.0


In [27]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   W       5 non-null      int64 
 1   X       5 non-null      int64 
 2   Y       5 non-null      int64 
 3   Z       5 non-null      int64 
 4   States  5 non-null      object
dtypes: int64(4), object(1)
memory usage: 412.0+ bytes


In [28]:
# dtype of each column: W, X, Y and Z are int64, States is object.
df.dtypes

W          int64
X          int64
Y          int64
Z          int64
States    object
dtype: object

<h3>Pandas Missing Data</h3>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small frame with gaps: A is missing one value, B is missing two, and C is
# complete.
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4],
        B=[5, np.nan, np.nan, 8],
        C=[10, 20, 30, 40],
    )
)
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,,,30
3,4.0,8.0,40


In [4]:
# Drop columns that have fewer than 3 non-missing values (B has only 2).
df.dropna(axis='columns', thresh=3)

Unnamed: 0,A,C
0,1.0,10
1,2.0,20
2,,30
3,4.0,40


In [5]:
# Show the frame with every missing value replaced by 0. The result is not
# assigned, so df keeps its gaps.
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,0.0,20
2,0.0,0.0,30
3,4.0,8.0,40


In [6]:
# Fill the gap in A with the mean of A's known values, then show the frame.
df['A'] = df['A'].fillna(df['A'].mean())
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,2.333333,,30
3,4.0,8.0,40


In [7]:
# Fill the gaps in B with the mean of B's known values.
df['B'] = df['B'].fillna(df['B'].mean())

In [8]:
# A and B no longer contain missing values: the gaps now hold the column
# means (2.333333 for A, 6.5 for B).
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,6.5,20
2,2.333333,6.5,30
3,4.0,8.0,40
