
# <center> Pandas </center>

[Pandas](https://pandas.pydata.org/) is a fast, powerful, flexible and easy to use open source __data analysis and manipulation__ tool, built on top of the Python programming language.

* Functionality (similar to Spark)
* Ecosystem (better than Spark)
* __No native support for distributed cluster computing__

<hr/>

## 1. Series

* A Pandas data type
* Built on top of the NumPy array object
* Has labels

In [1]:
from io import StringIO

import numpy as np
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Creating a Series from

* Python list
* NumPy array
* Python Dictionary

In [4]:
labels = ['a', 'b', 'c']

#### from Python list

In [8]:
pd.Series(['a1', 'b1', 'c1'], index=labels)

a    a1
b    b1
c    c1
dtype: object

#### from NumPy array

In [11]:
pd.Series(np.array([10, 20, 30]), index=labels)

a    10
b    20
c    30
dtype: int32

#### from Python dictionary

In [13]:
pd.Series({'a': 10, 'b': 20, 'c': 30})

a    10
b    20
c    30
dtype: int64

### Using Series

* access by index
* access by position
* operations align on index labels (key-based)

In [16]:
# Country -> value Series; the dict keys become the index labels, in insertion order.
ser1 = pd.Series({'USA': 1, 'Germany': 2, 'Australia': 3, 'Japan': 4})
ser1

USA          1
Germany      2
Australia    3
Japan        4
dtype: int64

In [17]:
# Second Series with a partly different label set ('Italy' instead of 'Australia').
ser2 = pd.Series({'USA': 1, 'Germany': 2, 'Italy': 5, 'Japan': 4})
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [19]:
ser1['USA']

1

In [20]:
# Positional access: use .iloc explicitly. ser1 has string labels, so ser1[2]
# only works through the deprecated "integer key falls back to position" behaviour.
ser1.iloc[2]

3

In [21]:
# First three entries by position.
ser1.iloc[:3]

USA          1
Germany      2
Australia    3
dtype: int64

In [22]:
ser1 + ser2

Australia    NaN
Germany      4.0
Italy        NaN
Japan        8.0
USA          2.0
dtype: float64

<hr/>

## 2. DataFrames

* A Pandas data type, and the single most important one!
* A collection of Series objects that share the same index
* Similar to a Spark DataFrame, an Excel spreadsheet, or a database table

In [23]:
np.random.normal(0, 1, (5, 4))

array([[-0.30389514, -1.30545917, -1.34957002,  0.89119546],
       [-1.5414805 ,  0.8711506 ,  1.09443596,  0.62029563],
       [-0.55477419,  1.73740534,  1.88542099,  1.48592291],
       [-0.7176363 , -0.02750905, -1.03067089,  1.04456972],
       [ 0.09017556,  1.83489488,  0.09103724,  0.69392624]])

In [25]:
# 5x4 frame of standard-normal draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=np.random.normal(loc=0, scale=1, size=(5, 4)),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
B,-0.671143,0.733056,-0.546496,-2.145445
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886
E,2.526273,1.337585,-1.369646,0.664797


### Selection and Indexing

In [26]:
df['W']

A    0.327905
B   -0.671143
C    1.048582
D   -0.089408
E    2.526273
Name: W, dtype: float64

In [27]:
df[['W', 'Z']]

Unnamed: 0,W,Z
A,0.327905,0.261565
B,-0.671143,-2.145445
C,1.048582,-1.896903
D,-0.089408,-0.772886
E,2.526273,0.664797


In [28]:
# Attribute-style access (NOT RECOMMENDED!): returns the same Series as df['W'],
# but prefer the bracket form, which works for any column label.
df.W

A    0.327905
B   -0.671143
C    1.048582
D   -0.089408
E    2.526273
Name: W, dtype: float64

### DataFrame Columns are just Series

In [29]:
type(df['W'])

pandas.core.series.Series

### Creating a new column

In [30]:
df['New'] = df['W'] + df['Z']
df

Unnamed: 0,W,X,Y,Z,New
A,0.327905,-0.737435,0.882221,0.261565,0.589471
B,-0.671143,0.733056,-0.546496,-2.145445,-2.816588
C,1.048582,-0.696129,-0.214958,-1.896903,-0.848322
D,-0.089408,0.030106,0.019952,-0.772886,-0.862293
E,2.526273,1.337585,-1.369646,0.664797,3.19107


### Removing Columns / Rows

In [31]:
# Drop the 'New' column in place. Name the axis via the `columns=` keyword:
# passing the axis positionally (df.drop('New', 1)) was removed in pandas 2.0.
df.drop(columns='New', inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
B,-0.671143,0.733056,-0.546496,-2.145445
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886
E,2.526273,1.337585,-1.369646,0.664797


In [32]:
df.shape

(5, 4)

In [33]:
len(df)

5

In [34]:
# Drop row 'E' via the `index=` keyword (positional axis was removed in pandas 2.0).
# inplace=False by default, so this returns a new frame and leaves df untouched.
df.drop(index='E')

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
B,-0.671143,0.733056,-0.546496,-2.145445
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886


In [35]:
df

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
B,-0.671143,0.733056,-0.546496,-2.145445
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886
E,2.526273,1.337585,-1.369646,0.664797


### Slicing

In [36]:
# .loc selects by index label: the row labelled 'A' comes back as a Series
# (loosely comparable to filtering on a key in Spark)
df.loc['A']

W    0.327905
X   -0.737435
Y    0.882221
Z    0.261565
Name: A, dtype: float64

In [37]:
# .iloc selects by integer position: position 2 is the third row (label 'C')
df.iloc[2]

W    1.048582
X   -0.696129
Y   -0.214958
Z   -1.896903
Name: C, dtype: float64

In [38]:
df.loc['B', 'Y']

-0.5464955919070277

In [39]:
df.iloc[2, 1]

-0.6961292044600061

In [40]:
df.loc[['B', 'D', 'E'], ['W', 'Z']]

Unnamed: 0,W,Z
B,-0.671143,-2.145445
D,-0.089408,-0.772886
E,2.526273,0.664797


In [41]:
df.iloc[[1, 3], [2, 1]]

Unnamed: 0,Y,X
B,-0.546496,0.733056
D,0.019952,0.030106


In [42]:
df.loc[['B']]

Unnamed: 0,W,X,Y,Z
B,-0.671143,0.733056,-0.546496,-2.145445


In [43]:
df.loc[['B'], ['Y', 'Z']]

Unnamed: 0,Y,Z
B,-0.546496,-2.145445


### Conditional Selection

In [44]:
df

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
B,-0.671143,0.733056,-0.546496,-2.145445
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886
E,2.526273,1.337585,-1.369646,0.664797


In [45]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,False,True,True
B,False,True,False,False
C,True,False,False,False
D,False,True,True,False
E,True,True,False,True


In [47]:
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,0.327905,,0.882221,0.261565
B,,0.733056,,
C,1.048582,,,
D,,0.030106,0.019952,
E,2.526273,1.337585,,0.664797


In [48]:
df['W'] > 0

A     True
B    False
C     True
D    False
E     True
Name: W, dtype: bool

In [49]:
# remember spark filter? df.filter('W >0')
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
C,1.048582,-0.696129,-0.214958,-1.896903
E,2.526273,1.337585,-1.369646,0.664797


In [50]:
# df.filter('W >0').select('Y')
# One .loc call does the row filter and the column pick together,
# instead of chaining two separate indexing operations.
df.loc[df['W'] > 0, 'Y']

A    0.882221
C   -0.214958
E   -1.369646
Name: Y, dtype: float64

In [51]:
# A list of column labels keeps the result a DataFrame (single column 'Y');
# a single .loc call replaces the chained df[mask][cols] lookup.
df.loc[df['W'] > 0, ['Y']]

Unnamed: 0,Y
A,0.882221
C,-0.214958
E,-1.369646


In [52]:
# df.filter('W >0').select('Y', 'X')
# Row mask and column list in a single .loc call (no chained indexing).
df.loc[df['W'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
A,0.882221,-0.737435
C,-0.214958,-0.696129
E,-1.369646,1.337585


In [53]:
df[(df['W'] > 0) & (df['Y'] > 0)]

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565


In [55]:
df[(df['W'] > 0) | (df['Y'] > 0)]

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
C,1.048582,-0.696129,-0.214958,-1.896903
D,-0.089408,0.030106,0.019952,-0.772886
E,2.526273,1.337585,-1.369646,0.664797


In [56]:
df[~(df['W'] <= 0)]

Unnamed: 0,W,X,Y,Z
A,0.327905,-0.737435,0.882221,0.261565
C,1.048582,-0.696129,-0.214958,-1.896903
E,2.526273,1.337585,-1.369646,0.664797


<hr/>

## 3. Concatenation

In [57]:
# 4x4 frame whose cells are '<column><row>' labels (A0 ... D3), default 0-3 index.
df1 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(4)] for col in 'ABCD'}
)

df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [58]:
# 4x4 frame of '<column><row>' labels for rows 4-7, indexed by those row numbers.
df2 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [59]:
# 4x4 frame of '<column><row>' labels for rows 8-11, indexed by those row numbers.
df3 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [61]:
# spark: df1.union(df2).union(df3)
pd.concat([df1, df2, df3], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


## 4. Merge vs. Join

* Merge based on column(s)
* Join based on index

In [62]:
# Left-hand table for the merge examples, written one row per record.
left = pd.DataFrame(
    [
        ('K0', 'A0', 'B0'),
        ('K1', 'A1', 'B1'),
        ('K2', 'A2', 'B2'),
        ('K3', 'A3', 'B3'),
    ],
    columns=['key', 'A', 'B'],
)

left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [63]:
# Right-hand table for the merge examples; its last key is 'K4', which `left` lacks.
right = pd.DataFrame(
    [
        ('K0', 'C0', 'D0'),
        ('K1', 'C1', 'D1'),
        ('K2', 'C2', 'D2'),
        ('K4', 'C4', 'D4'),
    ],
    columns=['key', 'C', 'D'],
)

right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K4,C4,D4


In a pandas DataFrame, the `.merge()` method is the equivalent of the `.join()` method on a PySpark DataFrame.

In [64]:
left.merge(right, how='inner', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2


In [65]:
# Left table keyed by the (key1, key2) pair, written one row per record.
left_2keys = pd.DataFrame(
    [
        ('K0', 'K0', 'A0', 'B0'),
        ('K0', 'K1', 'A1', 'B1'),
        ('K1', 'K0', 'A2', 'B2'),
        ('K2', 'K1', 'A3', 'B3'),
    ],
    columns=['key1', 'key2', 'A', 'B'],
)

left_2keys

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3


In [66]:
# Right table keyed by the (key1, key2) pair; ('K1', 'K0') appears twice.
right_2keys = pd.DataFrame(
    [
        ('K0', 'K0', 'C0', 'D0'),
        ('K1', 'K0', 'C1', 'D1'),
        ('K1', 'K0', 'C2', 'D2'),
        ('K2', 'K0', 'C3', 'D3'),
    ],
    columns=['key1', 'key2', 'C', 'D'],
)

right_2keys

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [67]:
left_2keys.merge(right_2keys, how='inner', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [68]:
# Left table for the index-join example: the K* labels live in the index, not a column.
left1 = pd.DataFrame(
    [('A0', 'B0'), ('A1', 'B1'), ('A2', 'B2')],
    index=['K0', 'K1', 'K2'],
    columns=['A', 'B'],
)

left1

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [69]:
# Right table for the index-join example, indexed by K0, K2 and K3.
right1 = pd.DataFrame(
    [('C0', 'D0'), ('C2', 'D2'), ('C3', 'D3')],
    index=['K0', 'K2', 'K3'],
    columns=['C', 'D'],
)

right1

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In a pandas DataFrame, the `.join()` method joins two DataFrames on their **index**.

In [72]:
# join by index
left1.join(right1, how='inner')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K2,A2,B2,C2,D2


In [73]:
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [74]:
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K4,C4,D4


In [76]:
# Both frames have a 'key' column, so lsuffix/rsuffix are needed to tell the two apart.
# join() still matches rows on the index (0-3), not on 'key': row 3 pairs K3 with K4.
left.join(right, how='inner', lsuffix='_l', rsuffix='_r')

Unnamed: 0,key_l,A,B,key_r,C,D
0,K0,A0,B0,K0,C0,D0
1,K1,A1,B1,K1,C1,D1
2,K2,A2,B2,K2,C2,D2
3,K3,A3,B3,K4,C4,D4


<hr/>

## 5. Group By

![groupby](./pics/groupby.png)

In [77]:
# Load one CSV per ticker from ./data/stocks/ into its own DataFrame.
# apple.head() shows Date/Open/High/Low/Close/Adj Close/Volume columns;
# presumably the other files share that layout -- TODO confirm.
apple = pd.read_csv('./data/stocks/AAPL.csv')
google = pd.read_csv('./data/stocks/GOOG.csv')
ibm = pd.read_csv('./data/stocks/IBM.csv')
intel = pd.read_csv('./data/stocks/INTC.csv')
microsoft = pd.read_csv('./data/stocks/MSFT.csv')

In [80]:
apple.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200
2,2019-04-26,204.899994,205.0,202.119995,204.300003,201.660461,18649100
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900


In [81]:
# Tag every row with its company so rows stay identifiable after concatenation.
# A scalar assignment is broadcast to all rows, so there is no need to build
# a list of len(frame) copies first.
apple['name'] = 'apple'
google['name'] = 'google'
ibm['name'] = 'ibm'
intel['name'] = 'intel'
microsoft['name'] = 'microsoft'

In [82]:
apple.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple


In [83]:
# Stack the five per-company frames vertically. Each frame keeps its own row labels,
# so index values repeat across companies (both apple and microsoft rows show up
# under small integer labels in the output); ignore_index=True would renumber them.
stocks = pd.concat([apple, google, ibm, intel, microsoft], axis=0)
stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple
2,2019-04-26,204.899994,205.000000,202.119995,204.300003,201.660461,18649100,apple
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700,apple
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900,apple
...,...,...,...,...,...,...,...,...
248,2020-04-17,179.500000,180.000000,175.869995,178.600006,178.600006,52765600,microsoft
249,2020-04-20,176.630005,178.750000,174.990005,175.059998,175.059998,36669600,microsoft
250,2020-04-21,173.500000,173.669998,166.110001,167.820007,167.820007,56203700,microsoft
251,2020-04-22,171.389999,174.000000,170.820007,173.520004,173.520004,34651600,microsoft


### min, max, mean, std

In [88]:
stocks.groupby('name').min()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apple,2019-04-24,175.440002,177.919998,170.270004,173.300003,171.719727,11362000
google,2019-04-24,1042.900024,1047.48999,1013.536011,1036.22998,1036.22998,347500
ibm,2019-04-24,94.599998,97.739998,90.559998,94.769997,94.769997,1202100
intel,2019-04-24,43.349998,44.459999,42.860001,43.459999,42.721867,6313200
microsoft,2019-04-24,121.279999,123.279999,119.010002,119.839996,118.712952,8989200


In [89]:
stocks.groupby('name').max()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apple,2020-04-23,324.73999,327.850006,323.350006,327.200012,327.200012,106721200
google,2020-04-23,1525.069946,1532.105957,1521.400024,1526.689941,1526.689941,6207000
ibm,2020-04-23,156.820007,158.75,155.419998,156.759995,155.309998,18994600
intel,2020-04-23,67.629997,69.290001,67.309998,68.470001,68.13446,84711000
microsoft,2020-04-23,190.649994,190.699997,186.470001,188.699997,188.185989,97073600


In [90]:
# Average only the numeric columns. 'Date' is read as strings, and pandas >= 2.0
# raises TypeError on it instead of silently dropping non-numeric columns.
stocks.groupby('name')[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']].mean()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
apple,242.473794,245.527035,240.031977,242.981027,241.944647,33342260.0
google,1243.948991,1256.93339,1232.410592,1245.498577,1245.498577,1680308.0
ibm,135.056641,136.353241,133.703755,135.092293,132.558535,4522342.0
intel,53.479209,54.21415,52.850751,53.537391,53.088899,24354070.0
microsoft,146.762372,148.303992,145.12,146.787905,146.03863,31459660.0


In [91]:
# Standard deviation of the numeric columns only. Selecting them explicitly keeps
# the string 'Date' column out of the aggregation, which pandas >= 2.0 would reject.
stocks.groupby('name')[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']].std()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
apple,42.475337,43.231226,41.969801,42.831526,43.373058,17331680.0
google,118.526547,117.899965,119.63469,118.782639,118.782639,850552.9
ibm,11.110708,10.529408,11.563389,10.995356,10.295352,2740329.0
intel,6.264857,6.252973,6.236472,6.266022,6.455097,11530500.0
microsoft,15.731724,16.110413,15.307998,15.785702,16.159875,18080060.0


In [92]:
stocks.groupby('name').min()[['Open', 'Close']]

Unnamed: 0_level_0,Open,Close
name,Unnamed: 1_level_1,Unnamed: 2_level_1
apple,175.440002,173.300003
google,1042.900024,1036.22998
ibm,94.599998,94.769997
intel,43.349998,43.459999
microsoft,121.279999,119.839996


## Other useful operations

In [93]:
stocks['name'].unique()

array(['apple', 'google', 'ibm', 'intel', 'microsoft'], dtype=object)

In [95]:
stocks['Date'].nunique()
# len(stocks['Date'].unique())

253

In [96]:
stocks[stocks['name'] == 'apple']

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple
2,2019-04-26,204.899994,205.000000,202.119995,204.300003,201.660461,18649100,apple
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700,apple
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900,apple
...,...,...,...,...,...,...,...,...
248,2020-04-17,284.690002,286.950012,276.859985,282.799988,282.799988,53812500,apple
249,2020-04-20,277.950012,281.679993,276.850006,276.929993,276.929993,32503800,apple
250,2020-04-21,276.279999,277.250000,265.429993,268.369995,268.369995,45247900,apple
251,2020-04-22,273.609985,277.899994,272.200012,276.100006,276.100006,29264300,apple


In [97]:
# Keep rows whose company is one of several values; isin() replaces a chain of
# OR-ed equality tests and scales to any number of names.
stocks[stocks['name'].isin(['apple', 'ibm'])]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple
2,2019-04-26,204.899994,205.000000,202.119995,204.300003,201.660461,18649100,apple
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700,apple
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900,apple
...,...,...,...,...,...,...,...,...
248,2020-04-17,119.300003,120.389999,117.919998,120.120003,120.120003,4966000,ibm
249,2020-04-20,119.150002,122.860001,118.139999,120.410004,120.410004,8181700,ibm
250,2020-04-21,114.000000,117.150002,112.059998,116.760002,116.760002,14349000,ibm
251,2020-04-22,119.870003,120.330002,117.550003,119.309998,119.309998,7087900,ibm


In [98]:
# Flag rows where (Close - Open) / Open is above 0.1, i.e. a >10% gain from open to close.
stocks['boom'] = stocks['Close'].sub(stocks['Open']).div(stocks['Open']).gt(0.1)

In [99]:
stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name,boom
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple,False
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple,False
2,2019-04-26,204.899994,205.0,202.119995,204.300003,201.660461,18649100,apple,False
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700,apple,False
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900,apple,False


In [100]:
stocks[stocks['boom']]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name,boom
224,2020-03-13,48.259998,55.0,47.740002,54.43,54.43,48805100,intel,True


In [101]:
# Flag rows where (Close - Open) / Open is below -0.07, i.e. a >7% drop from open to close.
stocks['bam'] = stocks['Close'].sub(stocks['Open']).div(stocks['Open']).lt(-0.07)

In [102]:
stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name,boom,bam
0,2019-04-24,207.360001,208.479996,207.050003,207.160004,204.483505,17540600,apple,False,False
1,2019-04-25,206.830002,207.759995,205.119995,205.279999,202.627808,18543200,apple,False,False
2,2019-04-26,204.899994,205.0,202.119995,204.300003,201.660461,18649100,apple,False,False
3,2019-04-29,204.399994,205.970001,203.860001,204.610001,201.966461,22204700,apple,False,False
4,2019-04-30,203.059998,203.399994,199.110001,200.669998,198.077362,46534900,apple,False,False


In [103]:
stocks[stocks['bam']]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,name,boom,bam
229,2020-03-20,247.179993,251.830002,228.0,229.240005,229.240005,100423300,apple,False,True


<hr/>

## Read Dataframe from HTML

In [111]:
import warnings
warnings.filterwarnings('ignore')

In [112]:
from selenium import webdriver

In [142]:
# Render the FDIC failed-bank list in a real browser and parse its HTML tables.
# NOTE(review): the chromedriver path is machine-specific, and recent Selenium
# releases no longer accept it as a positional argument -- confirm the target version.
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
try:
    driver.get('http://www.fdic.gov/bank/individual/failed/banklist.html')
    html_1 = driver.page_source
finally:
    # Always shut the browser down, even if the page load fails; otherwise every
    # run leaves a Chrome/chromedriver process behind.
    driver.quit()
# Wrap the markup in StringIO: passing literal HTML to read_html as a plain
# string is deprecated in recent pandas.
dfs = pd.read_html(StringIO(html_1))

In [143]:
len(dfs)

1

In [144]:
dfs[0]

Unnamed: 0,Bank NameBank,CityCity,StateSt,CertCert,Acquiring InstitutionAI,Closing DateClosing,FundFund
0,Almena State Bank,Almena,KS,15426,Equity Bank,"October 23, 2020",10538
1,First City Bank of Florida,Fort Walton Beach,FL,16748,"United Fidelity Bank, fsb","October 16, 2020",10537
2,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020",10536
3,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020",10535
4,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019",10534
5,Resolute Bank,Maumee,OH,58317,Buckeye State Bank,"October 25, 2019",10533
6,Louisa Community Bank,Louisa,KY,58112,Kentucky Farmers Bank Corporation,"October 25, 2019",10532
7,The Enloe State Bank,Cooper,TX,10716,"Legend Bank, N. A.","May 31, 2019",10531
8,Washington Federal Bank for Savings,Chicago,IL,30570,Royal Savings Bank,"December 15, 2017",10530
9,The Farmers and Merchants State Bank of Argonia,Argonia,KS,17719,Conway Bank,"October 13, 2017",10529


In [130]:
# Render the Yahoo Finance IBM quote page in a real browser and parse its HTML tables.
# NOTE(review): the chromedriver path is machine-specific, and recent Selenium
# releases no longer accept it as a positional argument -- confirm the target version.
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
try:
    driver.get('https://finance.yahoo.com/quote/IBM?p=IBM&.tsrc=fin-srch')
    html_2 = driver.page_source
finally:
    # Always shut the browser down, even if the page load fails; otherwise every
    # run leaves a Chrome/chromedriver process behind.
    driver.quit()
# Wrap the markup in StringIO: passing literal HTML to read_html as a plain
# string is deprecated in recent pandas.
tables = pd.read_html(StringIO(html_2))

In [135]:
len(tables)

2

In [136]:
tables[0].set_index(0, drop=True).transpose()

Unnamed: 0,Previous Close,Open,Bid,Ask,Day's Range,52 Week Range,Volume,Avg. Volume
1,139.85,139.7,138.00 x 800,138.14 x 800,137.35 - 140.44,114.56 - 146.12,6508599,5210264


In [137]:
tables[1].rename(columns={0: 'label', 1: 'value'}).set_index('label', drop=True)

Unnamed: 0_level_0,value
label,Unnamed: 1_level_1
Market Cap,123.916B
Beta (5Y Monthly),1.10
PE Ratio (TTM),21.78
EPS (TTM),6.35
Earnings Date,"Jul 18, 2022"
Forward Dividend & Yield,6.56 (4.75%)
Ex-Dividend Date,"Feb 10, 2022"
1y Target Est,144.16


In [145]:
dfs[0].to_csv('./data/banklist.csv', index=False)

In [146]:
!ls -l ./data/

total 20680
-rw-r--r-- 1 Meeno Meeno    21651 Oct  3  2019 P4-Movie-Ratings.csv
-rw-r--r-- 1 Meeno Meeno    80978 Oct  6  2019 Project-5-dataset.csv
-rw-r--r-- 1 Meeno Meeno  1008432 Jun  2  2020 WHO-COVID-19-global-data.csv
-rw-r--r-- 1 Meeno Meeno     1215 Apr 25 11:48 banklist.csv
-rw-r--r-- 1 Meeno Meeno  2745852 Aug 31  2018 commerce.csv
-rw-r--r-- 1 Meeno Meeno     2009 Apr 24 18:53 countryMap.csv
-rw-r--r-- 1 Meeno Meeno     4922 Jun  3  2020 countryMap.txt
-rw-r--r-- 1 Meeno Meeno    68733 Nov 13  2019 news.csv
-rw-r--r-- 1 Meeno Meeno   167283 Oct 25  2019 opera-house.jpg
-rw-r--r-- 1 Meeno Meeno   548918 Apr 24 18:52 population.csv
-rw-r--r-- 1 Meeno Meeno 16239776 Aug 31  2018 salaries.csv
drwxr-xr-x 1 Meeno Meeno        0 Apr 24  2020 stocks
-rw-r--r-- 1 Meeno Meeno   258560 Apr 24 13:51 world-population.xls


In [147]:
!head -n 5 ./data/banklist.csv

Bank NameBank,CityCity,StateSt,CertCert,Acquiring InstitutionAI,Closing DateClosing,FundFund
Almena State Bank,Almena,KS,15426,Equity Bank,"October 23, 2020",10538
First City Bank of Florida,Fort Walton Beach,FL,16748,"United Fidelity Bank, fsb","October 16, 2020",10537
The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020",10536
Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020",10535
