# NumPy

### Importing

In [103]:
import numpy as np

### Creating NumPy array

In [104]:
# A flat Python list produces a 1-D array.
np.array([1, 2, 3])

array([1, 2, 3])

**Creating NumPy "matrix"**

In [105]:
# Each inner list becomes one row of the resulting 2-D array.
np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

**Creating range of values**

In [106]:
# `stop` is exclusive, so this yields the integers 1 through 9.
np.arange(start=1, stop=10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

**Creating array of zeros**

In [107]:
# An integer shape gives a 1-D array; the elements are floats by default.
np.zeros(shape=10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

**Creating array of ones**

In [108]:
# Same call shape as np.zeros, filled with 1.0 instead.
np.ones(shape=10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

**Creating matrix of zeros**

In [109]:
# A tuple shape (rows, columns) gives a 2-D array.
np.zeros(shape=(3, 3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

**Creating array of evenly spaced numbers over an interval**

In [110]:
# `num` is the number of points, not the step; both endpoints are included.
np.linspace(start=0, stop=10, num=5)

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

**Creating identity matrix**

In [111]:
# N is the number of rows (and, by default, columns) of the identity matrix.
np.eye(N=4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

### Random

**Creating array with random samples from a uniform distribution [0, 1)**

In [112]:
# The argument is the length of the returned 1-D array; the values differ on every run.
np.random.rand(2)

array([0.71381756, 0.43899287])

**Creating matrix with random samples from a uniform distribution [0,1)**

In [113]:
# Each positional argument is one dimension: 3 rows x 3 columns.
np.random.rand(3, 3)

array([[0.75919733, 0.0679739 , 0.26547589],
       [0.51774124, 0.46453701, 0.56789173],
       [0.26463526, 0.54875296, 0.77185625]])

**Creating array with random samples from a normal distribution**

In [114]:
# Same calling convention as rand; unlike rand, the samples are unbounded and may be negative.
np.random.randn(2)

array([ 1.747555  , -0.07339614])

**Creating matrix with random samples from a normal distribution**

In [115]:
# Two arguments -> a 3 x 3 array of samples.
np.random.randn(3, 3)

array([[ 1.33835193, -0.17768739, -1.50243489],
       [-1.31896222, -1.03523424, -0.96602063],
       [ 0.59251914, -1.13947602, -1.76555754]])

**Creating random integer from [low, high)**

In [116]:
# `low` is inclusive and `high` is exclusive; with no size a single integer is returned.
np.random.randint(low=1, high=100)

97

**Creating array with integers**

In [117]:
# `size` turns the single draw into an array of that many integers.
np.random.randint(low=1, high=100, size=10)

array([27, 87,  9, 34,  9, 89, 74, 91, 45, 42])

### Others

**Reshaping matrix**

In [118]:
# Lay the nine values 0..8 out row by row in a 3 x 3 grid.
np.reshape(np.arange(9), (3, 3))

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

**Retrieving max value**

In [119]:
# 16 random integers in [1, 100). No seed is set in the cells shown, so the values
# (and every output below that depends on `arr`) are specific to this particular run.
arr = np.random.randint(1, 100, 16)
arr

array([34, 48, 56, 72, 92, 56, 28, 62,  4, 72, 71, 21, 68, 55, 17, 72])

In [120]:
# Largest element of the array.
np.max(arr)

92

**Retrieving min value**

In [121]:
# Smallest element of the array.
np.min(arr)

4

**Retrieving index of maximum value**

In [122]:
# Position (not value) of the largest element.
np.argmax(arr)

4

**Retrieving index of minimum value**

In [123]:
# Position (not value) of the smallest element.
np.argmin(arr)

8

**Reshaping**

In [124]:
# Rebind `arr` to a 4 x 4 view of the same 16 values; later cells use this 2-D form.
arr = np.reshape(arr, (4, 4))
arr

array([[34, 48, 56, 72],
       [92, 56, 28, 62],
       [ 4, 72, 71, 21],
       [68, 55, 17, 72]])

**Retrieving shape**

In [125]:
# Tuple with the size of each dimension: (rows, columns).
np.shape(arr)

(4, 4)

**Retrieving type**

In [126]:
# Element type of the array. NOTE(review): the int32 shown below is presumably the
# platform default integer for this run; other platforms/NumPy versions may show int64.
arr.dtype

dtype('int32')

**Fancy indexing**

In [127]:
# Indexing with a list of positions picks whole rows 0, 2 and 3, in that order.
arr[[0,2,3]]

array([[34, 48, 56, 72],
       [ 4, 72, 71, 21],
       [68, 55, 17, 72]])

### Selection

**Retrieving booleans based on condition**

In [128]:
# The comparison is applied element-wise, giving a boolean array with arr's shape.
arr > 50

array([[False, False,  True,  True],
       [ True,  True, False,  True],
       [False,  True,  True, False],
       [ True,  True, False,  True]])

**Retrieving values based on condition**

In [129]:
# Indexing with the boolean mask returns the matching values as a flat 1-D array.
arr[arr > 50]

array([56, 72, 92, 56, 62, 72, 71, 68, 55, 72])

### Array functions

**Calculating square root**

In [130]:
# Applied element-wise; the result is a float array even though arr holds integers.
np.sqrt(arr)

array([[5.83095189, 6.92820323, 7.48331477, 8.48528137],
       [9.59166305, 7.48331477, 5.29150262, 7.87400787],
       [2.        , 8.48528137, 8.42614977, 4.58257569],
       [8.24621125, 7.41619849, 4.12310563, 8.48528137]])

**Calculating sin**

In [131]:
# Applied element-wise; the values are interpreted as radians.
np.sin(arr)

array([[ 0.52908269, -0.76825466, -0.521551  ,  0.25382336],
       [-0.77946607, -0.521551  ,  0.27090579, -0.7391807 ],
       [-0.7568025 ,  0.25382336,  0.95105465,  0.83665564],
       [-0.89792768, -0.99975517, -0.96139749,  0.25382336]])

# Pandas

### Importing

In [132]:
import pandas as pd

### Series

**Creating series using list**

In [133]:
# Each value is paired with the index label at the same position.
pd.Series(
    data=[10, 20, 30],
    index=['a', 'b', 'c'],
)

a    10
b    20
c    30
dtype: int64

**Creating series using numpy array**

In [134]:
# The Series takes its dtype from the NumPy array it wraps.
pd.Series(
    data=np.array([10, 20, 30]),
    index=['a', 'b', 'c'],
)

a    10
b    20
c    30
dtype: int32

**Creating series using dictionary**

In [135]:
# Dictionary keys become the index labels, dictionary values become the data.
pd.Series(dict(a=10, b=20, c=30))

a    10
b    20
c    30
dtype: int64

### Dataframes

**Creating a dataframe**

In [136]:
# 4 x 3 frame of random samples with explicit row labels (index) and column labels.
df = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['A', 'B', 'C', 'D'],
    columns=['X', 'Y', 'Z'],
)
df

Unnamed: 0,X,Y,Z
A,-1.095055,-1.083413,-2.05536
B,0.314936,-0.197496,0.384491
C,-1.002759,0.274013,-0.326654
D,-0.178329,-0.94282,-0.877257


**Selecting multiple columns**

In [137]:
# A list of labels inside [] selects those columns and returns a DataFrame.
df[['X', 'Z']]

Unnamed: 0,X,Z
A,-1.095055,-2.05536
B,0.314936,0.384491
C,-1.002759,-0.326654
D,-0.178329,-0.877257


**Creating a new column**

In [138]:
# Assigning to a label that does not exist yet appends it as a new column;
# the addition is row-aligned, element by element.
df['New'] = df['X'].add(df['Y'])
df

Unnamed: 0,X,Y,Z,New
A,-1.095055,-1.083413,-2.05536,-2.178468
B,0.314936,-0.197496,0.384491,0.11744
C,-1.002759,0.274013,-0.326654,-0.728745
D,-0.178329,-0.94282,-0.877257,-1.121149


**Removing columns**

In [139]:
# drop returns a new frame; rebinding `df` avoids inplace=True, and the `columns=`
# keyword states the axis explicitly instead of relying on axis=1.
df = df.drop(columns='New')

**Removing rows**

In [140]:
# Same pattern for rows: `index=` names the row label to remove, and the result is
# rebound to `df` rather than mutating the frame with inplace=True.
df = df.drop(index='D')

**Selecting row by label**

In [141]:
# .loc looks rows up by index label and returns the row as a Series.
df.loc['C']

X   -1.002759
Y    0.274013
Z   -0.326654
Name: C, dtype: float64

**Selecting row by integer position**

In [142]:
# .iloc is position-based: 2 is the third row, which is 'C' here (same row as df.loc['C']).
df.iloc[2]

X   -1.002759
Y    0.274013
Z   -0.326654
Name: C, dtype: float64

**Selecting an element**

In [143]:
# (row position, column position): first row, third column ('Z').
df.iloc[0, 2]

-2.0553604439053417

**Selecting multiple elements**

In [144]:
# Slices work on both axes: the first two rows, and every column from position 1 onward.
df.iloc[:2, 1:]

Unnamed: 0,Y,Z
A,-1.083413,-2.05536
B,-0.197496,0.384491


**Conditional selection**

In [145]:
# Combine the two element-wise conditions with & and keep only the rows where both hold.
df[df['X'].lt(0) & df['Y'].gt(0)]

Unnamed: 0,X,Y,Z
C,-1.002759,0.274013,-0.326654


**Resetting index**

In [146]:
# Returns a new frame with the old index moved into an 'index' column and a fresh
# 0..n-1 index; `df` itself is not modified because the result is not assigned.
df.reset_index()

Unnamed: 0,index,X,Y,Z
0,A,-1.095055,-1.083413,-2.05536
1,B,0.314936,-0.197496,0.384491
2,C,-1.002759,0.274013,-0.326654


**Setting index**

In [147]:
# Add the label column and promote it to the index in one chain, rebinding `df`
# instead of mutating it in two steps with inplace=True.
df = df.assign(New_Index=['K', 'L', 'M']).set_index('New_Index')
df

Unnamed: 0_level_0,X,Y,Z
New_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
K,-1.095055,-1.083413,-2.05536
L,0.314936,-0.197496,0.384491
M,-1.002759,0.274013,-0.326654


### Missing data

In [148]:
# Small frame with gaps: A has one NaN, B has two, C is complete.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


**Dropping missing values**

In [149]:
# thresh=2 keeps only the rows that have at least two non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


**Filling missing values**

In [150]:
# df.mean() is computed per column, so each NaN is replaced by the mean of its own column.
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


### Groupby

In [151]:
# Two sales records per company; used by the groupby examples that follow.
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


**Grouping with aggregate function**

In [152]:
# numeric_only=True restricts the aggregation to numeric columns. Without it, recent
# pandas versions raise a TypeError when asked to average the string column 'Person'.
df.groupby('Company').mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


**Grouping with describe**

In [153]:
# Summary statistics of Sales computed separately for each company; the result has
# two-level column labels (column name, statistic).
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


### Merging, joining, and concatenating

**Concatenating**

In [154]:
# Three frames with identical columns and consecutive index labels (0-3, 4-7, 8-11).
df1 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    {
        'A': ['A4', 'A5', 'A6', 'A7'],
        'B': ['B4', 'B5', 'B6', 'B7'],
        'C': ['C4', 'C5', 'C6', 'C7'],
        'D': ['D4', 'D5', 'D6', 'D7'],
    },
    index=[4, 5, 6, 7],
)
df3 = pd.DataFrame(
    {
        'A': ['A8', 'A9', 'A10', 'A11'],
        'B': ['B8', 'B9', 'B10', 'B11'],
        'C': ['C8', 'C9', 'C10', 'C11'],
        'D': ['D8', 'D9', 'D10', 'D11'],
    },
    index=[8, 9, 10, 11],
)
# One print call with a newline separator emits the same text as three separate prints.
print(df1, df2, df3, sep='\n')

    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
      A    B    C    D
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11


In [155]:
# axis=0 stacks the frames vertically; the original index labels are kept.
pd.concat([df1, df2, df3], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


**Merging**

In [156]:
# Two frames sharing the key columns key1/key2; only some key pairs occur in both.
left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
# One print call with a newline separator emits the same text as two separate prints.
print(left, right, sep='\n')

  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3


In [157]:
# Inner merge: keep only the (key1, key2) pairs that occur in both frames.
left.merge(right, how='inner', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


**Joining**

In [158]:
# Two frames keyed by their index labels; K1 exists only in left, K3 only in right.
left = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2'],
    },
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {
        'C': ['C0', 'C2', 'C3'],
        'D': ['D0', 'D2', 'D3'],
    },
    index=['K0', 'K2', 'K3'],
)
# One print call with a newline separator emits the same text as two separate prints.
print(left, right, sep='\n')

     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3


In [159]:
# join aligns on the index and keeps every row of `left`: K1 has no match in `right`
# and gets NaN, while K3 (present only in `right`) is dropped.
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


### Operations

In [160]:
# Sample frame for the operations below; note that 444 appears twice in col2.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


**Retrieves unique elements**

In [161]:
# Distinct values of the column, returned as a NumPy array.
df['col2'].unique()

array([444, 555, 666], dtype=int64)

**Retrieves number of unique elements**

In [162]:
# Number of distinct values in the column.
df['col2'].nunique()

3

**Retrieves number of occurrences of each unique element**

In [163]:
# How many times each distinct value occurs, most frequent first.
df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

**Applying functions**

In [164]:
# apply calls the function once per element; for simple arithmetic like this,
# df['col1'] * 2 gives the same result without a Python-level call per value.
df['col1'].apply(lambda x : x * 2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

**Retrieving columns**

In [165]:
# Column labels, returned as an Index object.
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

**Retrieving indexes**

In [166]:
# Row labels; this frame was built without an explicit index, so it has a RangeIndex.
df.index

RangeIndex(start=0, stop=4, step=1)

**Ordering / Sorting**

In [167]:
# Returns a sorted copy (ascending here); the original row labels travel with their rows.
df.sort_values(by='col2')

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


**Checking for null values**

In [168]:
# Boolean frame of the same shape: True wherever a value is missing (none are, here).
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


**Describing the dataframe**

In [169]:
# Only the numeric columns (col1, col2) are summarised; the string column col3 is left out.
df.describe()

Unnamed: 0,col1,col2
count,4.0,4.0
mean,2.5,527.25
std,1.290994,106.274409
min,1.0,444.0
25%,1.75,444.0
50%,2.5,499.5
75%,3.25,582.75
max,4.0,666.0


**Retrieving information**

In [170]:
# Prints a summary: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
col1    4 non-null int64
col2    4 non-null int64
col3    4 non-null object
dtypes: int64(2), object(1)
memory usage: 176.0+ bytes


### Data input and output

**Reading from csv**

In [171]:
# df = pd.read_csv('file_name.csv')

**Writing to a csv**

In [172]:
# df.to_csv('file_name.csv', index=False)

**Reading from excel**

In [173]:
# df = pd.read_excel('file_name.xlsx', sheet_name='sheet_name')

**Writing to excel**

In [174]:
# df.to_excel('file_name.xlsx', sheet_name='sheet_name')

**Reading from HTML**

In [175]:
# read_html returns a list of DataFrames (one per table found on the page), so despite
# its name `df` is a list here and the first table is df[0].
# Needs network access; the page content (and therefore the output) can change over time.
df = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
df[0].head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date
0,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020"
1,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020"
2,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019"
3,Resolute Bank,Maumee,OH,58317,Buckeye State Bank,"October 25, 2019"
4,Louisa Community Bank,Louisa,KY,58112,Kentucky Farmers Bank Corporation,"October 25, 2019"


# Python

**Lambda**

In [6]:
# A lambda is an anonymous single-expression function; binding it to a name
# makes it callable like a function defined with def.
x = lambda a : a + 10
x(5)

15

**Filtering**

In [3]:
# filter keeps the items for which the function returns a truthy value, so the
# function must be a predicate. Here only the strictly positive numbers survive.
list(filter(lambda x: x > 0, [-1, 0, 1, 2, 3]))

[1, 2, 3]

**List comprehension**

In [176]:
# The trailing `if` filters first (x > 1); the leading expression then transforms the survivors.
[x * 2 for x in [1, 2, 3] if x > 1]

[4, 6]

**Removing punctuations from a string**

In [177]:
import string
# maketrans('', '', chars) builds a translation table that deletes every character in
# chars; string.punctuation supplies the ASCII punctuation characters.
'...Str@ing!'.translate(str.maketrans('', '', string.punctuation))

'String'

**Counting how many times a string appears in a column regardless of any characters before and after the string**

In [178]:
# Build the single-column frame directly from a dict instead of wrapping a Series.
df = pd.DataFrame({'X': ['@!,StrIng.>*', '...NotString...', 'Kartoffeln']})
# str.contains matches the substring anywhere in each value (case-insensitively here);
# summing the resulting boolean Series counts the matches without a Python-level loop.
df['X'].str.contains('String', case=False).sum()

2

**Correlation between two columns**

In [179]:
# df[['col_name1', 'col_name2']].corr()