##### Python Basics

In [1]:
import numpy as np

- __List__
    - A list in Python is a collection of items which can contain elements of multiple data types, which may be either numeric, character logical values, etc.
    
- __array__
    - An array is a vector containing homogeneous elements i.e. belonging to the same data type.

In [2]:
data1 = [1,2,3,4,5]      #list
print(data1)
arr1 = np.array(data1)
print(arr1)

[1, 2, 3, 4, 5]
[1 2 3 4 5]


In [3]:
sample_list = [2,'bash',[2,3,4]]
print(sample_list)

[2, 'bash', [2, 3, 4]]


- Creating arrays

In [4]:
dat1 = np.zeros(5)
print(dat1)

dat11 = np.arange(5)
print(dat11)

dat2 = np.ones((3,6))
print(dat2)

dat3 = np.linspace(1,9,5)  #1 to 9 (inclusive) with 5 points
print(dat3)

dat4 = np.logspace(0,3,4)  # 10^0 to 10^3 (inclusive) with 4 points
print(dat4)

[0. 0. 0. 0. 0.]
[0 1 2 3 4]
[[1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]]
[1. 3. 5. 7. 9.]
[   1.   10.  100. 1000.]


- Examining the arrays

In [5]:
dtype1 = (dat1.dtype)
print(dtype1)

print(arr1.dtype)

print(dat2.ndim)   #2 

print(dat2.shape)   #(3,6) - axis 0 is rows, axis 1 is columns

print(dat2.size)   # total number of elements

print(len(dat2))  # size of the first dimension (3,6)

float64
int64
2
(3, 6)
18
3


- Reshaping the array

In [6]:
arr = np.arange(10, dtype=float)
print(arr, arr.shape)

arr1 = (arr.reshape((2,5)))
print(arr1, arr1.shape)

[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.] (10,)
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]] (2, 5)


- Transpose of a array

In [7]:
arr2 = arr1.T
print(arr2)

[[0. 5.]
 [1. 6.]
 [2. 7.]
 [3. 8.]
 [4. 9.]]


- __Flatten:__
    always returns a flat copy of the original array

In [8]:
arr1_flt = arr1.flatten()
print(arr1_flt)

#replacing first element in array
arr1_flt[0] = 23
print(arr1_flt)

#printing particular element in a array
print(arr1_flt[2])

#printing some element in a array
print(arr1_flt[5:8])

#printing lat element in an array
print(arr1_flt[-1])

#print except lat element
print(arr1_flt[:-1])

[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
[23.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
2.0
[5. 6. 7.]
9.0
[23.  1.  2.  3.  4.  5.  6.  7.  8.]


- __For 2D arrays, sequential move in the memory will:__
    - iterate over rows (axis 0)
        - iterate over columns (axis 1)
        
- __For 3D arrays, sequential move in the memory will:__
    - iterate over plans (axis 0)
        - iterate over rows (axis 1)
            - iterate over columns (axis 2)

In [9]:
xx = np.arange(2 * 3 * 4)
print(xx)

#Reshaping into 3D (axis 0, axis 1, axis 2)
xx = xx.reshape(2,3,4)
print("3D array")
print(xx)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
3D array
[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]


In [10]:
#slection get first plan
print(xx[0, :, :])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [11]:
#selection first rows
print(xx[:, 1, :])

[[ 4  5  6  7]
 [16 17 18 19]]


In [12]:
#selection get second columns
print(xx[:,:,2])

[[ 2  6 10]
 [14 18 22]]


**print the original array using RAVEL**

In [13]:
print(xx.ravel())

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]


**Stack arrays**

In [14]:
a = np.array([0, 1])
b = np.array([2, 3])

ab = np.stack((a,b))
print(ab)

[[0 1]
 [2 3]]


**Selection**

In [15]:
data = np.arange(10, dtype=float).reshape((2,5))
print(data)
print(data.shape)

print(data[0]) 
print(data[0, 3]) #row 0, column 3
print(data[0][3]) # alternative syntax

[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]
(2, 5)
[0. 1. 2. 3. 4.]
3.0
3.0


**Slicing**

- Syntax: start:stop:step with start (default 0) stop (default last) step (default 1)

In [16]:
print(data[0, :])  #row 0: returns 1d array

print(data[:, 0]) #column 0

print(data[:, :2]) # first two columns

print(data[:, 2:]) #columns after index 2 included

data2 = data[:, 1:4] # columns between index 1 (included) and 4 (excluded)
print(data2)

#Row 0 reverse order
print(data[0, ::-1])

[0. 1. 2. 3. 4.]
[0. 5.]
[[0. 1.]
 [5. 6.]]
[[2. 3. 4.]
 [7. 8. 9.]]
[[1. 2. 3.]
 [6. 7. 8.]]
[4. 3. 2. 1. 0.]


In [17]:
ndata = data[:, [1,2,3]] #return a copy
print(ndata)

ndata[0,1] = 33
print(ndata)

[[1. 2. 3.]
 [6. 7. 8.]]
[[ 1. 33.  3.]
 [ 6.  7.  8.]]


In [18]:
#Boolean arrays indexing
print(data)
ndat1 = data[data>5] # reurn a copy
print(ndat1)

[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]
[6. 7. 8. 9.]


In [19]:
newdat = data #return a copy into newdat
print(newdat)

# Boolean indexing
newdat[newdat > 5] =0

[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [20]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob'])
print(names)

names[names != 'Bob'] = 'Joe'
print(names)

print(np.unique(names))

['Bob' 'Joe' 'Will' 'Bob']
['Bob' 'Joe' 'Joe' 'Bob']
['Bob' 'Joe']


- __list:__
    _is a collection which is ordered and changeable. Allows duplicate memebers_

In [21]:
list1 = ["apple", "banana", "cherry"]
list2 = [1, 5, 7, 9, 3]
list3 = [True, False, False]
list4 = ["abc", 34, True, 40, "male"]
print(list4)

['abc', 34, True, 40, 'male']


- __Tuple:__ _is a collection which is unordered and unchangeable. Allows duplicate members_

In [22]:
tuple1 = ("apple", "banana", "cherry")
tuple2 = (1, 5, 7, 9, 3)
tuple3 = (True, False, False)
tuple4 = ("abc", 34, True, 40, "male")
print(tuple4)

('abc', 34, True, 40, 'male')


In [23]:
thistuple = tuple(("apple", "banana", "cherry"))
print(thistuple)

('apple', 'banana', 'cherry')


- __Set:__ _is a collection which is unordered and unindexed. No duplicate memebers_

In [24]:
set1 = {"apple", "banana", "cherry", "Cherry"} #= {"apple", "banana", "cherry"}
set2 = {1, 5, 7, 7,1,9, 3} #= {1,5,7,9,3}
set3 = {True, False, False} #= {True, False}


In [25]:
thisset = set(("apple", "banana", "cherry","cherry"))
print(thisset)

{'cherry', 'apple', 'banana'}


- __Dictionary:__ _is a collection which is unordered and changeable. No duplicate memebers_

In [26]:
thisdict =	{
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}
print(thisdict)

{'brand': 'Ford', 'model': 'Mustang', 'year': 1964}


In [27]:
print(thisdict["brand"])

Ford


**Vectorized operations**

In [28]:
import numpy as np

nums = np.arange(5)
nums * 10 # multiply each element by 10
print(nums)

nums = np.sqrt(nums) # square root of each element
print(nums)

numceil = np.ceil(nums) # also floor, rint (round to nearest int)
print(numceil)

nancheck = np.isnan(nums) # checks for NaN
print(nancheck)

newnums = nums + np.arange(5) # add element-wise
print(newnums)

np.maximum(nums, np.array([1, -2, 3, -4, 5])) # compare element-wise


[0 1 2 3 4]
[0.         1.         1.41421356 1.73205081 2.        ]
[0. 1. 2. 2. 2.]
[False False False False False]
[0.         2.         3.41421356 4.73205081 6.        ]


array([1.        , 1.        , 3.        , 1.73205081, 5.        ])

**Math and Stats**

In [29]:
dta = np.random.randn(4,2)
print(dta)
print("\n")

print("Mean: ",dta.mean())
print("Std: ", dta.std())
print("Index of minimum element: ", dta.argmin())
print("Sum: ",dta.sum())
print("Sum of columns: ",dta.sum(axis=0))
print("Sum of rows: ", dta.sum(axis=1))

[[-0.97203667 -0.67516424]
 [-0.35380944 -2.6157801 ]
 [-0.00757063 -0.01173546]
 [ 0.89980944  0.51868443]]


Mean:  -0.40220033369365543
Std:  1.0095143282207975
Index of minimum element:  3
Sum:  -3.2176026695492435
Sum of columns:  [-0.43360729 -2.78399538]
Sum of rows:  [-1.64720091 -2.96958954 -0.01930609  1.41849387]


**Methods for boolean arrays**

In [30]:
print("Count number of positive values: ",(dta > 0).sum())
print("Check if any value is True: ",(dta >0).any())
print("Check if all values are True: ", (dta > 0).all())

Count number of positive values:  2
Check if any value is True:  True
Check if all values are True:  False


**Create Random numbers**

In [31]:
np.random.seed(12234)
#2 x 3 matrix in [0, 1]
datt1 =np.random.rand(2, 3)
print(datt1)

#random normals (mean 0 and sd 1)
datt2 = np.random.randn(4)
print(datt2)

#5 randomly picked 0 or 1
datt3 = np.random.randint(0, 2, 5)
print(datt3)

[[0.00630595 0.20303476 0.76478993]
 [0.55513384 0.74358546 0.93777808]]
[-0.27996207  1.31281104 -0.92715578 -0.40130217]
[1 0 0 0 1]


##### Pandas Data frames

In [32]:
import pandas as pd
df = pd.DataFrame({"a" : [1,2,3], "b" :[4,5,6], "c":[10,11,12]})
df

Unnamed: 0,a,b,c
0,1,4,10
1,2,5,11
2,3,6,12


In [33]:
A = [[1,2],[3,4],[5,6]]
idx = ['coper1','coper2','coper3']
cols = ['asset1','asset2']

df2 = pd.DataFrame(A,index = idx, columns = cols)
df2

Unnamed: 0,asset1,asset2
coper1,1,2
coper2,3,4
coper3,5,6


- __Create a DataFrame df from this dictionary data which has the index labels.__

In [34]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df3 = pd.DataFrame(data, index=labels)
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


- Display the data frame information

In [35]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


- display the first 3 rows of the Data frame 

In [36]:
# df3.head(3)
# or
df3.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [37]:
df3.tail()

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


- Display the statistics

In [38]:
df3.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


- Select just 'animal' and 'age' columns from the DataFrame

In [39]:
# df3.loc[:,['animal','age']]
# or
df3[['animal','age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


- Select the data in rows [3, 4, 8] and in columns ['animal', 'age']

In [40]:
df3.loc[df3.index[[3,4,8]],['animal','age']]

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


- Select only the rows where the number of visits is greater than 3

In [41]:
df3[df3['visits']>2]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
d,dog,,3,yes
f,cat,2.0,3,no


In [42]:
# Pandas_rearange_tables.ipynb

In [43]:
#creating a another DataFrame
dfm = pd.DataFrame({'month':['Jan','Feb', 'Mar', 'Apr', 'May', 'Jun'],
                  'eggs':[47,110,221,77,132,205],
                  'chocos':[12.0,50.0,89.,87.0,'nan',60.0],
                  'bikis':[17,31,72,20,52,55]})
dfm

Unnamed: 0,month,eggs,chocos,bikis
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [44]:
#interchange columns
dfm_new = dfm[["month","chocos","bikis","eggs"]]
dfm_new

Unnamed: 0,month,chocos,bikis,eggs
0,Jan,12.0,17,47
1,Feb,50.0,31,110
2,Mar,89.0,72,221
3,Apr,87.0,20,77
4,May,,52,132
5,Jun,60.0,55,205


In [45]:
# picking the particular value
pck1 = dfm.iloc[4,3]
print(pck1)

52


In [46]:
#picking some values
pck2 = dfm['eggs'][1:4]
pck2

1    110
2    221
3     77
Name: eggs, dtype: int64

In [47]:
dfm

Unnamed: 0,month,eggs,chocos,bikis
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [48]:
#renaming the column name
dfm1 = dfm.rename(columns = {'eggs':"candys"})
dfm1

Unnamed: 0,month,candys,chocos,bikis
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [None]:
# unsupported operand type(s) for +: 'int' and 'str'
# dfm['all'] = dfm['eggs']+dfm['chocos']+dfm['bikis']

In [49]:
dfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   month   6 non-null      object
 1   eggs    6 non-null      int64 
 2   chocos  6 non-null      object
 3   bikis   6 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 320.0+ bytes


In [53]:
#chocos data type is object
#change to float
dfm['chocos'] = dfm['chocos'].astype(float)
dfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   month   6 non-null      object 
 1   eggs    6 non-null      int64  
 2   chocos  5 non-null      float64
 3   bikis   6 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 320.0+ bytes


In [55]:
#row / observations sum and create a new column
dfm['all'] = dfm['eggs']+dfm['chocos']+dfm['bikis']
dfm

Unnamed: 0,month,eggs,chocos,bikis,all
0,Jan,47,12.0,17,76.0
1,Feb,110,50.0,31,191.0
2,Mar,221,89.0,72,382.0
3,Apr,77,87.0,20,184.0
4,May,132,,52,
5,Jun,205,60.0,55,320.0


In [59]:
#display what are the columns or features
colmns = dfm.columns
colmns
# list(colmns)

Index(['month', 'eggs', 'chocos', 'bikis', 'all'], dtype='object')

In [63]:
#grouping 
group_single = dfm.groupby('month').agg({'eggs':['mean','min','max']})
group_single

Unnamed: 0_level_0,eggs,eggs,eggs
Unnamed: 0_level_1,mean,min,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Apr,77,77,77
Feb,110,110,110
Jan,47,47,47
Jun,205,205,205
Mar,221,221,221
May,132,132,132


In [64]:
dfm.describe()

Unnamed: 0,eggs,chocos,bikis,all
count,6.0,5.0,6.0,5.0
mean,132.0,59.6,41.166667,230.6
std,69.258934,31.516662,21.885307,120.990909
min,47.0,12.0,17.0,76.0
25%,85.25,50.0,22.75,184.0
50%,121.0,60.0,41.5,191.0
75%,186.75,87.0,54.25,320.0
max,221.0,89.0,72.0,382.0


In [67]:
dfm.chocos.std()

31.51666225982694