In [7]:
# numpy and scipy practice
# numpy arrays are sets of N dimensional points and perform operations on them


In [8]:
import numpy as np

In [9]:
# 1D array is similar to a list
array1d =np.array([0,2,4,6,8])
print (array1d)

[0 2 4 6 8]


In [10]:
# 2D array is similar to grid
array2d = np.array([[0,2,4,6,8],[0,2,4,6,8]])
print (array2d)

[[0 2 4 6 8]
 [0 2 4 6 8]]


In [11]:
# type returns the data type 
print (type(array1d))
print (type(array2d))
#shape is a tuple which contains the size of the array ( #rows, #cols)
print (array1d.shape)
print (array2d.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(5,)
(2, 5)


In [12]:
# element indexing done similar to lists in py
print (array1d[0])
print (array2d[0,0])

0
0


In [13]:
# py list synthax also works with : to specify from begining to end, etc
# colon includes starting point, but excludes ending point
print (array1d[1:]) # returns all elements starting with 2nd


[2 4 6 8]


In [14]:
print (array2d[1,1:]) #returns elements in the 2nd row 

[2 4 6 8]


In [15]:
# result of numpy array is a subset of original array
subarray2d = array2d[1,1:]
print (type(subarray2d))

<class 'numpy.ndarray'>


In [16]:
#other ways to index numpy arrays
#symmetrical arrays
lists = [[1,2,3,4,5,6,7],[1,2,3,4,5,6,7],[1,2,3,4,5,6,7]]
new2dArray=np.array(lists)
newSubArray=new2dArray[[1,0,2],[2,4,3]]
newSubArray

#nonsymmetrical arrays
length = len(sorted(lists,key=len, reverse=True)[0])
new2dArray=np.array([elems+[None]*(length-len(elems)) for elems in lists])
newSubArray=new2dArray[[1,0,2],[2,4,3]]
newSubArray

array([3, 5, 4])

In [20]:
#conditional arrays
newSubArray=[new2dArray>3]
newSubArray

[array([[False, False, False,  True,  True,  True,  True],
        [False, False, False,  True,  True,  True,  True],
        [False, False, False,  True,  True,  True,  True]], dtype=bool)]

In [29]:
#array creation via np, size passed in as a tuple
#array of 0
arrayOfZeros = np.zeros((2,2), dtype='int64')

#array of 1
#default value is float
arrayOfOnes = np.ones((1,2))

# array with same size and shape as passed in array, filled with specified element
arrayOfSevens = np.full(new2dArray.shape, 7)

# 2x2 identity matrix, all diagonal elements are 1 and all non diag are 0
identityMatrix = np.eye(2)

print (arrayOfZeros,'\n', arrayOfOnes, '\n',arrayOfSevens,'\n', identityMatrix)

[[0 0]
 [0 0]] 
 [[ 1.  1.]] 
 [[7 7 7 7 7 7 7]
 [7 7 7 7 7 7 7]
 [7 7 7 7 7 7 7]] 
 [[ 1.  0.]
 [ 0.  1.]]


In [31]:
# array of random numbers, numpy documentation for different distributions
arrayOfRandom = np.random.random((2,2))
arrayOfRandom

array([[ 0.42573945,  0.63049447],
       [ 0.22158613,  0.72614589]])

In [38]:
#change array shape and size
transposeArray = np.transpose(new2dArray)
print (new2dArray.shape, transposeArray.shape)
#can reshape into any shape if the total # of elements is the same
reshapedArray = np.reshape(new2dArray, [1,21])
print (reshapedArray.shape)

(3, 7) (7, 3)
(1, 21)


In [41]:
#mathematical operations : +, - , / , *  for element wise operations on 2 numpy arrays
#all arrays must be of same dimension
array1 = np.array([[1,2,3],[4,5,6]])
array2 = np.array([[1,2,3],[4,5,6]])
array3 = array1 + array2
array3

#instead of math operators, specific functions exist
# np.add np.subtract, np.divide, np.multiply

array([[ 2,  4,  6],
       [ 8, 10, 12]])

In [42]:
#broadcasting is a way to do math on different size arrays
#must align at least along one dimension

array4 =[[1,4,7]]
array1 + array4

array([[ 2,  6, 10],
       [ 5,  9, 13]])

In [45]:
# important operation : inner product of 2 arrays
# or matrix multiplication ( dot product of every row of the first matrix
# with every column of the right side matrix for all rows )
# dot product = inner product
# number of left matrix rows = number of right matrix columns for this to work
# .T short hand for transpose
np.dot(array1, array2.T)


array([[14, 32],
       [32, 77]])

In [46]:
# You can add / multiply all elements along 1 dimension ( axis)
# similar to compressing array along 1 axis
#rows
np.sum(array1, axis=0)

array([5, 7, 9])

In [47]:
#columns
np.sum(array1, axis=1)

array([ 6, 15])

In [51]:
#merging arrays is done via hstack or vstack, these take in
# a list or tuple array of arrays then stacks the mvertically
arrayNew = np.vstack((array1, array2))



array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [52]:
#Scipy has many modules that help compute mathematical functions
#distance between 2 points, derivatives, integrals,
#spatial module given 2 points we can find 
# could be eucledian distance, cosine distance, etc, correlation , hamming
#check docs

from scipy.spatial.distance import correlation, cosine, pdist, squareform

In [53]:
array1=np.array([0,1,0])
array2=np.array([1,0,0])
correlation(array1, array2)

1.5

In [55]:
#computes pairwise distance between all rows of allPoints
# d will be square matrix with d[i,j] being the Euclidean dist
#between allpoints[i,:] and allpoints[j,:]
# documentation found in Scipy doc

allPoints =np.vstack([array1,array2])
d=squareform(pdist(allPoints, 'euclidean'))


print(d)

[[ 0.          1.41421356]
 [ 1.41421356  0.        ]]


In [1]:
# check for subString in stringVar
stringVar = 'darkgrey'
if 'grey' in stringVar:
    print (True)

True


In [4]:
# Regular expressions / Regex / Re
# check for 'grey' or 'white 
# use pipe 'grey|white'
# () used to separate or group
# example 'gr(a|e)y' gray or grey
# search for repeating patterns and number of pattern repeats
# with ?, *, +, {n}, {min,max} - must be consecutive repeats
# ? matches 0 or 1 occurences of the previous element
# colou?r -> color or color
# * matches 0 or more appearances
# colou*r -> color, colour colouur, etc
# + similar to star, but atleast 1 occurence cant be 0 occurences
# {n} exactly n times
# 'colou{2}r' -> colouur
# {n,} -> at least 2 times
# {m,n} -> at least m at most n occurences
# specify which set of characters can be matched
# [bhc]at -> bat, hat, cat
# [a-z] -> a,b,c,d,e,....z
# [a-z0-9] a-z or 0-9
# [a-cx-z] a,b,c,x,y,z
# regex for a word with 3 letters:
# '\s[a-z]{3}\s' \s is simply the space
#  can also specify which chars should not be patch
# [^bhc]at -> all words ending in at except bat,hat,cat
# any word that matches a hashtag '#[a-z0-9]+\s'
# find position where pattern occurs re.search()
# if exists it returns the first occurence(object) or none if no occurence

import re 

email = "tony@tiremove_thisger.net"
m = re.search("remove_this", email)
print (email[:m.start()])



tony@ti
tony@tiremove_this


In [5]:
#finding all the occurences
# re.findall() , re.finditer()
tweet = "#mondays #mondayblues I hate Mondays!"
#returns a list of strings
re.findall(r"#[a-z]+", tweet)

['#mondays', '#mondayblues']

In [29]:
# finditer() can be used to return objects
# and to know the positions as well as the text
# link to useful  https://www.machinelearningplus.com/python/python-regex-tutorial-examples/
# re.sub replaces  (whatToReplace, replaceWith, containingbjectOrString)

In [30]:
# Sentiment Analysis Implementation

In [None]:
# accept a user Search term, and return current 
# sentiment positive/ negative

