In [1]:
#---Numerical Python Library (NumPy)---
import numpy as np
import math

In [2]:
#creating a one-dimensional array from a Python list
a = np.array([1, 2, 3])
print(a)
#ndim is an attribute of the array (not of the list) holding its number of dimensions
print(a.ndim)

[1 2 3]
1


In [3]:
#passing a list of lists creates a multi-dimensional array; two inner lists of three values give a matrix
b = np.array([[1, 2, 3],[4, 5, 6]])
b

array([[1, 2, 3],
       [4, 5, 6]])

In [4]:
#the shape attribute is a tuple with the length of each dimension: (rows, columns) for a matrix
b.shape

(2, 3)

In [5]:
#dtype is the type of the items stored in the array
a.dtype

dtype('int64')

In [6]:
#array with float values: the list mixes floats with the integer 5, and dtype.name gives the resulting type as a string
c = np.array([2.2, 5, 1.1])
c.dtype.name

'float64'

In [7]:
c #display the data stored in the array

array([2.2, 5. , 1.1])

In [9]:
#zeros and ones build arrays of the same requested shape that differ only in the fill value
grid_shape = (2, 3)
d = np.zeros(grid_shape)
e = np.ones(grid_shape)
#the "\n" item leaves the same blank separator between the two printed arrays
print(d, "\n", e, sep="\n")

[[0. 0. 0.]
 [0. 0. 0.]]


[[1. 1. 1.]
 [1. 1. 1.]]


In [10]:
#2x3 array with random numbers; no seed is set here, so the values change on every run
np.random.rand(2,3)

array([[0.49596332, 0.79952405, 0.24513102],
       [0.53412446, 0.20963588, 0.82200998]])

In [13]:
#arange(start, stop, step): every even number from ten (inclusive) to fifty (exclusive)
f = np.arange(10, 50, 2)
f

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
       44, 46, 48])

In [14]:
#for a sequence of floats we can use linspace; the third argument is the total number of items to generate
np.linspace(0, 2, 15) #15 evenly spaced numbers from 0 (inclusive) to 2 (inclusive)

array([0.        , 0.14285714, 0.28571429, 0.42857143, 0.57142857,
       0.71428571, 0.85714286, 1.        , 1.14285714, 1.28571429,
       1.42857143, 1.57142857, 1.71428571, 1.85714286, 2.        ])

In [15]:
#arithmetic operators work element by element on arrays of the same shape

a = np.array([10, 20, 30, 40])
b = np.array([1, 2, 3, 4])

#element-wise difference, then element-wise product
c = np.subtract(a, b)
d = np.multiply(a, b)
for result in (c, d):
    print(result)

[ 9 18 27 36]
[ 10  40  90 160]


In [18]:
#a temperature converter
#note: the names are spelled farenheit/celcius here and the following cells rely on that spelling
farenheit = np.array([0, -10, -5, -15, 0])
#conversion formula C = (F - 32) * 5/9, applied to every element of the array at once
celcius = (farenheit - 32) * (5/9)
celcius

array([-17.77777778, -23.33333333, -20.55555556, -26.11111111,
       -17.77777778])

In [19]:
#boolean arrays: comparing an array with a scalar gives one True/False per element
celcius > - 20

array([ True, False, False, False,  True])

In [20]:
#modulus operator combined with a comparison: True where the remainder of the division by 2 is exactly 0
#(the converted temperatures are not whole numbers, so every entry comes out False)
celcius%2 == 0

array([False, False, False, False, False])

In [23]:
#matrix manipulation: * multiplies element by element, @ computes the matrix product
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
elementwise = A * B
matrix_product = A @ B
#the "\n" item leaves the same blank separator between the two printed results
print(elementwise, "\n", matrix_product, sep="\n")

[[2 0]
 [0 4]]


[[5 4]
 [3 4]]


In [24]:
#A was built from two rows of two values, so it is a 2x2 matrix
A.shape

(2, 2)

In [26]:
#upcasting example: one array of integers and one array of floats with the same shape

array1 = np.array([[1,2,3], [4,5,6]])
print(array1.dtype)

array2 = np.array([[7.1,8.2,9.1,], [10.4,11.2,12.3]])
print(array2.dtype)

int64
float64


In [28]:
#adding the integer array to the float array: the result is upcast to the float type
array3 = array1+array2
print(array3)
print(array3.dtype)

[[ 8.1 10.2 12.1]
 [14.4 16.2 18.3]]
float64


In [29]:
#aggregate functions over all elements: sum, maximum, minimum and mean, printed in that order
for aggregate in (array3.sum, array3.max, array3.min, array3.mean):
    print(aggregate())

79.3
18.3
8.1
13.216666666666667


In [30]:
#the integers 1 to 15 laid out as 3 rows of 5 columns
b = np.reshape(np.arange(1, 16), (3, 5))
print(b)

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]]


In [32]:
from PIL import Image #python image library
from IPython.display import display

In [None]:
#open the image file and render it inline in the notebook
im = Image.open('imagename.tiff')
display(im)

In [None]:
#to convert the image to a numpy array and inspect its shape and values
array=np.array(im)
print(array.shape)
array

In [None]:
#an array with the same shape as the image array in which every value is 255
mask = np.full(array.shape, 255)
mask

In [None]:
#subtracting the mask (all 255s) from the image array
modified_array = array-mask

#converting all of the negative values to positive by multiplying by -1
modified_array = modified_array * -1

#casting the values to uint8 so the array has the datatype expected for an image
modified_array = modified_array.astype(np.uint8)
modified_array

In [None]:
#to display the new array, we use fromarray to convert the numpy array into an object jupyter can render
display(Image.fromarray(modified_array))

In [None]:
#we can change the number of rows and columns if we like
#NOTE(review): the target shape (100,400) is hard-coded, so this presumably expects an image with 40,000 values - confirm before using another image
reshaped = np.reshape(modified_array,(100,400))
print(reshaped.shape)
display(Image.fromarray(reshaped))

In [34]:
#indexing: a single integer selects one element of a one-dimensional array (positions start at 0)
a = np.array([1,3,5,7])
a[2]

5

In [35]:
#for multidimensional arrays, we use integer array indexing
a = np.array([[1,2], [3,4], [5,6]])
a

array([[1, 2],
       [3, 4],
       [5, 6]])

In [36]:
#to select one element, we give an index made of two integers: the row first, then the column
a[1,1] #note: indices start at 0 in python

4

In [37]:
#to get multiple elements into a one-dimensional array, we can pick them one by one and wrap them in a new array
np.array([a[0,0], a[1,1], a[2,1]])

array([1, 4, 6])

In [38]:
#another way of indexing: one list of row indices and one list of column indices, paired up position by position
print(a[[0,1,2], [0,1,1]])

[1 4 6]


In [39]:
#boolean indexing
#this is used to select arbitrary elements based on conditions
#the comparison itself gives a boolean array with the same shape as a
print(a>5)

[[False False]
 [False False]
 [False  True]]


In [40]:
#the boolean array a>5 is laid over the original array as a mask
print(a[a>5]) #only the elements where the mask is True are returned, as a one-dimensional array

[6]


In [42]:
#slicing
#a way to create a sub-array based on the original.
a = np.array([0,1,2,3,4,5])
#[:3] takes the elements from the start up to, but not including, index 3
print(a[:3])

[0 1 2]


In [43]:
#[2:4] takes the elements at index 2 and 3; the stop index 4 is excluded
print(a[2:4])

[2 3]


In [44]:
#for multi-dimensional arrays: a matrix with 3 rows and 4 columns to slice in the next cells
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
a

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [45]:
#a single slice applies to the rows: the first two rows with all of their columns
a[:2]

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [46]:
a[:2, 1:3] #first argument slices the rows, second the columns: rows 0-1, columns 1-2

array([[2, 3],
       [6, 7]])

In [47]:
#changing the sub-array element at position [0,0], which is 2, to 50
#the slice shares its data with the original array, so the change shows up in a as well
sub_array = a[:2, 1:3]
print("Sub array index [0,0] value before change: ", sub_array[0,0])
sub_array[0,0] = 50
print("Sub array index [0,0] value after change: ", sub_array[0,0])
#sub_array[0,0] corresponds to a[0,1] because the column slice starts at column 1
print("Original array index [0,1] value after change: ", a[0,1])

Sub array index [0,0] value before change:  2
Sub array index [0,0] value after change:  50
Original array index [0,0] value after change:  50


In [None]:
#Trying numpy with datasets
#we're going to use a popular dataset on wine quality. The dataset includes: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality
#to load it, we use genfromtxt
#it has a parameter called dtype to specify the data type; it is not passed here
#the file is separated by semicolons and its first line (the header) is skipped

wines = np.genfromtxt("datasets/winequality-red.csv", delimiter=";", skip_header=1)
wines

In [None]:
#we're going to use integer indexing to get a certain column or row.
#a single integer selects the first column of every row as a one-dimensional array
print("One integer 0 for slicing: ", wines[:, 0])
#the slice 0:1 selects the same values but keeps each one in its own row
print("0 to 1 for slicing: \n", wines[:, 0:1])

In [None]:
#to select a range of consecutive columns: all rows, columns 0, 1 and 2
wines[:, 0:3]

In [None]:
#for several non-consecutive columns, we pass the column indices as a list
wines[:, [0,2,4]]

In [None]:
#to find out the average quality of red wine, we select the quality column with -1 (because it's the last one) and take its mean
wines[:,-1].mean()

In [None]:
#another dataset based on admissions
#the file is comma separated, its header line is skipped and the column names are supplied by hand
#note: the following cells access these columns with underscores instead of spaces (e.g. 'GRE_Score', 'Chance_of_Admit')
graduate_admission = np.genfromtxt('datasets/Admission_Predict.csv', dtype=None, delimiter=',', skip_header=1,
                                  names=('Serial No', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
                                         'LOR', 'CGPA', 'Research', 'Chance of Admit'))
graduate_admission

In [None]:
#the resulting array is a one-dimensional array with 400 tuples (one per row of the file)
graduate_admission.shape

In [None]:
#we can retrieve a column using its name; [0:5] shows its first five values
graduate_admission['CGPA'][0:5]

In [None]:
#rescale the CGPA column to the USA scale of up to 4 by dividing by 10 and multiplying by 4
#NOTE(review): the column is overwritten in place, so running this cell again rescales the values a second time
graduate_admission['CGPA'] = graduate_admission['CGPA'] / 10 * 4
graduate_admission['CGPA'][0:20]

In [None]:
#using boolean masking for research experience: count the records whose Research field equals 1
len(graduate_admission[graduate_admission['Research'] == 1])

In [None]:
#to see if students with a high chance of admission on average have higher GRE scores than those with a lower chance
#first line: mean GRE score where the chance of admission is above 0.8
print(graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]['GRE_Score'].mean())
#second line: mean GRE score where the chance of admission is below 0.4
print(graduate_admission[graduate_admission['Chance_of_Admit'] < 0.4]['GRE_Score'].mean())

In [None]:
#the full records selected by the mask: chance of admission above 0.8
graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]

In [None]:
#same comparison but with the CGPA column: mean for a chance above 0.8, then mean for a chance below 0.4
print(graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]['CGPA'].mean())
print(graduate_admission[graduate_admission['Chance_of_Admit'] < 0.4]['CGPA'].mean())

In [48]:
#---Manipulating Text with Regular Expression---
import re

In [50]:
text = "This is a good day."

#re.search gives back a match object (truthy) when the pattern is found and None (falsy) otherwise
print("Nice" if re.search("good", text) else "Too bad")

Nice


In [51]:
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

#split cuts the text at every occurrence of the pattern and returns the pieces in between
re.split("Amy", text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [52]:
#findall returns every occurrence of the pattern as a list
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [53]:
#Anchors
#Anchors specify the start and/or the end of the string that you are trying to match: the caret (^) means start and $ means end

text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

re.search("^Amy", text) #it will return a re.Match object
#a re.Match object always has a boolean value of True; here it holds the matched text, Amy

<re.Match object; span=(0, 3), match='Amy'>

In [54]:
#patterns and character classes
#a string of grades to search; the first pattern finds every single B
grades="ACAAAABCBCBAA"
re.findall("B",grades)

['B', 'B', 'B']

In [55]:
#square brackets define a set of characters: every single A or B
re.findall("[AB]", grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']

In [56]:
#an A directly followed by one character from the range B to C
re.findall("[A][B-C]",grades)

['AC', 'AB']

In [57]:
#the same search written with the pipe operator: AB or AC
re.findall("AB|AC", grades)

['AC', 'AB']

In [58]:
re.findall("[^A]", grades) #a caret inside the brackets negates the set: every character that is not an A

['C', 'B', 'C', 'B', 'C', 'B']

In [59]:
#this returns an empty list, because the regex asks for a character at the beginning of the string
#that is not an A. Our string starts with an A, so no match is found
re.findall("^[^A]", grades)

[]

In [60]:
#Quantifiers
#The number of times a pattern has to be repeated in order to match.
#{2,10} asks for between 2 and 10 consecutive As
re.findall("A{2,10}",grades)

['AAAA', 'AA']

In [61]:
#exactly one A followed by exactly one A, i.e. pairs of consecutive As
re.findall("A{1,1}A{1,1}",grades)

['AA', 'AA', 'AA']

In [62]:
#the regex quantifier syntax does not allow you to deviate from the {m,n}
#pattern. In particular, if you put an extra space between the braces you'll get an empty result
re.findall("A{2, 2}",grades)

[]

In [63]:
#without a quantifier, the pattern simply spells out the two As
re.findall("AA", grades)

['AA', 'AA', 'AA']

In [64]:
#instead of {2,2}, we can use {2}: a single number means exactly that many repetitions
re.findall("A{2}",grades)

['AA', 'AA', 'AA']

In [65]:
#for a decreasing trend: one or more As, followed by one or more Bs, followed by one or more Cs
re.findall("A{1,10}B{1,10}C{1,10}",grades)

['AAAABC']

In [None]:
#a wikipedia file: read the whole text into one string
with open("datasets/ferpa.txt", "r") as file:
    wiki=file.read()
wiki

In [None]:
#raw string so the backslashes reach the regex engine unchanged; \[ and \] match literal brackets
re.findall(r"[a-zA-Z]{1,100}\[edit\]",wiki) #for a list of all headers: 1 to 100 letters followed by [edit]

In [None]:
#to improve this, we can use \w (raw string so the backslashes reach the regex engine unchanged)
re.findall(r"[\w]{1,100}\[edit\]",wiki) #\w is a special pattern for any letter or digit

In [None]:
#raw string so the backslashes reach the regex engine unchanged
re.findall(r"[\w]*\[edit\]",wiki) # * to match 0 or more times

In [None]:
#to improve it, we can allow spaces by adding the space character to the set
#(raw string so the backslashes reach the regex engine unchanged)
re.findall(r"[\w ]*\[edit\]",wiki)

In [None]:
#for every header found, split at the opening bracket and keep the part before it (the title)
#(raw strings so the backslashes reach the regex engine unchanged)
for title in re.findall(r"[\w ]*\[edit\]",wiki):
    print(re.split(r"[\[]",title)[0])

In [None]:
#Groups
#parentheses group parts of the pattern so that each part is returned separately:
#group 1 is the title (letters, digits and spaces), group 2 is the literal [edit]
#with groups in the pattern, findall returns one tuple per match
re.findall(r"([\w ]*)(\[edit\])",wiki)

In [None]:
#we can refer to groups by number
#finditer yields one match object per match; groups() returns all of its groups as a tuple
#(raw string so the backslashes reach the regex engine unchanged)
for item in re.finditer(r"([\w ]*)(\[edit\])",wiki):
    print(item.groups())

In [None]:
#group(1) returns only the first group, i.e. the title without [edit]
#(raw string so the backslashes reach the regex engine unchanged)
for item in re.finditer(r"([\w ]*)(\[edit\])",wiki):
    print(item.group(1))

In [None]:
#we can also label groups
#(?P<name>) ?P indicates that this is an extension to basic regexes
#<name> is the dictionary key we want to use
#(raw string so the backslashes reach the regex engine unchanged)
for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit_link>\[edit\])",wiki):
    print(item.groupdict()['title'])

In [None]:
#item is left over from the loop in the preceding cell, so this shows the dictionary of the last match it produced
print(item.groupdict())

In [None]:
#look-ahead and look-behind
#the pattern being given to the regex engine is for text either before or after the text we are trying to isolate
#(?=...) is a look-ahead: [edit] must follow the title but is not part of the match
#(raw string so the backslashes reach the regex engine unchanged)
for item in re.finditer(r"(?P<title>[\w ]+)(?=\[edit\])", wiki): #one named group followed by a look-ahead
    print(item)

In [None]:
#an example with wikipedia data
#data on universities in the us which are buddhist-based
#note: this replaces the text previously stored in wiki
with open("datasets/buddhist.txt","r") as file:
    wiki=file.read()
wiki

In [None]:
#verbose mode of python regexes: whitespace and #comments inside the pattern are ignored,
#so every literal space has to be escaped with a backslash
#the pattern is a raw string so the backslashes reach the regex engine unchanged
#NOTE(review): the separator is written as a plain hyphen - confirm it matches the dash used in datasets/buddhist.txt
pattern=r"""
(?P<title>.*)      #the university title
(-\ located\ in\ ) #an indicator of the location
(?P<city>\w*)      #city the university is in
(,\ )              #separator for the state
(?P<state>\w*)     #the state the city is located in"""

#print the named groups (title, city, state) of every match
for item in re.finditer(pattern,wiki,re.VERBOSE):
    print(item.groupdict())

In [None]:
#an example on new york times health tweets, based on hashtags
#read the whole file into one string
with open("datasets/nytimeshealth.txt","r") as file:
    health=file.read()
health

In [None]:
#a # followed by any number of letters or digits; the look-ahead at the end requires whitespace right after the hashtag
#(raw string so the backslashes reach the regex engine unchanged)
pattern = r'#[\w\d]*(?=\s)'

#searching and displaying all of the hashtags
re.findall(pattern, health)

In [1]:
#---Regex Practice Session---
#previous practice examples
#e-mail verifier
import re
#[a-zA-Z0-9]+  : one or more uppercase letters, lowercase letters or digits
#@[a-zA-Z]+    : an @ followed by one or more letters
#\.            : a literal dot
#(com|edu|net) : one of the accepted endings
#the pattern is a raw string so the backslash reaches the regex engine unchanged
pattern = r"[a-zA-Z0-9]+@[a-zA-Z]+\.(com|edu|net)"
user_input = input()
#fullmatch requires the whole input to be an address; search would also accept
#any input that merely contains one somewhere, e.g. "!!pablo@hotmail.com!!"
if re.fullmatch(pattern, user_input):
    print("valid email")
else:
    print("invalid email")

pablo@hotmail.com
valid email


In [2]:
#replacing a specific part of a string: removing the hyphens from a phone number
import re
#each set of () defines a group; raw string so the backslashes reach the regex engine unchanged
pattern = r"(\d\d\d)-(\d\d\d)-(\d\d\d\d)"
new_pattern = r"\1\2\3" #groups 1, 2 and 3 written back to back; the r makes it a raw string so \1 is not read as an escape
user_input = input()
new_user_input = re.sub(pattern, new_pattern, user_input) #keeps the three digit groups and drops the hyphens between them
print(new_user_input)

145-879-8653
1458798653


In [None]:
#practice answers stored as raw strings so that the cell is valid Python

#to treat abc as a group without capturing it
non_capturing_group = r"(?:abc)"

#for the following: Z-ADMIN-Username: k.szdw Z-Wolverine-Username: x
username_pattern = r"^Z-.*: [a-z.]+"

#for the following: MI 12345-6789 MI 12345(6789)
zip_pattern = r"\w{2} \d{5}(\(\d{4}\)|-\d{4})"



In [None]:
#What is the correct regular expression to match a URL with letters,
#numbers, underscores and dots? A valid URL defined in this problem must meet the following requirements:

#The URL consists of two or more strings made of letters, numbers, and underscores.
#A dot is used in between the strings.
#No two dots are allowed to appear consecutively.
#For example, your regex should match URLs like: www.aBC.com, abc.com, ab_c.de8f.com
#But your regex should not match: abc, abc..com

#answer, stored as a raw string so that the cell is valid Python:
#one or more "string plus dot" parts, followed by a final string
url_pattern = r"(\w+\.)+\w+"

In [None]:
#What is the correct regular expression to match an ISBN number from two publishers
#(World Scientific from Singapore, and Sigma Publications from Greece)?
#A valid ISBN code defined in this problem must meet the following requirements:

#The ISBN number consists of 10 digits, with dashes(-) in between.
#The ISBN number must match the patterns of one of the following publishers(x means a digit from 0 to 9): for World Scientific, the pattern should be xxxx-x-xxxx-x, and for Sigma Publications, the pattern should be xxx-xxx-xxx-x.
#For example, your regex should match ISBNs like: 9971-5-0210-0, 960-425-059-0

#answer, stored as a raw string so that the cell is valid Python:
#the World Scientific layout, or (|) the Sigma Publications layout
isbn_pattern = r"\d{4}-\d-\d{4}-\d|\d{3}-\d{3}-\d{3}-\d"

In [None]:
#What is the correct regular expression to match a DOI registered by Crossref? A valid DOI(e.g. doi:10.1038/nphys1170) defined in this problem must meet the following requirements:

#The DOI starts with doi:
#The link has two parts divided by a "/". In the first part, there can only be numbers and dots, and in the second part, there can be any characters. There should be at least one character in each part.
#For example, your regex should match DOIs like: doi:10.1038/nphys1170, doi:10.1002/0470841559.ch1

#answer, stored as a raw string so that the cell is valid Python:
#the doi: prefix, one or more digits/dots, a slash, then one or more characters of any kind
doi_pattern = r"doi:[\d.]+/.+"