# Setup and import libraries

In [143]:
# Setup code and import libraries
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re as re
import matplotlib.pyplot as plt
#* from matplotlib import pyplot as plt

# Seed NumPy's global random number generator so that subsequent random draws
# produce the same sequence on every run.
np.random.seed(12345)

# Default figure size (width, height) for Matplotlib plots.
plt.rc('figure', figsize=(10, 6))

# precision=4   : print floating-point numbers with 4 decimal places.
# suppress=True : print small floating-point values in fixed-point notation
#                 instead of scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Show at most 6 rows when a pandas DataFrame or Series is displayed.
pd.options.display.max_rows = 6

# Notes on the imports at the top of this cell
# --------------------------------------------
# from bs4 import BeautifulSoup
#     Imports the BeautifulSoup class from the bs4 module, which is part of the
#     Beautiful Soup library, used for web scraping and for parsing HTML and
#     XML documents.
# import re
#     The regular expression module, used for pattern matching and string
#     manipulation.
#
# These notes are `#` comments rather than bare triple-quoted strings: a bare
# string on the last line of a cell is an expression, so the notebook would
# echo it back as the cell's output.

In [10]:
import requests # The requests library is commonly used for making HTTP requests in Python
import io

# Make a GET request for the raw CSV. The timeout (in seconds) stops the cell
# from hanging indefinitely if the server never answers.
r = requests.get('https://raw.githubusercontent.com/tidyverse/tidyr/master/data-raw/relig_income.csv',
                 timeout=30)

# Raise immediately on a 4xx/5xx answer instead of silently handing an error
# page to the CSV parser below.
r.raise_for_status()

print(r)

# r.text is the text content of the response; here it is expected to be CSV.
# io.StringIO() wraps that text in a file-like object, so pd.read_csv() can
# parse it directly without first saving it to a physical file.
snippet = pd.read_csv(filepath_or_buffer=io.StringIO(r.text))

snippet


<Response [200]>


Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
...,...,...,...,...,...,...,...,...,...,...,...
15,Other Faiths,20,33,40,46,49,63,46,40,41,71
16,Other World Religions,5,2,3,4,2,7,3,4,4,8
17,Unaffiliated,217,299,374,365,341,528,407,321,258,597


# pd.melt()

In [35]:
# Reshape from wide to long: 'religion' stays as the identifier column, every
# other column name goes into 'income' and its cell value into 'count'.
pd.melt(
    frame=snippet,
    id_vars=['religion'],
    var_name='income',
    value_name='count',
)

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
...,...,...,...
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8
179,Unaffiliated,Don't know/refused,597


# Regular expressions

| Special Characters | Explanation | Example of matches |
| --- | --- | --- |
| `.` | This character will match against anything. It is essentially a wild card. By default, it does not match the newline character. | The regex `".a"` will match the following: `"aa"`, `"Aa"`, `"ba"`, `"Ba"`, `"ca"`, `"da"`, etc. It will also match strings like `"aaa"`, `"Aat"`, `"taa"`, because the regex is matched against *substrings*. |
|`^` | Matches the start of the string. | The regex `"^s.a"` will match the following. `"saa"`, `"sAa"`, `"sba"`, `"sBa"`, `"sca"`, `"sda"`, etc. But, it will not match `"asda"` because the string starts with `"a"`. |
| `$` | Matches the end of the string or just before the newline at the end of the string. | The regex `"a$"` will match the following: `"aa"`, `"Aa"`, `"ba"`, `"Ba"`, `"ca"`, `"da"`, etc., because each of them ends with `"a"`. |
| `?` | Matches 0 or 1 occurrences of the preceding character or group. | The regex `"columns?"` will match both `"column"` and its plural, `"columns"`. |
| `*` | Matches 0 or more occurrences of the preceding character or group. | The regex `"11*"` will match `"1"` or any consecutive sequence of ones. |
| `\` | Escapes special characters, allowing them to be used for matching. | `"."` matches any character, but `"\."` matches any string containing a period. |
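| &#124; | Alternation: matches either the expression on its left or the expression on its right. | `"^(a&#124;b)$"` only matches the strings `"a"` and `"b"`. Without the parentheses, `"^a&#124;b$"` matches any string that starts with `"a"` or ends with `"b"`. |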
|`(...)`| Groups the enclosed pattern so that it is treated as a single unit. | The regex `"(1&#124;0)*"` matches any (possibly empty) run of consecutive binary digits, because `*` repeats the whole group. |
| `[...]` | Allows matching only on characters specified.  | `"[01]*"` matches all binary strings. The repetition (`*`) doesn't reapply on a fixed matched string but on the pattern. |

There are more operators present in the `re` package (Kuchling, 2018), but the above operations are present in almost all packages, independent of implementation and programming language.

## Example 1: re.split()

In [72]:
# Both patterns below are written as ordinary (non-raw) string literals.
# "\s" is not a recognised Python string escape, which is what triggers the
# warning echoed in this cell's output.
wsRegex = re.compile("\s+") # "+" means one or more repetitions, so this is equivalent to "\s\s*"

wsRegex = re.compile("\s\s*")
# In regex syntax, \s is one whitespace character and \s* is zero or more of
# them, so \s\s* also means "one or more whitespace characters".
# This second assignment replaces the object created by the first one.

wsRegex

  wsRegex = re.compile("\s+") # the plus just means one or more matches. same as '"\s\s*"'
  wsRegex = re.compile("\s\s*")


re.compile(r'\s\s*', re.UNICODE)

''' \s is a special sequence in regular expressions, not in Python string literals. It represents a whitespace character (such as spaces, tabs, and newlines). To avoid this warning, you can either use a raw string literal by prefixing the string with r, which tells Python not to interpret backslashes as escape characters, or you can double the backslashes to escape them properly within the regular expression pattern.'''

'''re.compile("\s+") returns a regular expression object compiled from the pattern "\s+", which matches one or more whitespace characters. This compiled regular expression object can then be used to perform various operations such as searching, matching, and replacing text based on the defined pattern.'''

In [73]:
# Two equivalent, warning-free spellings of the same pattern:
#   r"\s+"  - raw string literal: backslashes are kept as literal characters
#   "\\s+"  - ordinary literal: "\\" stands for a single backslash
# The second assignment simply replaces the first with an identical pattern.
wsRegex = re.compile(pattern=r"\s+")
wsRegex = re.compile(pattern="\\s+")

wsRegex

re.compile(r'\s+', re.UNICODE)

In [89]:
x = "   Hello, World, How are you, today?   "

# Four ways of splitting the same text, printed one result per line.
#
# re.split(pattern, x) has to obtain a compiled pattern on every call, whereas
# the object returned by re.compile() is compiled once and can be reused.
# When the same expression is applied to many strings, prefer the compiled
# object.
print(
    wsRegex.split(x),      # compiled regex: separates on substring matches (whitespace runs)
    re.split('\\s+', x),   # module-level equivalent of the line above
    x.split(),             # str.split() with no separator: empty strings are discarded
    x.split(' '),          # str.split(' '): separates on every single space character
    sep="\n",
)


['', 'Hello,', 'World,', 'How', 'are', 'you,', 'today?', '']
['', 'Hello,', 'World,', 'How', 'are', 'you,', 'today?', '']
['Hello,', 'World,', 'How', 'are', 'you,', 'today?']
['', '', '', 'Hello,', 'World,', 'How', 'are', 'you,', 'today?', '', '', '']


## Example 2: re.findall()

In [142]:
x = "IoOo00oOoO0OioIolOoOoiolOO1OOoloo|Ooo0IiooO|1"

# rex1  'o'      a single lowercase 'o'
# rex2  'o+'     a run of one or more consecutive lowercase 'o'
# rex3  '[oO]'   a single character that is either 'o' or 'O'
# rex4  '[oO]+'  a run of one or more consecutive characters, each 'o' or 'O'
rex1, rex2, rex3, rex4 = (
    re.compile(pattern) for pattern in ('o', 'o+', '[oO]', '[oO]+')
)

# findall() returns a list of all non-overlapping matches in the string.
# One list is printed per pattern, in the order rex1 .. rex4.
print(
    rex1.findall(x),
    rex2.findall(x),
    rex3.findall(x),
    rex4.findall(x),
    sep="\n",
)

['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']
['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'oo', 'oo', 'oo']
['o', 'O', 'o', 'o', 'O', 'o', 'O', 'O', 'o', 'o', 'O', 'o', 'O', 'o', 'o', 'O', 'O', 'O', 'O', 'o', 'o', 'o', 'O', 'o', 'o', 'o', 'o', 'O']
['oOo', 'oOoO', 'O', 'o', 'o', 'OoOo', 'o', 'OO', 'OOo', 'oo', 'Ooo', 'ooO']


## Example 3: The span() method

The span() method of a match object (from the re module) returns a tuple containing the start and end positions of the match. It is typically called on the result of match() or search(), or on each match produced by finditer(). Note that findall() returns plain strings, which have no span() method.

In [141]:
rex4 = re.compile('[oO]+')

# search() returns a match object for the first match only; the match object
# carries the start and end index of that match.
z = rex4.search(x)

# First line: the match object itself. Second line: its (start, end) tuple.
print(z, z.span(), sep="\n")

<re.Match object; span=(1, 4), match='oOo'>
(1, 4)


## Example 4: re.sub()

In [146]:
x = "IoOo000OioIIoliol1l1|oo0Ii|1"

rex4 = re.compile('[oO]+')

# sub(repl, string) replaces every occurrence of the pattern in `string`
# with `repl` (a string here; a function is also accepted).
rex4.sub(' , ', x)

'I , 000 , i , II , li , l1l1| , 0Ii|1'

# Missing Data

In [None]:
# Read the file with explicit column names and 'message' as the index.
# No header/skiprows/na_values handling is requested here, so the file is
# loaded essentially as-is.
dirtyDF = pd.read_csv(
    'examples/ex4.csv',
    index_col='message',
    names=['message', 'a', 'b', 'c', 'd'],
)

dirtyDF



In [None]:
# Read the same file again, this time telling the parser how to clean it up.
cleanedDF = pd.read_csv(filepath_or_buffer = 'examples/ex4.csv',
                  engine='python', # skipfooter (below) is not supported by the default C engine
                  sep=',',
                  header=0, # line containing the headers (numbering starts from 0);
                            # passed explicitly so that `names` replaces the existing header
                  names=['message', 'a', 'b', 'c', 'd'], # the names we want to use to index columns
                  index_col=['message'],
                  skiprows=[0,2,3,6, 7],  # 0-indexed line numbers to skip; presumably comment or
                                          # metadata lines in ex4.csv - TODO confirm against the file.
                                          # Also useful when columns and data are separated by
                                          # whitespace or metadata, such as line or page breaks.
                  skipfooter=3, # number of lines to ignore at the bottom of the file
                  # Per column: which values are considered NA or NaN by pandas.
                  # (This note must be a comment: a string literal placed after the
                  # dict inside the call is a syntax error.)
                  na_values={'a': ['NaN'],
                             'b': ['NaN'],
                             'c': ['NaN'],
                             'd': ['NaN'],
                             'message': ['NA']}
                 )

In [None]:
# The three calls below each return a new object; their results are not
# stored, so cleanedDF itself is left unchanged by them.
cleanedDF.dropna()   # Remove missing values.
cleanedDF.fillna(0)  # Fill NA/NaN values. fillna() needs an explicit fill value
                     # (calling it with no arguments raises ValueError); 0 is
                     # only an illustrative choice here.
cleanedDF.notnull()  # Detect existing (non-missing) values. Returns a boolean
                     # same-sized object indicating if the values are not NA.

# Fill the gaps in column 'c' with that column's mean and store the result in a
# new column 'C'; the original column 'c' is kept as it was.
cleanedDF['C'] = cleanedDF['c'].fillna(cleanedDF['c'].mean())

In [None]:
df6 = pd.read_excel("examples/random.numbers.xlsx", index_col=[0])

# keys=['f', 's'] assigns one label to each input frame. With axis=0 the frames
# are stacked on top of each other, so the keys become the outer level of a
# multi-level *row* index (they would label the columns only with axis=1).
# The call is the cell's last expression so that its result is displayed.
pd.concat([df6, df6], axis=0, keys=['f', 's'])

## Imputation

Imputation is the act of filling missing values with substituted values. However, it usually refers to filling missing data with representative values.

In [None]:
# randomly ordering
x = df6.sample(frac=1, replace=False)
y = df6.sample(frac=1, replace=False)

j1 = x.merge(y, left_index=True, right_index=True, suffixes=('_x', '_y'), sort=True)

'''
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). 

suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in left and right respectively.   

    sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword).  
'''

# Duplication


In [None]:
# Small two-column frame with many repeated (k1, k2) pairs, used to
# demonstrate duplicate detection: 28 rows, 'k1' alternating 'one'/'two'.
data = pd.DataFrame(dict(
    k1=['one', 'two'] * 14,
    k2=[
        0, 1, 0, 1, 1, 1, 0,
        3, 1, 1, 0, 1, 3, 2,
        1, 1, 0, 1, 1, 1, 3,
        1, 2, 2, 1, 1, 0, 4,
    ],
))
data

In [None]:
# Boolean Series marking rows that fully repeat an earlier row; the first
# occurrence of each row is not marked (keep='first' is the default).
data.duplicated(keep='first')


In [None]:
# Return a copy without fully duplicated rows, keeping the first occurrence of
# each (keep='first' is the default).
data.drop_duplicates(keep='first')


In [None]:
# Only column 'k2' is considered when identifying duplicates.
data.drop_duplicates(subset=['k2'])


# Functional mappings

In [159]:
# Foods (with deliberately inconsistent capitalisation) and their weight in ounces.
data = pd.DataFrame(dict(
    food=[
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
...,...,...
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [161]:
# Lookup table: food name (all lowercase) -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Series.map with a function: applied to each value, here ounces -> grams.
data['grams'] = data['ounces'].map(lambda oz: oz * 28.3495)

# Series.map with a dictionary: each food is replaced by the matching animal.
# The food names are lowercased first because every key in meat_to_animal is
# lowercase, while the column also contains 'Pastrami' and 'Bacon'.
data['animal'] = (
    data['food']
    .str.lower()
    .map(meat_to_animal)
)

data

Unnamed: 0,food,ounces,grams,animal
0,bacon,4.0,113.3980,pig
1,pulled pork,3.0,85.0485,pig
2,bacon,12.0,340.1940,pig
...,...,...,...,...
6,pastrami,3.0,85.0485,cow
7,honey ham,5.0,141.7475,pig
8,nova lox,6.0,170.0970,salmon


In [162]:
# The mapped Series on its own (ounces converted to grams), without storing it.
data['ounces'].map(lambda oz: oz * 28.3495)

0    113.3980
1     85.0485
2    340.1940
       ...   
6     85.0485
7    141.7475
8    170.0970
Name: ounces, Length: 9, dtype: float64

# fillna( )

In [164]:
# Note: the path is relative to the notebook's working directory. The recorded
# run of this cell ended in FileNotFoundError because the file was not found there.
df4 = pd.read_csv(
    'examples/ex1.csv',
    sep=',',
    header=0,                                 # line 0 holds the header (numbering starts from 0)
    names=['a', 'b', 'c', 'd', 'message'],    # the names we want to use for the columns
    index_col='message',                      # column used as the row index
    skiprows=0,                               # skip nothing at the top of the file; useful when
                                              # metadata or page breaks precede the data
    nrows=3,                                  # read only the first 3 rows of data
    # Per column: values that should be treated as not available.
    na_values={'message': ['foo', 'NA'], 'a': ['1'], 'b': ['1'], 'c': ['1'], 'd': ['1']},
)

FileNotFoundError: [Errno 2] No such file or directory: 'examples/ex1.csv'