# Cleaning data with Regex and .apply() method
## Assignment 07
### Name: Matt Briskey

### Data Cleaning with RegEx

In [1]:
# load the data
import pandas as pd
import numpy as np

#Load data into the Address data frame
df = pd.read_csv('https://raw.githubusercontent.com/MattBriskey/data612_data_mining/main/addresses.csv', low_memory=False, encoding="ISO-8859-1")
df.head(10) #Shows the first 10 rows

Unnamed: 0,Address
0,"250 Rt 59\nAirmont, NY 10901"
1,"141 Washington Ave Extension\nAlbany, NY 12205"
2,"13858 Rt 31 W\nAlbion, NY 14411"
3,"2055 Niagara Falls Blvd\nAmherst, NY 14228"
4,"101 Sanford Farm Shpg Center\nAmsterdam, NY 12010"
5,"297 Grant Avenue\nAuburn, NY 13021"
6,"4133 Veterans Memorial Drive\nBatavia, NY 14020"
7,"6265 Brockport Spencerport Rd\nBrockport, NY 1..."
8,"5399 W Genesse St\nCamillus, NY 13031"
9,"3191 County rd 10\nCanandaigua, NY 14424"


In [2]:
df.dtypes #Check the type of the data

Address    object
dtype: object

In [3]:
df.shape #Check the shape of the data

(192, 1)

In [4]:
# The above data is all in one cell.  It would be optimal to have the different address parts in separate cells.
# We can use Regex to clean the data and achieve this
# Used https://regex101.com/ and saved the below grouping here: https://regex101.com/r/xy9sG4/1


df['Street'] = df['Address'].str.extract('([\w ]+)', expand=True) #Extract the street from address
df['City'] = df['Address'].str.extract('\n([\w ]+)', expand=True)  #Extract the city from address
df['State'] = df['Address'].str.extract(', ([a-zA-Z]{2})', expand=True) #Extract the state from address
df['Zip'] = df['Address'].str.extract('(\d{5})?$', expand=True) #Extract the zip from address

df.head(5) #Show the first 5 rows

Unnamed: 0,Address,Street,City,State,Zip
0,"250 Rt 59\nAirmont, NY 10901",250 Rt 59,Airmont,NY,10901
1,"141 Washington Ave Extension\nAlbany, NY 12205",141 Washington Ave Extension,Albany,NY,12205
2,"13858 Rt 31 W\nAlbion, NY 14411",13858 Rt 31 W,Albion,NY,14411
3,"2055 Niagara Falls Blvd\nAmherst, NY 14228",2055 Niagara Falls Blvd,Amherst,NY,14228
4,"101 Sanford Farm Shpg Center\nAmsterdam, NY 12010",101 Sanford Farm Shpg Center,Amsterdam,NY,12010


### .apply() method

In [5]:
import pandas as pd

#Load Udemy Courses Data into the data frame
df2 = pd.read_csv('https://raw.githubusercontent.com/MattBriskey/data612_data_mining/main/Udemy%20Courses%20Data%202023%20-%20Sample.csv', low_memory=False, encoding="ISO-8859-1")
df2.head(10) #Shows the first 10 rows

Unnamed: 0,id,title,url,rating,num_reviews,num_published_lectures,duration
0,567828,The Complete Python Bootcamp From Zero to Hero...,/course/complete-python-bootcamp/,4.59,452973,155,22.0
1,1565838,The Complete 2023 Web Development Bootcamp,/course/the-complete-web-development-bootcamp/,4.67,263152,490,65.5
2,625204,The Web Developer Bootcamp 2023,/course/the-web-developer-bootcamp/,4.7,254711,616,64.0
3,756150,Angular - The Complete Guide (2023 Edition),/course/the-complete-guide-to-angular-2/,4.59,180257,472,34.5
4,2776760,100 Days of Code: The Complete Python Pro Boot...,/course/100-days-of-code/,4.7,177568,676,64.0
5,533682,Java Programming Masterclass updated to Java 17,/course/java-the-complete-java-developer-course/,4.55,177184,544,103.5
6,1362070,"React - The Complete Guide (incl Hooks, React ...",/course/react-the-complete-guide-incl-redux/,4.61,176452,512,48.5
7,851712,The Complete JavaScript Course 2023: From Zero...,/course/the-complete-javascript-course/,4.73,167670,320,69.0
8,950390,Machine Learning A-Zâ¢: Python & R in Data Sc...,/course/machinelearning/,4.53,166138,382,42.5
9,903744,Python for Data Science and Machine Learning B...,/course/python-for-data-science-and-machine-le...,4.62,126984,165,25.0


In [6]:
for col in df2.columns: #print the names of the columns
    print(col)

id
title
url
 rating 
num_reviews
num_published_lectures
duration


In [7]:
df2.rename(columns = {' rating ':'rating'}, inplace = True) #rename rating column so that it doesn't have leading or trailing spaces

In [8]:
#Create a function that returns the mean, sum, mode, median, and range

import numpy as np
from scipy import stats

def apply_function(col):
    print("Mean: ", round(np.mean(col),2))  #Display the mean rounded to 2 decimals
    print("Sum: ", round(np.sum(col),2))    #Display the sum rounded to 2 decimals
    print("Mode: ", stats.mode(col, keepdims = False))        #Display the mode
    print("Median: ", round(np.median(col),2))  #Display the median rounded to 2 decimals
    print("Range: ", round(np.max(col) - np.min(col),2))  #Display the range rounded to 2 decimals
df2[['rating']].apply(apply_function,axis=0)  #passes the rating column to the apply_function to 

Mean:  4.63
Sum:  129.76
Mode:  ModeResult(mode=4.59, count=4)
Median:  4.63
Range:  0.29


rating    None
dtype: object