In [1]:
# import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
titanic = pd.read_csv('Datasets/titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S


In [4]:
titanic.Name

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

## the `str` accessor in pandas
Similar to datetime objects in the time series tutorial having a `dt` accessor, a number of specialized string methods are available when using the `str` accessor. These methods have in general matching names with the equivalent built-in string methods for single elements, but are applied element-wise on each of the values of the columns.

The `str` accessor in pandas provides a wide range of string methods that allow for efficient and convenient text processing on an entire Series of strings. Here are some commonly used str methods:

* String splitting: `str.split()`
* String joining: `str.join()`
* Substrings: `str.slice(start, stop)`, `str[0]`
* String Case Conversion: `str.lower()`, `str.upper()`, `str.capitalize()`
* Whitespace Removal: `str.strip()`, `str.lstrip()`, `str.rstrip()`
* Replacing and Removing: `str.replace('old', 'new')`
* Pattern matching and extraction: `str.contains('pattern')`, `startswith('prefix')`, `endswith('suffix')`
* String length and counting: `str.len()`, `str.count()`

In [60]:
titanic["Name"]

0                                  Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Thayer)
2                                   Heikkinen, Miss. Laina
3             Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                                 Allen, Mr. William Henry
                              ...                         
886                                  Montvila, Rev. Juozas
887                           Graham, Miss. Margaret Edith
888               Johnston, Miss. Catherine Helen "Carrie"
889                                  Behr, Mr. Karl Howell
890                                    Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [61]:
titanic["Name"].str.len()

0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

Let's see which of the passenger has the longest name?

In [62]:
titanic["Name"].str.len().idxmax()

307

In [63]:
titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]

'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'

In [64]:
# get the observations that contains the word 'Mrs'
titanic[titanic.Name.str.contains('Mrs.')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,,C
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",55.0,0,0,248706,16.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",47.0,1,1,11751,52.5542,D35,S
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",28.0,1,0,P/PP 3381,24.0000,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",25.0,0,1,230433,26.0000,,S


In [65]:
# get the observations that contains the word 'Mrs'
titanic.Name.str.count('Mrs.').sum()


np.int64(129)

In [67]:
titanic.Name.str.split(',', expand=True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [14]:
titanic.Name.str.split(',', expand=True).get(1)

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: 1, Length: 891, dtype: object

Create a new column `Title` that contains the title of the passengers

In [15]:
# from the name, extract the title
titanic['Title'] = titanic.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
titanic['Title']

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Title, Length: 891, dtype: object

In [16]:
# get the unique titles
titanic.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

#### let's create a mapping dictionary

In [None]:
title_sex_mapping = {
    'Mr': 'Male',
    'Mrs': 'Female',
    'Miss': 'Female',
    'Master': 'Male',
    'Don': 'Male',
    'Rev': 'Male',
    'Dr': 'Male',  # Assumed to be Male unless you have additional context
    'Mme': 'Female',
    'Ms': 'Female',
    'Major': 'Male',
    'Lady': 'Female',
    'Sir': 'Male',
    'Mlle': 'Female',
    'Col': 'Male',
    'Capt': 'Male',
    'the Countess': 'Female',
    'Jonkheer': 'Male'
}

In [18]:
titanic['Sex'] = titanic['Title'].map(title_sex_mapping)

In [19]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Sex
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,Mr,Male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Female
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,Mrs,Female
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,Mr,Male


In [25]:
titanic.Sex.value_counts()

Sex
Male      578
Female    313
Name: count, dtype: int64

In [26]:
# cross tabulation
pd.crosstab(titanic.Survived, titanic.Sex)

Sex,Female,Male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,232,110


In the Titanic disaster, more women survived than men due to the social norms and evacuation protocols followed during the sinking. The principle of "women and children first" was enforced when lifeboats were being loaded. Since there were not enough lifeboats for everyone on board, priority was given to women and children, which contributed to the higher survival rate among females compared to males.

## the `re` module in Python is used for regular expression

The `re` module in Python is used for **regular expressions**, which are powerful tools for text analysis and manipulation. Regular expressions allow you to search, match, and manipulate strings based on specific patterns.

Common Use Cases of `re` in String Text Analysis

* Finding patterns in text: `re.search(r'\d{4}-\d{2}-\d{2}', text)`   # searches for a date in the format YYYY-MM-DD
* Extracting Specific parts of a string: `re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)`  #extracts email addresses from the text.
* Replacing Parts of a String: `re.sub(r'\$\d+', '[price]', text)`    #replacing any price formatted as $<number> with [price].

### Commonly Used Patterns in `re`

- **`\d`**: Matches any digit (0-9).
- **`\w`**: Matches any alphanumeric character (letters and numbers).
- **`\s`**: Matches any whitespace character (spaces, tabs, newlines).
- **`[a-z]`**: Matches any lowercase letter from `a` to `z`.
- **`[A-Z]`**: Matches any uppercase letter from `A` to `Z`.
- **`*`**: Matches 0 or more occurrences of the preceding character.
- **`+`**: Matches 1 or more occurrences of the preceding character.
- **`?`**: Matches 0 or 1 occurrence of the preceding character.
- **`^`**: Matches the beginning of a string.
- **`$`**: Matches the end of a string.
- **`|`**: Acts as an OR operator.


### Chicago Bulls Seasons — Dataset 

**Source:** Wikipedia — *List of Chicago Bulls seasons*

**Coverage:** 1966–67 (inaugural season) through the most recent season listed (e.g., 2024–25).

**Unit of observation:** One row per Bulls season.


In [17]:
# read the CSV file
ChicagoBulls = pd.read_csv('Datasets/ChicagoBulls.csv')
ChicagoBulls.head()

Unnamed: 0,Season,Team,Conference,Conference Finish,Division,Division Finish,Wins,Losses,Win%,GB,Playoffs,Awards,Head coach
0,1966–67,1966–67,—,—,Western,4th,33,48,0.407,11,Lost Division semifinals (Hawks) 3–0[19],Johnny Kerr (COY)[6],Johnny Kerr
1,1967–68,1967–68,—,—,Western,4th,29,53,0.354,27,Lost Division semifinals (Lakers) 4–1[20],—,Johnny Kerr
2,1968–69,1968–69,—,—,Western,5th,33,49,0.402,22,,—,Dick Motta
3,1969–70,1969–70,—,—,Western,3rd[c],39,43,0.476,9,Lost Division semifinals (Hawks) 4–1[22],—,Dick Motta
4,1970–71,1970–71,Western,3rd,Midwest[d],2nd,51,31,0.622,2,Lost conference semifinals (Lakers) 4–3[23],Dick Motta (COY)[6],Dick Motta


####  Primary columns
- **Season** — Season label (e.g., `1995–96`).
- **Team** — Franchise name (Chicago Bulls).
- **Conference** — Western/Eastern (historical realignments reflected).
- **Division** — Division membership for that season (e.g., Midwest, Central).
- **Finish** — Standing within conference and/or division.
- **Wins**, **Losses**, **Win%** — Regular-season record and win percentage.
- **GB** — “Games Behind” the division leader.
- **Playoffs** — Postseason result (round and opponent; champions/conference/division titles noted).
- **Awards** — Season awards using abbreviations.
- **Head coach** — Primary head coach for the season.

First, note that the square-bracketed bits (e.g., `[1]`, `[a]`, `[note 2]`, `[citation needed]`) are Wikipedia footnote/citation markers that often appear in scraped tables. We’ll remove any bracketed content—including the brackets themselves—from the **Division**, **Division Finish**, **Playoffs**, and **Awards** columns.

In [None]:
import re
def remove_brackets(x):
    return re.sub(r'\[.*?\]', '', x)

# Apply the function to each column separately using map
ChicagoBulls['Division'] = ChicagoBulls['Division'].map(remove_brackets)
ChicagoBulls['Finish'] = ChicagoBulls['Finish'].map(remove_brackets)
ChicagoBulls['Finish.1'] = ChicagoBulls['Finish.1'].map(remove_brackets)

ChicagoBulls.head()

Unnamed: 0,Season,Team,Conference,Finish,Division,Finish.1,Wins,Losses,Win%,GB,Playoffs,Awards,Head coach
0,1966–67,1966–67,—,—,Western,4th,33,48,0.407,11,Lost Division semifinals (Hawks) 3–0[19],Johnny Kerr (COY)[6],Johnny Kerr
1,1967–68,1967–68,—,—,Western,4th,29,53,0.354,27,Lost Division semifinals (Lakers) 4–1[20],—,Johnny Kerr
2,1968–69,1968–69,—,—,Western,5th,33,49,0.402,22,,—,Dick Motta
3,1969–70,1969–70,—,—,Western,3rd,39,43,0.476,9,Lost Division semifinals (Hawks) 4–1[22],—,Dick Motta
4,1970–71,1970–71,Western,3rd,Midwest,2nd,51,31,0.622,2,Lost conference semifinals (Lakers) 4–3[23],Dick Motta (COY)[6],Dick Motta


### Practice exercise 2 of Chapter 12

In [None]:
# Get GDP data with proper headers to avoid 403 Forbidden error
gdp_url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita"
gdp_response = requests.get(gdp_url, headers=headers)
gdp_response.raise_for_status()

gdp_data = pd.read_html(StringIO(gdp_response.text))[1]
gdp_data.head()

Unnamed: 0_level_0,Country/Territory,IMF[4][5],IMF[4][5],World Bank[6],World Bank[6],United Nations[7],United Nations[7]
Unnamed: 0_level_1,Country/Territory,Estimate,Year,Estimate,Year,Estimate,Year
0,Monaco,—,—,240862,2022,240535,2022
1,Liechtenstein,—,—,187267,2022,197268,2022
2,Luxembourg,135321,2024,128259,2023,125897,2022
3,Bermuda,—,—,123091,2022,117568,2022
4,Switzerland,106098,2024,99995,2023,93636,2022


In [29]:
# drop the year column
gdp_data.drop(columns=["Year"], level=1, inplace=True, axis=1)
gdp_data.head()

Unnamed: 0_level_0,Country/Territory,IMF[4][5],World Bank[6],United Nations[7]
Unnamed: 0_level_1,Country/Territory,Estimate,Estimate,Estimate
0,Monaco,—,240862,240535
1,Liechtenstein,—,187267,197268
2,Luxembourg,135321,128259,125897
3,Bermuda,—,123091,117568
4,Switzerland,106098,99995,93636


In [30]:
# drop the level 1 column
gdp_data = gdp_data.droplevel(1, axis=1)
gdp_data.head()

Unnamed: 0,Country/Territory,IMF[4][5],World Bank[6],United Nations[7]
0,Monaco,—,240862,240535
1,Liechtenstein,—,187267,197268
2,Luxembourg,135321,128259,125897
3,Bermuda,—,123091,117568
4,Switzerland,106098,99995,93636


In [31]:
import re

column_name_cleaner = lambda x:re.split(r'\[', x)[0]

gdp_data.columns = gdp_data.columns.map(column_name_cleaner)

gdp_data.head()

Unnamed: 0,Country/Territory,IMF,World Bank,United Nations
0,Monaco,—,240862,240535
1,Liechtenstein,—,187267,197268
2,Luxembourg,135321,128259,125897
3,Bermuda,—,123091,117568
4,Switzerland,106098,99995,93636


## `NLTK` Library for NLP

NLTK (Natural Language Toolkit) is a popular Python library for natural language processing (NLP) and text analysis. It provides a wide range of tools and resources for processing and analyzing human language data. NLTK is widely used in research, education, and industry for various text analysis tasks.

### Key Features and Capabilities of NLTK

* **Tokenization**: Splits text into individual words (word tokenization) or sentences (sentence tokenization).
* **Stop Word Removal**: Provides lists of common words (like "and", "the", "is") in various languages that can be removed from text to reduce noise.
* **Stemming**: Reduces words to their root form (e.g., "running" to "run").
* **Lemmatization**: Similar to stemming, but more sophisticated. It reduces words to their dictionary form using vocabulary and morphological analysis (e.g., "better" to "good").

In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
nltk.download('punkt_tab')

text = "This is a sentence. Here's another one!"
words = word_tokenize(text)
sentences = sent_tokenize(text)

print(words)
print(sentences)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lsi8012\AppData\Roaming\nltk_data...


['This', 'is', 'a', 'sentence', '.', 'Here', "'s", 'another', 'one', '!']
['This is a sentence.', "Here's another one!"]


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [10]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

print(len(stop_words))
print(stop_words)


179
{'when', "didn't", 'don', 'into', 'ma', 'my', 'very', 'her', 'should', "couldn't", 's', 'll', "should've", "hasn't", 'until', 'the', 'through', 'myself', 'hadn', 'off', "it's", "wouldn't", 'we', 'have', 'of', 'not', 'in', 'ourselves', 't', 'whom', 'with', 'at', "shan't", 'other', 'our', 'does', 'such', 'am', 'they', 'no', 'between', 'out', 'you', 'your', "doesn't", 'under', 'theirs', 'than', "you'll", 'won', 'i', 'wasn', 'me', 'm', 'to', 'mightn', "won't", "isn't", 'what', "shouldn't", "that'll", 'ours', 'how', 'yourselves', 'having', 'yourself', 'yours', 'only', 'did', 'has', 'all', 'him', 'hers', 'some', 'so', 'will', 'its', 'there', 'against', "hadn't", 'were', 'then', 'again', 'y', 'doesn', 'it', 'o', 'if', "wasn't", 'on', 'be', 'while', 'who', "weren't", 've', 'aren', "you've", 'further', 'itself', 'do', 'by', 'isn', "you'd", 'because', 'below', 'that', 'for', 'is', 'down', 're', "don't", 'most', 'own', 'doing', 'now', 'herself', 'both', 'why', 'being', "she's", "mustn't", 'be

In [12]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_word = stemmer.stem("running") 

print(stemmed_word)


run


In [14]:
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
lemmatized_word = lemmatizer.lemmatize("running", pos='v')  

print(lemmatized_word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lsi8012\AppData\Roaming\nltk_data...


run


## Most common methods for Cleaning the Data

* Lowecasing the data
* Removing Puncuatations
* Removing Numbers
* Removing extra space
* Removing Stopping words
* Replacing the repetitions of punctations
* Removing Emojis
* Removing emoticons
* Removing Contractions

## Your final project

In [32]:
mcmf_df = pd.read_csv('../Final Project/My_CHI._My_Future._Programs.csv')
print(mcmf_df.shape)
mcmf_df.head()

(227746, 33)


Unnamed: 0,Program ID,Program Name,Description,Org Name,Category Name,Capacity,Min Age,Max Age,Address,City,...,Transport Provided,Has Free Food,Meeting Type,Hidden Programs,Featured,Custom Categories,Tag,Latitude,Longitude,Location
0,121806,English Conversation with Chicago Lawn,"The more we practice, the more we learn a new ...",Chicago Public Library,Academic Support,,18,99,,,...,False,False,online,False,False,,Event,,,
1,88726,Tattoos & Tribes,For millennia tattooing has served as a unique...,Chicago Public Library,Academic Support,0.0,14,18,,,...,False,False,online,False,False,,Event,,,
2,120797,Mi Libro y Yo Spanish Book Club for Kids,Acompae a Mrs. Vicky en el club de lectura pa...,Chicago Public Library,Reading & Writing.,0.0,4,10,,,...,False,False,online,False,False,,Event,,,
3,87932,Majorettes @Sarah Goode,Majorettes @Sarah Goode,After School Matters,Music & Art.,15.0,14,17,,,...,False,False,online,True,False,,Program,,,
4,88052,TeaProv Family,TeaProv Family,After School Matters,Music & Art.,15.0,14,22,,,...,False,False,online,True,False,,Program,,,


### Text Related Features

In [33]:
mcmf_df[['Program Name', 'Description', 'Org Name', 'Category Name']]

Unnamed: 0,Program Name,Description,Org Name,Category Name
0,English Conversation with Chicago Lawn,"The more we practice, the more we learn a new ...",Chicago Public Library,Academic Support
1,Tattoos & Tribes,For millennia tattooing has served as a unique...,Chicago Public Library,Academic Support
2,Mi Libro y Yo Spanish Book Club for Kids,Acompae a Mrs. Vicky en el club de lectura pa...,Chicago Public Library,Reading & Writing.
3,Majorettes @Sarah Goode,Majorettes @Sarah Goode,After School Matters,Music & Art.
4,TeaProv Family,TeaProv Family,After School Matters,Music & Art.
...,...,...,...,...
227741,Baseball Officiating at West Pullman,"Through this activity, participants will learn...",Chicago Park District,Sports + Wellness.
227742,Computer ABCs,Learn computer basics like creating word docum...,Chicago Public Library,Computers.
227743,Fun and Games (Special Rec) at Mann,For individuals with a primary intellectual or...,Chicago Park District,Sports + Wellness.
227744,Dance - Hip Hop at West Pullman,Learn current Hip Hop dances and develop techn...,Chicago Park District,Sports + Wellness.


In [34]:
print(mcmf_df['Org Name'].nunique())
mcmf_df['Org Name'].value_counts()

462


Org Name
Chicago Park District                                133258
Chicago Public Library                                85535
After School Matters                                    704
United States Tennis Association (USTA) - Midwest       494
SGA Youth & Family Services                             315
                                                      ...  
BLAC                                                      1
30th Ward Alderman's Office                               1
Loyola University                                         1
Oakton Community College                                  1
Chicago Botanic Garden                                    1
Name: count, Length: 462, dtype: int64

In [35]:
# extract the org names that have more than 10 programs
orgs = mcmf_df['Org Name'].value_counts()
len(orgs[orgs > 10])
orgs[orgs > 10]

Org Name
Chicago Park District                                133258
Chicago Public Library                                85535
After School Matters                                    704
United States Tennis Association (USTA) - Midwest       494
SGA Youth & Family Services                             315
                                                      ...  
Chicago DNC 2024                                         12
Concordia Place                                          12
Teen Innovators                                          12
Comp-U-Dopt                                              12
Chicago Learning Exchange (CLX)                          11
Name: count, Length: 150, dtype: int64

In [36]:
# remove the rows that org names that have less than 10 programs
mcmf_df = mcmf_df[mcmf_df['Org Name'].isin(orgs[orgs > 10].index)]
mcmf_df.head()

Unnamed: 0,Program ID,Program Name,Description,Org Name,Category Name,Capacity,Min Age,Max Age,Address,City,...,Transport Provided,Has Free Food,Meeting Type,Hidden Programs,Featured,Custom Categories,Tag,Latitude,Longitude,Location
0,121806,English Conversation with Chicago Lawn,"The more we practice, the more we learn a new ...",Chicago Public Library,Academic Support,,18,99,,,...,False,False,online,False,False,,Event,,,
1,88726,Tattoos & Tribes,For millennia tattooing has served as a unique...,Chicago Public Library,Academic Support,0.0,14,18,,,...,False,False,online,False,False,,Event,,,
2,120797,Mi Libro y Yo Spanish Book Club for Kids,Acompae a Mrs. Vicky en el club de lectura pa...,Chicago Public Library,Reading & Writing.,0.0,4,10,,,...,False,False,online,False,False,,Event,,,
3,87932,Majorettes @Sarah Goode,Majorettes @Sarah Goode,After School Matters,Music & Art.,15.0,14,17,,,...,False,False,online,True,False,,Program,,,
4,88052,TeaProv Family,TeaProv Family,After School Matters,Music & Art.,15.0,14,22,,,...,False,False,online,True,False,,Program,,,


In [46]:
print(mcmf_df['Category Name'].nunique())
mcmf_df['Category Name'].value_counts()

21


Category Name
Sports + Wellness.          102071
Music & Art.                 67678
Reading & Writing.           22511
Academic Support              7351
Science                       5973
Computers.                    4836
Building & Fixing Things      3983
Helping Your Community.       2037
Nature.                       1840
Performance.                  1803
Healthcare                    1596
Food.                         1304
Work + Career                 1164
Managing Money.               1148
Digital Media.                 640
Social Studies                 429
Customer/Human Service          91
Math                            47
Teaching                        43
Transportation                  20
Law                             16
Name: count, dtype: int64

In [43]:
# extract the category names that equal to 'Computers'
mcmf_df[mcmf_df['Category Name'] == 'Computers']

Unnamed: 0,Program ID,Program Name,Description,Org Name,Category Name,Capacity,Min Age,Max Age,Address,City,...,Transport Provided,Has Free Food,Meeting Type,Hidden Programs,Featured,Custom Categories,Tag,Latitude,Longitude,Location
4125,76728,CAC: Teen Fellows,<p><strong></strong><strong>The Chicago Archit...,Chicago Architecture Center,Computers,16.0,15,17,,,...,False,True,face_to_face,True,False,,Resource,,,


In [None]:
# set the category name to 'Computers.' for the rows that have category name equal to 'Computers'
mcmf_df.loc[mcmf_df['Category Name'] == 'Computers', 'Category Name'] = 'Computers.'
mcmf_df.loc[mcmf_df['Category Name'] == 'Science & Math', 'Category Name'] = 'Science'

In [47]:
print("category name #:", mcmf_df['Category Name'].nunique())
print("Program name #:", mcmf_df['Program Name'].nunique())

category name #: 21
Program name #: 30006


In [50]:
pd.set_option('display.max_colwidth', None)

mcmf_df['Program Name'].value_counts()

Program Name
Ice Skating - Freestyle Ice (Studio Rink) at McFetridge            2171
Story Time                                                         2163
Preschool Story Time                                               1963
Ice Skating - Freestyle Ice (Main Rink) at McFetridge              1581
Walk/Run (ASMG) - Open at Gately                                   1481
                                                                   ... 
Indigenous Approach to Mental Wellness                                1
Take Flight: Staying  The Course                                      1
Graphic Novel Book Club: The Best We Could Do                         1
Urban Book Discussion: The Game Never Ends                            1
Is that Love in the Air? Short Films with ReelAbilities Chicago       1
Name: count, Length: 30006, dtype: int64

In [51]:
# show the full content
pd.set_option('display.max_colwidth', None)
# extract the program names that have more than 100  characters
mcmf_df[mcmf_df['Program Name'].str.len() > 100]['Program Name']

907                Freedom to Read, Freedom to Create, Freedom to Inspire: Selections from CPLs One Book, Many Interpretations Exhibits
2374                    Homebuyer Education Series: Learn the Tips and Tricks of Homeowners Insurance and the Value of Home Inspections
3183                 Middle School Guía de la Feria de Oficios Especializados de CPS en Español - For pre-registered school groups only
4498                      Non-Fiction Book Discussion - Mad Enchantment: Claude Monet and the Painting of the Water Lilies by Ross King
4548      Overview of Chicago’s Minimum Wage Ordinance, Including Annual Increase and step 1 of the phase out of the Tipped Wage Credit
                                                                      ...                                                              
189802                           Returning to the Source: Black Teachers Centering Justice for Black Students in Chicago Public Schools
190666      Black Metropolis Research Consortium

In [52]:
# using : or  - to split the program name
mcmf_df['programName_before_separator'] = mcmf_df['Program Name'].str.split(r'[:\-]', n=1).str[0]

mcmf_df['programName_before_separator'].value_counts()

programName_before_separator
Ice Skating                             8905
Film Screening                          6189
Basketball                              3402
Park Kids                               2782
Tiny Tot Swim                           2393
                                        ... 
Karaoke Club                               1
After School Bingo                         1
Hot Cocoa with Santa at Valley Forge       1
Thanksgiving Stories                       1
Winter Break Camp at Bessemer              1
Name: count, Length: 21447, dtype: int64

In [53]:
# extract the programName_before_separator that has more than 50 programs
print(mcmf_df['programName_before_separator'].value_counts()[mcmf_df['programName_before_separator'].value_counts() > 50].nunique())

200


In [54]:
mcmf_df['programName_before_separator'].value_counts()[mcmf_df['programName_before_separator'].value_counts() > 50]

programName_before_separator
Ice Skating                    8905
Film Screening                 6189
Basketball                     3402
Park Kids                      2782
Tiny Tot Swim                  2393
                               ... 
Gymnastics Lev. 4                51
Foam Party                       51
Little Artists at Welles         51
Bitty Basketball at Nichols      51
Boxing at Ogden                  51
Name: count, Length: 485, dtype: int64

In [55]:
# group by the category name and programName_before_separator
mcmf_df.groupby(['Category Name', 'programName_before_separator']).size().sort_values(ascending=False)[:30]

Category Name       programName_before_separator  
Sports + Wellness.  Ice Skating                       8905
Music & Art.        Film Screening                    6171
Sports + Wellness.  Basketball                        3400
                    Tiny Tot Swim                     2393
Reading & Writing.  Story Time                        2171
                    Preschool Story Time              1967
                    Adult Book Discussion             1664
Sports + Wellness.  Walk/Run (ASMG)                   1478
Music & Art.        Dance                             1341
Reading & Writing.  Family Story Time                 1334
Sports + Wellness.  Park Kids                         1281
                    Pickleball                        1269
                    Track & Field (ASMG)              1260
                    Tennis Lessons                    1152
                    Stick & Puck at Morgan Park SC    1130
                    Day Camp (ages 6                  1011
Acade

In [56]:
mcmf_df['programName_before_separator'].value_counts()[mcmf_df['programName_before_separator'].value_counts() > 1000]

programName_before_separator
Ice Skating                       8905
Film Screening                    6189
Basketball                        3402
Park Kids                         2782
Tiny Tot Swim                     2393
Story Time                        2171
Preschool Story Time              1972
Adult Book Discussion             1666
Dance                             1539
Walk/Run (ASMG)                   1481
Family Story Time                 1334
Pickleball                        1269
Track & Field (ASMG)              1260
Tennis Lessons                    1152
Stick & Puck at Morgan Park SC    1130
Day Camp (ages 6                  1011
Name: count, dtype: int64

Feel free to explore further!!