In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [46]:
# Load the volunteer opportunities dataset from a CSV file located in the
# notebook's working directory.
volunteer = pd.read_csv('volunteer_opportunities.csv')

In [40]:
# Preview the first few rows to get a feel for the columns and their values.
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [41]:
# Summarize every column's dtype and non-null count.
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

In [57]:
# Count the missing values in every column of the DataFrame
# (category_desc is one of the columns with gaps).
volunteer.isna().sum()

opportunity_id          0
content_id              0
vol_requests            0
event_time              0
title                   0
hits                    0
summary                 0
is_priority           603
category_id            48
category_desc          48
amsl                  665
amsl_unit             665
org_title               0
org_content_id          0
addresses_count         0
locality               70
region                  0
postalcode              6
primary_loc           665
display_url             0
recurrence_type         0
hours                   0
created_date            0
last_modified_date      0
start_date_date         0
end_date_date           0
status                  0
Latitude              665
Longitude             665
Community Board       665
Community Council     665
Census Tract          665
BIN                   665
BBL                   665
NTA                   665
dtype: int64

In [58]:
# Count the missing entries in the category_desc column only.
volunteer["category_desc"].isna().sum()

48

In [62]:
# Keep only the rows that have a category_desc value, using a boolean mask.
volunteer_subset = volunteer.loc[volunteer["category_desc"].notna()]

In [60]:
# Show the (rows, columns) shape of the subset to see how many rows
# survived the category_desc filter.
volunteer_subset.shape

(617, 35)

In [63]:
# Re-inspect dtypes and non-null counts on the filtered subset.
volunteer_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 617 entries, 1 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      617 non-null    int64  
 1   content_id          617 non-null    int64  
 2   vol_requests        617 non-null    int64  
 3   event_time          617 non-null    int64  
 4   title               617 non-null    object 
 5   hits                617 non-null    int64  
 6   summary             617 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           617 non-null    object 
 13  org_content_id      617 non-null    int64  
 14  addresses_count     617 non-null    int64  
 15  locality            552 non-null    object 
 16  region  

- Remember that you can use boolean indexing to effectively subset DataFrames.

In [43]:
# List each column's dtype; the DataFrame contains int64, float64 and
# object columns.
volunteer.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

In [34]:
# Peek at the first few values of the hits column.
volunteer["hits"].head()

0    737
1     22
2     62
3     14
4     31
Name: hits, dtype: int64

In [35]:
# Use astype to convert between a variety of types.
# Here the hits column is cast to 'int' and written back onto the same
# DataFrame column.
# NOTE(review): hits is listed as int64 before this cast and the recorded
# dtype afterwards is int32, so 'int' narrowed the column in the environment
# that produced these outputs — consider an explicit width such as 'int64'
# if a fixed dtype is required.
volunteer["hits"] = volunteer['hits'].astype('int')

In [37]:
# Confirm the dtype of the hits column after the conversion.
volunteer["hits"].dtype

dtype('int32')

In [67]:
# Class balance of the label: Emergency Preparedness and Environment each
# occur fewer than 50 times.
volunteer_subset["category_desc"].value_counts()

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

In [68]:
# Feature matrix: every column of the subset except the category_desc label.
# NOTE(review): category_id is still present and looks like a numeric code
# for the same label — confirm and drop it before fitting a model.
volunteer_X = volunteer_subset.drop(columns="category_desc")
volunteer_X

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,amsl,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,,...,February 05 2011,approved,,,,,,,,
5,5056,37426,15,0,Queens Stop 'N' Swap,135,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,,...,February 12 2011,approved,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,5640,50193,3,0,Volunteer for NYLAG's Food Stamps Project,197,"Volunteers needed to file for fair hearings, d...",,2.0,,...,November 15 2012,approved,,,,,,,,
661,5218,38711,10,0,Iridescent Science Studio Open House Volunteers,113,Come out to the South Bronx to help us hold ou...,,1.0,,...,April 13 2011,approved,,,,,,,,
662,5541,47820,1,0,French Translator,145,Volunteer needed to translate written material...,,2.0,,...,September 01 2011,approved,,,,,,,,
663,5398,40722,2,0,Marketing & Advertising Volunteer,330,World Cares Center is looking for individuals ...,,1.0,,...,May 31 2012,approved,,,,,,,,


In [69]:
# Label frame: a one-column DataFrame holding only category_desc.
volunteer_y = volunteer_subset.loc[:, ["category_desc"]]
volunteer_y

Unnamed: 0,category_desc
1,Strengthening Communities
2,Strengthening Communities
3,Strengthening Communities
4,Environment
5,Environment
...,...
660,Helping Neighbors in Need
661,Strengthening Communities
662,Helping Neighbors in Need
663,Strengthening Communities


In [70]:
# Hold out a test set, stratifying on the label so every category keeps the
# same proportion in the training and test partitions.
# A fixed random_state makes the split repeatable across kernel restarts;
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    volunteer_X,
    volunteer_y,
    stratify=volunteer_y,
    random_state=42,
)

In [80]:
# Check the per-category counts in the training labels to verify that the
# stratified split preserved the class proportions.
y_train["category_desc"].value_counts()

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64

In [None]:
# Standardization is a preprocessing task performed on numerical, continuous data.

In [81]:
# Split the dataset and labels into training and test sets.
# The original call passed `X` and `y`, which are never defined in this
# notebook and raised a NameError; the only feature/label pair built above
# is volunteer_X / volunteer_y, so those are used here.
# NOTE(review): swap in the intended dataset if this section was meant to
# work on different data.
X_train, X_test, y_train, y_test = train_test_split(
    volunteer_X,
    volunteer_y,
    random_state=42,
)