In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide plot defaults: seaborn's dark grid theme and a large canvas.
sns.set_style('darkgrid')
mpl.rc('figure', figsize=[18, 10])

### What is feature engineering?

- Creation of new features based on existing features
- Insight into relationships between features
- Extract and expand data
- Dataset dependent

***
### Encoding categorical variables


In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read the hiking trails JSON into a DataFrame and preview its first three rows.
hiking = pd.read_json('data/hiking.json')
hiking.head(n=3)

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,


In [5]:
# Integer-encode the "Accessible" flag: learn the classes first, then map them.
enc = LabelEncoder()
enc.fit(hiking['Accessible'])
hiking['Accessible_enc'] = enc.transform(hiking['Accessible'])

# Show the raw labels side by side with their integer codes
print(hiking.loc[:, ['Accessible', 'Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


In [6]:
# Load the volunteer opportunities CSV into a DataFrame
volunteer = pd.read_csv('data/volunteer_opportunities.csv')

In [7]:
# One-hot encode category_desc: one indicator column per distinct category
category_enc = pd.get_dummies(volunteer['category_desc'])

# Preview the indicator columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


### Engineering numerical features

In [8]:
# Parse the raw start_date_date strings into pandas datetimes
volunteer["start_date_converted"] = pd.to_datetime(volunteer['start_date_date'])

# Pull the month number out with the vectorized .dt accessor instead of a
# per-row Python lambda; the values are the same month numbers (1-12).
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month

# Take a look at the converted and new month columns
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### Engineering features from text
#### Extraction

In [9]:
import re

# Sample text containing a decimal number to pull out
my_string = "temperature: 75.6 F"

# One or more digits, a literal dot, then one or more digits
pattern = re.compile(r"\d+\.\d+")

# Search through the compiled pattern's own method; the match object is the cell output
pattern.search(str(my_string))

<_sre.SRE_Match object; span=(13, 17), match='75.6'>

In [10]:
# Keep the match object, then print only the matched text
temp = pattern.search(str(my_string))

print(temp.group())

75.6


In [11]:
# Replace every Length value with its str() form (a float NaN becomes the
# text 'nan'), so that string-based operations can run on the whole column.
hiking['Length'] = hiking['Length'].apply(str)

In [13]:
# Extract the numeric distance from a free-text trail length
def return_mileage(length):
    """Return the first numeric distance found in a trail-length string.

    A decimal number such as ``0.75`` anywhere in the text is preferred, so
    every string that already yielded a value keeps the same result. When no
    such decimal exists, a bare integer (``"2 miles"``) or a decimal without
    a leading zero (``".5 miles"``) is accepted instead of being dropped.

    Parameters
    ----------
    length : str
        Free-text length, e.g. ``"0.8 miles"``.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the text contains no digits.
    """
    # Strict decimal first, then the looser fallback for integers and
    # leading-dot values; the first pattern that matches wins.
    for regex in (r"\d+\.\d+", r"\.\d+|\d+"):
        mile = re.search(regex, length)
        if mile is not None:
            return float(mile.group(0))
    return None
        
# Parse every Length entry into a numeric column and compare it with the raw text
hiking["Length_num"] = hiking['Length'].apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


### Vectorizing text

- tf = term frequency
- idf = inverse document frequency

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the volunteer data, keeping only rows that have a category label
volunteer = (
    pd.read_csv('data/volunteer_opportunities.csv')
    .dropna(subset=['category_desc'])
)

# The free-text summaries are the documents to vectorize
documents = volunteer['summary']

documents.head()

1               Build a website for an Afghan business
2    Please join us and the students from Mott Hall...
3    The Oxfam Action Corps is a group of dedicated...
4    Stop 'N' Swap reduces NYC's waste by finding n...
5    Stop 'N' Swap reduces NYC's waste by finding n...
Name: summary, dtype: object

In [24]:
# Fit a TF-IDF vectorizer with default settings on the summaries and keep the
# transformed result.
# NOTE: tfidf_vec and text_tfidf are both reassigned by the title-based cell
# further down, so the classifier is trained on titles, not on this matrix.
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(documents)

### Text classification using naive Bayes
$$P(A|B)=\frac{P(B|A)P(A)}{P(B)}$$

In [25]:
# Use the opportunity titles as the text to classify
title_text = volunteer['title']

# Build a TF-IDF vectorizer with default settings
tfidf_vec = TfidfVectorizer()

# Learn the vocabulary from the titles and convert them to tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

In [26]:
# Sanity check: count of rows whose category label is missing
volunteer['category_desc'].isnull().sum()

0

In [28]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings
nb = GaussianNB()

In [29]:
# Target labels: the category of each opportunity
y = volunteer["category_desc"]

# Stratified split so train and test keep the class proportions of y.
# The tf-idf matrix is densified with .toarray() before being handed to the
# classifier; random_state pins the shuffle so the score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    text_tfidf.toarray(), y, stratify=y, random_state=42
)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.5225806451612903


In [None]:
# NOTE(review): runs something named gitbsh from the parent directory and discards
# its stdout and stderr; what it does is not visible in this notebook — confirm
# whether this cell belongs in the final version.
!../gitbsh > /dev/null 2>&1