In [2]:
# Base directory for the datasets used throughout this notebook.
# Loader cells build file paths with plain string concatenation
# (path_data + '<file name>'), so the value must end with a path separator.
#
# The original machine-specific location is kept as the default so existing
# runs behave exactly as before; set the PATH_DATA environment variable to run
# the notebook against a different directory without editing this cell.
import os

import pandas as pd

path_data = os.path.join(
    os.environ.get(
        'PATH_DATA',
        '/home/nero/Documents/Estudos/DataCamp/Python/Preprocessing_for_Machine_Learning_in_Python/datasets/',
    ),
    '',  # joining with an empty component appends a trailing separator only if one is missing
)

In [3]:
# Load the hiking dataset (JSON file) into a DataFrame.
# path_data is the base-directory string defined in the first cell; the file
# name is appended by plain string concatenation, so path_data has to end with
# a path separator.
data_path = path_data + 'hiking.json'
hiking = pd.read_json(data_path)

# load module
from sklearn.preprocessing import LabelEncoder

In [4]:
# exercise 01

"""
Encoding categorical variables - binary

Take a look at the hiking dataset. There are several columns here that need encoding before they can be modeled, one of which is the Accessible column. Accessible is a binary feature, so it has two values, Y or N, which need to be encoded into 1's and 0's. Use scikit-learn's LabelEncoder method to perform this transformation.
"""

# Instructions

"""

    Store LabelEncoder() in a variable named enc.
    Using the encoder's .fit_transform() method, encode the hiking dataset's "Accessible" column. Call the new column Accessible_enc.
    Compare the two columns side-by-side to see the encoding.

"""

# solution

# Instantiate the encoder that assigns an integer code to each distinct label
enc = LabelEncoder()

# Fit on "Accessible" and store the resulting integer codes in a new column
hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])

# Print the first rows of the encoded and original columns side by side
print(hiking.loc[:, ['Accessible_enc', 'Accessible']].head())

#----------------------------------#

# Conclusion

"""
Nice work! .fit_transform() is a good way to both fit an encoding and transform the data in a single step.
"""

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N


'\nNice work! .fit_transform() is a good way to both fit an encoding and transform the data in a single step.\n'

In [5]:
# Load the cleaned volunteer dataset (CSV) from the shared data directory.
# Later cells read its category_desc, start_date_date and title columns.
volunteer = pd.read_csv(path_data + 'volunteer_clean.csv')

In [6]:
# exercise 02

"""
Encoding categorical variables - one-hot

One of the columns in the volunteer dataset, category_desc, gives category descriptions for the volunteer opportunities listed. Because it is a categorical variable with more than two categories, we need to use one-hot encoding to transform this column numerically. Use pandas' pd.get_dummies() function to do so.
"""

# Instructions

"""

    Call get_dummies() on the volunteer["category_desc"] column to create the encoded columns and assign it to category_enc.
    Print out the .head() of the category_enc variable to take a look at the encoded columns.

"""

# solution

# One-hot encode category_desc: one integer indicator column per category
category_enc = pd.get_dummies(data=volunteer['category_desc'], dtype=int)

# Preview the first five rows of the indicator columns
print(category_enc.head(5))

#----------------------------------#

# Conclusion

"""
Good job! get_dummies() is a simple and quick way to encode categorical variables.
"""

   Education  Emergency Preparedness  Environment  Health   
0          0                       0            0       0  \
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            1       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          1  
1                          0                          1  
2                          0                          1  
3                          0                          0  
4                          0                          0  


'\nGood job! get_dummies() is a simple and quick way to encode categorical variables.\n'

In [7]:
# Load the 5k running-times dataset (CSV) from the shared data directory.
# NOTE(review): the frame printed in the next cell contains an 'Unnamed: 0'
# column, which looks like a row index that was saved into the file — consider
# passing index_col=0 here after confirming against the CSV.
running_times_5k = pd.read_csv(path_data + 'running_times_5k.csv')

In [8]:
# exercise 03

"""
Aggregating numerical features

A good use case for taking an aggregate statistic to create a new feature is when you have many features with similar, related values. Here, you have a DataFrame of running times named running_times_5k. For each name in the dataset, take the mean of their 5 run times.
"""

# Instructions

"""

    Use the .loc[] method to select all rows and the run1 through run5 columns, then take the .mean() across those columns for each row.
    Print the .head() of the DataFrame to see the mean column.

"""

# solution

# Select every row and the label range run1..run5 (inclusive), then average
# across those columns so each row gets its own mean
running_times_5k["mean"] = running_times_5k.loc[:, 'run1':'run5'].mean(axis='columns')

# Preview the first five rows, now including the new "mean" column
print(running_times_5k.head(5))

#----------------------------------#

# Conclusion

"""
Nice work! .loc[] is especially helpful for operating across columns.
"""

   Unnamed: 0   name  run1  run2  run3  run4  run5   mean
0         0.0    Sue  20.1  18.5  19.6  20.3  18.3  19.36
1         1.0   Mark  16.5  17.1  16.9  17.6  17.3  17.08
2         2.0   Sean  23.5  25.1  25.2  24.6  23.9  24.46
3         3.0   Erin  21.7  21.1  20.9  22.1  22.2  21.60
4         4.0  Jenny  25.8  27.1  26.1  26.7  26.9  26.52


'\nNice work! .loc[] is especially helpful for operating across columns.\n'

In [9]:
# exercise 04

"""
Extracting datetime components

There are several columns in the volunteer dataset comprised of datetimes. Let's take a look at the start_date_date column and extract just the month to use as a feature for modeling.
"""

# Instructions

"""

    Convert the start_date_date column into a pandas datetime column and store it in a new column called start_date_converted.
    Retrieve the month component of start_date_converted and store it in a new column called start_date_month.
    Print the .head() of just the start_date_converted and start_date_month columns.

"""

# solution

# Parse the start_date_date values into a pandas datetime column
volunteer["start_date_converted"] = pd.to_datetime(arg=volunteer['start_date_date'])

# Pull the month number out of each parsed date via the .dt accessor
volunteer["start_date_month"] = volunteer['start_date_converted'].dt.month

# Print the first rows of the parsed date next to its extracted month
print(volunteer.loc[:, ['start_date_converted', 'start_date_month']].head())

#----------------------------------#

# Conclusion

"""
Awesome! You can also use attributes like .day to get the day and .year to get the year from datetime columns.
"""

  start_date_converted  start_date_month
0           2011-02-01                 2
1           2011-01-29                 1
2           2011-02-14                 2
3           2011-02-05                 2
4           2011-02-12                 2


'\nAwesome! You can also use attributes like .day to get the day and .year to get the year from datetime columns.\n'

In [10]:
# exercise 05

"""
Extracting string patterns

The Length column in the hiking dataset is a column of strings, but contained in the column is the mileage for the hike. We're going to extract this mileage using regular expressions, and then use a lambda in pandas to apply the extraction to the DataFrame.
"""

# Instructions

"""

    Search the text in the length argument for numbers and decimals using an appropriate pattern.
    Extract the matched pattern and convert it to a float.
    Apply the return_mileage() function to each row in the hiking["Length"] column.

"""

# solution
import re
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    """Extract the first mileage figure from a trail-length description.

    Parameters
    ----------
    length : object
        A value from the ``Length`` column, e.g. ``"0.75 miles"``. It is passed
        through ``str()`` before searching, so non-string values do not raise.

    Returns
    -------
    float or None
        The first decimal number found in the text (``"0.8 miles"`` -> ``0.8``).
        If the text contains no decimal number, the first whole number is used
        instead (``"3 miles"`` -> ``3.0``). ``None`` when the text has no digits.
    """
    text = str(length)

    # Raw strings keep the regex backslashes literal; '\d' inside a plain
    # string literal is an invalid escape sequence that newer Python versions
    # warn about.
    # The decimal pattern is tried first so every input that matched before
    # still yields the same value; the whole-number pattern is only a fallback
    # for lengths written without a decimal point.
    mile = re.search(r'\d+\.\d+', text) or re.search(r'\d+', text)

    # No number anywhere in the text: signal "missing" explicitly
    if mile is None:
        return None

    # group(0) is the full matched text
    return float(mile.group(0))
        
# Run return_mileage on every Length value and keep the results in a new column
hiking["Length_num"] = hiking['Length'].apply(func=return_mileage)

# Print the first rows of the original text next to the extracted number
print(hiking.loc[:, ["Length", "Length_num"]].head())

#----------------------------------#

# Conclusion

"""
Great job! Regular expressions are a useful way to perform text extraction.
"""

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


'\nGreat job! Regular expressions are a useful way to perform text extraction.\n'

In [11]:
# load module
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# exercise 06

"""
Vectorizing text

You'll now transform the volunteer dataset's title column into a text vector, which you'll use in a prediction task in the next exercise.
"""

# Instructions

"""

    Store the volunteer["title"] column in a variable named title_text.
    Instantiate a TfidfVectorizer as tfidf_vec.
    Transform the text in title_text into a tf-idf vector using tfidf_vec.

"""

# solution

# Grab the title column; these strings are the documents to vectorize
title_text = volunteer['title']

# Instantiate the tf-idf vectorizer with its default settings
tfidf_vec = TfidfVectorizer()

# Learn the vocabulary from the titles and build their tf-idf matrix in one step
text_tfidf = tfidf_vec.fit_transform(raw_documents=title_text)

#----------------------------------#

# Conclusion

"""
Nice job. scikit-learn provides several methods for text vectorization.
"""

'\nNice job. scikit-learn provides several methods for text vectorization.\n'

In [13]:
# Display the title Series that was passed to the tf-idf vectorizer in the previous cell
title_text

0                                           Web designer
1          Urban Adventures - Ice Skating at Lasker Rink
2      Fight global hunger and support women farmers ...
3                                          Stop 'N' Swap
4                                   Queens Stop 'N' Swap
                             ...                        
607            Volunteer for NYLAG's Food Stamps Project
608      Iridescent Science Studio Open House Volunteers
609                                    French Translator
610                    Marketing & Advertising Volunteer
611    Volunteer filmmakers to help Mayor's Office wi...
Name: title, Length: 612, dtype: object

In [18]:
# load modules
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Gaussian Naive Bayes classifier instance; it is fitted and scored in the
# text-classification exercise cell that follows.
nb = GaussianNB()

In [20]:
# exercise 07

"""
Text classification using tf/idf vectors

Now that you've encoded the volunteer dataset's title column into tf/idf vectors, you'll use those vectors to predict the category_desc column.
"""

# Instructions

"""

    Split the text_tfidf vector and y target variable into training and test sets, setting the stratify parameter equal to y, since the class distribution is uneven. Notice that we have to run the .toarray() method on the tf/idf vector, in order to get it into the proper format for scikit-learn.
    Fit the X_train and y_train data to the Naive Bayes model, nb.
    Print out the test set accuracy.

"""

# solution

# Target labels: the category description of each volunteer opportunity
y = volunteer["category_desc"]

# Stratified train/test split on the dense tf-idf matrix, so both parts keep
# the class proportions of y; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    text_tfidf.toarray(),
    y,
    stratify=y,
    random_state=42,
)

# Train the Naive Bayes classifier on the training portion
nb.fit(X_train, y_train)

# Report the accuracy on the held-out test portion
print(nb.score(X_test, y_test))

#----------------------------------#

# Conclusion

"""
Nice work! Notice that the model doesn't score very well. We'll work on selecting the best features for modeling in the next chapter.
"""

0.47058823529411764


"\nNice work! Notice that the model doesn't score very well. We'll work on selecting the best features for modeling in the next chapter.\n"