# Preprocessing for Machine Learning in Python

## Introduction to Data Preprocessing

In [1]:
import pandas as pd

volunteer = pd.read_csv('volunteer_opportunities.csv')
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [2]:
volunteer.shape

(665, 35)

In [3]:
volunteer.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

### Class imbalance

In [4]:
volunteer.category_desc.value_counts()

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

### Stratified sampling

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Create a data with all columns except category_desc
volunteer_X = volunteer.drop('category_desc', axis=1)

# Create a category_desc labels dataset
volunteer_y = volunteer[['category_desc']]

# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)

# Print out the category_desc counts on the training y labels
print(y_train['category_desc'].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


In [7]:
y_train.shape

(498, 1)

## Standardizing Data


### Modeling without normalizing

In [8]:
wine = pd.read_csv('wine_types.csv')
wine.head()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [9]:
wine.shape

(178, 14)

In [10]:
y = wine.Type
X = wine[['Proline', 'Total phenols', 'Hue', 'Nonflavanoid phenols']]

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [12]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.7333333333333333


In [13]:
wine.var()

Type                                0.600679
Alcohol                             0.659062
Malic acid                          1.248015
Ash                                 0.075265
Alcalinity of ash                  11.152686
Magnesium                         203.989335
Total phenols                       0.391690
Flavanoids                          0.997719
Nonflavanoid phenols                0.015489
Proanthocyanins                     0.327595
Color intensity                     5.374449
Hue                                 0.052245
OD280/OD315 of diluted wines        0.504086
Proline                         99166.717355
dtype: float64

The Proline column has an extremely high variance.

### Log normalization in Python

In [14]:
import numpy as np

In [15]:
# Print out the variance of the Proline column
print(wine.Proline.var())

# Apply the log normalization function to the Proline column
wine['Proline_log'] = np.log(wine.Proline)

# Check the variance of the Proline column again
print(wine.Proline_log.var())

99166.71735542436
0.17231366191842012


In [16]:
wine.Ash.describe()

count    178.000000
mean       2.366517
std        0.274344
min        1.360000
25%        2.210000
50%        2.360000
75%        2.557500
max        3.230000
Name: Ash, dtype: float64

In [17]:
wine['Alcalinity of ash'].describe()

count    178.000000
mean      19.494944
std        3.339564
min       10.600000
25%       17.200000
50%       19.500000
75%       21.500000
max       30.000000
Name: Alcalinity of ash, dtype: float64

In [18]:
wine.Magnesium.describe()

count    178.000000
mean      99.741573
std       14.282484
min       70.000000
25%       88.000000
50%       98.000000
75%      107.000000
max      162.000000
Name: Magnesium, dtype: float64

### Scaling data - standardizing columns

In [19]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[['Ash', 'Alcalinity of ash', 'Magnesium']]

# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### KNN on non-scaled data

In [20]:
X = wine.drop('Type', axis=1)
y = wine.Type

In [21]:
X = X.drop('Proline_log', axis=1)

In [22]:
X.shape

(178, 13)

In [23]:
knn = KNeighborsClassifier()

In [24]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.6666666666666666


### KNN on scaled data

In [25]:
# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train, y_train)

# Score the model on the test data.
print(knn.score(X_test, y_test))

0.9333333333333333


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## Feature Engineering


In [26]:
hiking = pd.read_json('hiking.json')
hiking.head()

Unnamed: 0,Accessible,Difficulty,Length,Limited_Access,Location,Name,Other_Details,Park_Name,Prop_ID,lat,lon
0,Y,,0.8 miles,N,"Enter behind the Salt Marsh Nature Center, loc...",Salt Marsh Nature Trail,<p>The first half of this mile-long trail foll...,Marine Park,B057,,
1,N,Easy,1.0 mile,N,Enter Park at Lincoln Road and Ocean Avenue en...,Lullwater,Explore the Lullwater to see how nature thrive...,Prospect Park,B073,,
2,N,Easy,0.75 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Midwood,Step back in time with a walk through Brooklyn...,Prospect Park,B073,,
3,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Peninsula,Discover how the Peninsula has changed over th...,Prospect Park,B073,,
4,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Waterfall,Trace the source of the Lake on the Waterfall ...,Prospect Park,B073,,


### Encoding categorical variables - binary

In [27]:
from sklearn.preprocessing import LabelEncoder

# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])

# Compare the two columns
print(hiking[['Accessible', 'Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


### Encoding categorical variables - one-hot

In [43]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer['category_desc'])

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


### Engineering numerical features - taking an average

In [None]:
# Create a list of the columns to average
run_columns = ['run1', 'run2', 'run3', 'run4', 'run5']

# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)

# Take a look at the results
print(running_times_5k)

### Engineering numerical features - datetime

In [31]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer['start_date_date'])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer['start_date_converted'].apply(lambda row: row.month)

# Take a look at the original and new columns
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### Engineering features from strings - extraction

In [44]:
hiking["Length"].dropna(inplace=True)

In [45]:
import re 

# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile("\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking['Length'].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


### Engineering features from strings - tf/idf

In [46]:
volunteer_drop = volunteer[['category_desc','title']]

In [84]:
volunteer_drop.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the title text
title_text = volunteer_drop['title']

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

### Text classification using tf/idf vectors

In [86]:
text_tfidf.toarray().shape

(617, 1089)

In [88]:
# Split the dataset according to the class distribution of category_desc
y = volunteer_drop["category_desc"]

In [89]:
y.shape

(617,)

In [90]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

train_X, test_X, train_y, test_y = train_test_split(text_tfidf.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(train_X, train_y)

# Print out the model's accuracy
print(nb.score(test_X, test_y))

0.5548387096774193


## Selecting features for modeling


### Selecting relevant features

In [92]:
# Create a list of redundant column names to drop
to_drop = ["category_desc", "created_date", "locality", "region", "vol_requests"]

In [93]:
volunteer[to_drop].head()

Unnamed: 0,category_desc,created_date,locality,region,vol_requests
0,,January 13 2011,,NY,50
1,Strengthening Communities,January 14 2011,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,2
2,Strengthening Communities,January 19 2011,,NY,20
3,Strengthening Communities,January 21 2011,,NY,500
4,Environment,January 28 2011,,NY,15


In [94]:
# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of the new dataset
print(volunteer_subset.head())

   opportunity_id  content_id  event_time  \
0            4996       37004           0   
1            5008       37036           0   
2            5016       37143           0   
3            5022       37237           0   
4            5055       37425           0   

                                               title  hits  \
0  Volunteers Needed For Rise Up & Stay Put! Home...   737   
1                                       Web designer    22   
2      Urban Adventures - Ice Skating at Lasker Rink    62   
3  Fight global hunger and support women farmers ...    14   
4                                      Stop 'N' Swap    31   

                                             summary is_priority  category_id  \
0  Building on successful events last summer and ...         NaN          NaN   
1             Build a website for an Afghan business         NaN          1.0   
2  Please join us and the students from Mott Hall...         NaN          1.0   
3  The Oxfam Action Corps is a g

### Checking for correlated features

In [95]:
# Print out the column correlations of the wine dataset
display(wine.corr())

# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "Flavanoids"

# Drop that column from the DataFrame
wine = wine.drop(to_drop, axis=1)

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,Proline_log
Type,1.0,-0.328222,0.437776,-0.049643,0.517859,-0.209179,-0.719163,0.489109,-0.49913,0.265668,-0.617369,-0.78823,-0.633717,-0.569246
Alcohol,-0.328222,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372,0.637325
Malic acid,0.437776,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011,-0.152643
Ash,-0.049643,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626,0.238394
Alcalinity of ash,0.517859,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597,-0.416897
Magnesium,-0.209179,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351,0.424006
Total phenols,-0.719163,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115,0.431205
Nonflavanoid phenols,0.489109,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385,-0.275675
Proanthocyanins,-0.49913,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417,0.290203
Color intensity,0.265668,0.546364,0.248985,0.258887,0.018732,0.19995,-0.055136,0.139057,-0.02525,1.0,-0.521813,-0.428815,0.3161,0.34897


ValueError: labels ['Flavanoids'] not contained in axis

### Exploring text vectors, part 1

In [96]:
vocab = {v:k for k,v in tfidf_vec.vocabulary_.items()}

In [97]:
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

[189, 942, 466]


In [98]:
text_tfidf[8].indices

array([498, 942, 189, 466, 717, 409], dtype=int32)

In [99]:
text_tfidf[8].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [100]:
text_tfidf[8].toarray().shape

(1, 1089)

In [101]:
text_tfidf[8].data

array([0.23125626, 0.40223995, 0.67322092, 0.37801806, 0.27439553,
       0.33661046])

### Exploring text vectors, part 2

In [102]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)

# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]

In [103]:
filtered_text

<617x1006 sparse matrix of type '<class 'numpy.float64'>'
	with 2945 stored elements in Compressed Sparse Row format>

### Training Naive Bayes with feature selection

In [104]:
np = GaussianNB()

# Split the dataset according to the class distribution of category_desc, using the filtered_text vector
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(train_X, train_y)

# Print out the model's accuracy
print(nb.score(test_X, test_y))

0.6


### Using PCA

In [64]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()

wine_X = wine.drop("Type", axis=1)

# Apply PCA to the wine dataset X vector
transformed_X = pca.fit_transform(wine_X)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.98098724e-01 1.73593316e-03 9.43288311e-05 4.89462522e-05
 1.04706853e-05 5.61289777e-06 2.79970917e-06 1.44536069e-06
 9.76685744e-07 3.94374158e-07 2.15136165e-07 9.01993593e-08
 6.27152353e-08]


### Training a model with PCA

In [65]:
knn =  GaussianNB()

In [66]:
y = wine.dropna().Type

In [67]:
transformed_X.shape

(178, 13)

In [68]:
y.shape

(178,)

In [69]:
# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X, y)

# Fit knn to the training data
knn.fit(X_wine_train, y_wine_train)

# Score knn on the test data and print it out
print(knn.score(X_wine_test, y_wine_test))

0.9555555555555556


## Putting it all together


In [70]:
ufo = pd.read_csv('ufo_sightings_large.csv')
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [71]:
ufo.shape

(4935, 11)

In [72]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
date              4935 non-null object
city              4926 non-null object
state             4516 non-null object
country           4255 non-null object
type              4776 non-null object
seconds           4935 non-null float64
length_of_time    4792 non-null object
desc              4932 non-null object
recorded          4935 non-null object
lat               4935 non-null object
long              4935 non-null float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB


In [73]:
# Check the column types
display(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo['seconds'].astype('float')

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo['date'])

# Check the column types
display(ufo[['seconds', 'date']].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

seconds           float64
date       datetime64[ns]
dtype: object

### Dropping missing data

In [74]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[['length_of_time', 'state', 'type']].isnull().sum())

length_of_time    143
state             419
type              159
dtype: int64


In [75]:
# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[
        ufo['length_of_time'].notnull() & 
        ufo['state'].notnull() & 
        ufo['type'].notnull()
]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

(4283, 11)


### Extracting numbers from strings

In [76]:
ufo = ufo_no_missing.copy()

In [77]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,-117.156389


In [78]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo_no_missing["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
ufo[['length_of_time', 'minutes']].head()

Unnamed: 0,length_of_time,minutes
0,2 weeks,2.0
1,30sec.,30.0
3,about 5 minutes,
4,2,2.0
5,10 minutes,10.0


### Identifying features for standardization

In [79]:
ufo.var()

seconds    1.545212e+10
long       4.119751e+02
minutes    9.470577e+02
dtype: float64

In [80]:
import numpy as np

In [81]:
# Check the variance of the seconds and minutes columns
print(ufo[['seconds','minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

seconds    1.545212e+10
minutes    9.470577e+02
dtype: float64


  """


In [82]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long,minutes,seconds_log
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111,2.0,14.0058
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556,30.0,3.401197
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222,,5.703782
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333,2.0,-inf
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,-117.156389,10.0,6.39693


In [106]:
ufo.seconds.max()

6312000.0

### Encoding categorical variables

In [107]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda row: 1 if row == 'us' else 0)

# Print the number of unique type values
print(len(ufo.type.unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo.type)

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

21


In [108]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,...,flash,formation,light,other,oval,rectangle,sphere,teardrop,triangle,unknown
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,...,0,0,0,0,0,0,0,0,0,1
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,...,0,0,0,0,0,0,0,0,0,0
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,...,0,0,0,0,0,0,0,0,1,0
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,...,0,0,0,0,1,0,0,0,0,0
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,...,0,0,1,0,0,0,0,0,0,0


### Features from dates

In [109]:
# Look at the first 5 rows of the date column
print(ufo.date.head())

# Extract the month from the date column
ufo["month"] = ufo["date"].dt.month

# Extract the year from the date column
ufo["year"] = ufo["date"].dt.year

# Take a look at the head of all three columns
print(ufo[['date', 'month', 'year']])

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
Name: date, dtype: datetime64[ns]
                    date  month  year
0    2011-11-03 19:21:00     11  2011
1    2004-10-03 19:05:00     10  2004
3    2002-11-21 05:45:00     11  2002
4    2010-08-19 12:55:00      8  2010
5    2012-06-16 23:00:00      6  2012
6    2009-07-12 21:30:00      7  2009
7    2008-10-20 18:30:00     10  2008
8    2013-06-09 00:00:00      6  2013
9    2013-04-26 23:27:00      4  2013
10   2013-09-13 20:30:00      9  2013
11   2013-02-13 21:00:00      2  2013
12   2012-05-20 00:00:00      5  2012
13   2011-06-02 22:05:00      6  2011
14   1999-09-01 20:30:00      9  1999
15   2013-09-04 21:50:00      9  2013
16   2011-05-24 22:15:00      5  2011
17   1993-05-15 02:30:00      5  1993
18   2009-06-22 21:00:00      6  2009
19   2008-09-22 17:40:00      9  2008
20   1984-11-15 18:00:00     11  1984
21   2010-03-11 19:00:00      3  2010
22   1931-

### Text vectorization

In [110]:
# Take a look at the head of the desc field
print(ufo['desc'].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo.desc)

# Look at the number of columns this creates
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
5    Dancing lights that would fly around and then ...
Name: desc, dtype: object
(4283, 5754)


### Selecting the ideal dataset

In [112]:
vocab = {v:k for k,v in vec.vocabulary_.items()}

In [113]:
# Check the correlation between the seconds, seconds_log, and minutes columns
display(ufo[['seconds', 'seconds_log', 'minutes']].corr())

# Make a list of features to drop
to_drop = ['city', 'country', 'lat','long', 'state', 'seconds', 'minutes', 'date', 'desc', 'length_of_time', 'recorded']

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

Unnamed: 0,seconds,seconds_log,minutes
seconds,1.0,0.174331,-0.009932
seconds_log,0.174331,1.0,0.11146
minutes,-0.009932,0.11146,1.0


### Modeling the UFO dataset, part 1

In [115]:
ufo_dropped.columns

Index(['type', 'seconds_log', 'country_enc', 'changing', 'chevron', 'cigar',
       'circle', 'cone', 'cross', 'cylinder', 'diamond', 'disk', 'egg',
       'fireball', 'flash', 'formation', 'light', 'other', 'oval', 'rectangle',
       'sphere', 'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')

In [116]:
X = ufo_dropped[['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year']]

In [117]:
y = ufo_dropped['country_enc']

In [118]:
X.shape, y.shape

((4283, 24), (4283,))

In [None]:
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X,y, stratify=y)

# Fit knn to the training sets
knn.fit(train_X, train_y)

# Print the score of knn on the test sets
print(knn.score(test_X, test_y))

### Modeling the UFO dataset, part 2

In [None]:
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

# Split the X and y sets using train_test_split, setting stratify=y 
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit nb to the training sets
nb.fit(train_X, train_y)

# Print the score of nb on the test sets
print(nb.score(test_X, test_y))

This is a clear case where iteration would be necessary to figure out what subset of text improves the model, and if perhaps any of the other features are useful in predicting type