In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Default canvas for every figure: 18 wide by 10 high.
plt.rcParams['figure.figsize'] = [18, 10]
# Seaborn's dark-grid theme for all subsequent plots.
sns.set_style('darkgrid')

### Important concepts to remember
- Missing data: `dropna()` and `notnull()`
- Types: `astype()`
- Stratified sampling: `train_test_split(X, y, stratify=y)`

In [3]:
# Read the sightings CSV (path is relative to the notebook's working directory).
ufo = pd.read_csv(filepath_or_buffer='data/ufo_sightings_large.csv')

In [4]:
# Display the dtype pandas inferred for every column of the freshly loaded
# frame, so the columns that still need converting can be identified.
ufo.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [5]:
# Make sure the duration column is stored as floating point
ufo["seconds"] = ufo["seconds"].astype(float)

# Parse the raw date values into pandas datetimes
ufo["date"] = pd.to_datetime(ufo["date"])

# Confirm the resulting dtypes of the two converted columns
print(ufo.dtypes[["seconds", "date"]])

seconds           float64
date       datetime64[ns]
dtype: object


In [6]:
# Count the missing entries in each of the three columns of interest
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Discard every row that lacks a value in any of those three columns
ufo_no_missing = ufo.dropna(subset=["length_of_time", "state", "type"])

# Report (rows, columns) of the filtered frame
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


- One-hot encoding: `pd.get_dummies()`
- Variance: `var()`
- Log normalization: `np.log()`

In [7]:
import re

# Convert every length_of_time entry to a string (presumably so the regex
# extraction that follows never receives a non-string value).
# NOTE(review): in the captured output, the missing entry in row 2 shows up as
# the literal text 'nan' after this cast; a string like that is no longer
# reported by isnull()/notnull() — confirm that is intended.
ufo['length_of_time'] = ufo['length_of_time'].astype(str)

In [8]:
def return_minutes(time_string):
    """Return the first run of digits in ``time_string`` as an int.

    Despite the name, no unit conversion is performed: "2 weeks" and
    "2 minutes" both yield 2.

    Parameters
    ----------
    time_string : str
        Free-text duration, e.g. "about 5 minutes".

    Returns
    -------
    int or None
        The first integer found, or None when the input contains no digits
        or is not a string (for example a float NaN from a missing value).
    """
    # Missing values can arrive as float NaN or None; re.search would raise
    # TypeError on them, so treat them as "no number found".
    if not isinstance(time_string, str):
        return None

    # search (not match) so digits anywhere in the text are found, not only
    # at the start. The re module caches the compiled pattern internally.
    found = re.search(r"\d+", time_string)
    return int(found.group(0)) if found is not None else None
        
# Derive the numeric column by running the extractor over every entry
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)

# Show the first rows of the new column next to its source text
print(ufo.head()[["minutes", "length_of_time"]])

   minutes   length_of_time
0      2.0          2 weeks
1     30.0           30sec.
2      NaN              nan
3      5.0  about 5 minutes
4      2.0                2


In [13]:
# Display the dtype of the seconds column
ufo["seconds"].dtype

dtype('float64')

In [14]:
# Compare the spread of the two duration columns (seconds first, then minutes)
print(ufo["seconds"].var(), ufo["minutes"].var())

31567346180.21459 842.5929293184239


In [26]:
# Natural-log transform of seconds to compress its very large range.
# np.log maps 0 to -inf and negative inputs to NaN.
ufo["seconds_log"] = np.log(ufo.seconds)

In [27]:
# Turn the +/-inf values left by the log transform into NaN, then carry the
# previous row's value forward into each gap. A NaN at the very start of the
# column has nothing before it and stays NaN.
# .ffill() is the supported spelling: fillna(method='pad') is deprecated in
# pandas 2.1+ and behaves identically.
ufo['seconds_log'] = ufo['seconds_log'].replace([np.inf, -np.inf], np.nan).ffill()

In [28]:
# Variance of the log-transformed duration only
print(ufo["seconds_log"].var())

4.874146416206326


In [30]:
# Binary flag: 1 where country is exactly 'us', 0 for anything else
# (missing values compare unequal and therefore become 0 as well)
ufo["country_enc"] = (ufo["country"] == 'us').astype('int64')

In [31]:
# How many distinct values the type column holds
# (unique() keeps NaN as a value, so a missing type adds one to the count)
print(ufo["type"].unique().size)

22


In [32]:
# Expand the type column into one indicator column per category
type_set = pd.get_dummies(ufo["type"])

# Append the indicator columns to the right of the existing ufo columns
ufo = pd.concat((ufo, type_set), axis="columns")

In [36]:
# Preview the first 5 parsed datetimes
print(ufo["date"].head())

# Extract month and year through the vectorized .dt accessor instead of a
# per-row Python lambda. The values are the same; depending on the pandas
# version the integer width of the result may be int32 rather than int64.
ufo["month"] = ufo["date"].dt.month
ufo["year"] = ufo["date"].dt.year

# Show the source column alongside the two derived ones
print(ufo[["date", "month", "year"]].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
2   2009-09-25 21:00:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010


In [11]:
# Run the external `../gitbsh` command through the shell, discarding both
# stdout and stderr.
# NOTE(review): what this command does is not visible from this notebook, and
# it is unrelated to the analysis above — confirm it is needed, or remove it.
!../gitbsh > /dev/null 2>&1