In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [9]:
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [18,10]

### Important concepts to remember
- Missing data `dropna()` and `notnull()`
- Types: `astype()`
- Stratified sampling: `train_test_split(X, y, stratify=y)`

In [10]:
ufo = pd.read_csv('data/ufo_sightings_large.csv')

In [11]:
# Check the column types
ufo.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [12]:
# Change the type of seconds to float
ufo["seconds"] = ufo.seconds.astype(float) 

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo.date)

# Check the column types
print(ufo[["seconds", "date"]].dtypes)

seconds           float64
date       datetime64[ns]
dtype: object


In [13]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


- One hot encoding; `pd.get_dummies()`
- `var()`
- `np.log()`

In [19]:
import re

ufo['length_of_time'] = ufo['length_of_time'].astype(str)

In [20]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.search(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
print(ufo[["minutes", "length_of_time"]].head())

   minutes   length_of_time
0      2.0          2 weeks
1     30.0           30sec.
2      NaN              nan
3      5.0  about 5 minutes
4      2.0                2


In [None]:
ufo.sen

In [22]:
# Check the variance of the seconds and minutes columns
print(ufo.seconds.var(), ufo.minutes.var())

31567346180.21459 842.5929293184239


In [24]:
# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

# Print out the variance of just the seconds_log column
print(ufo.seconds_log.var())

nan


In [14]:
!../gitbsh > /dev/null 2>&1