In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide plot appearance: seaborn's dark grid theme, then a large
# default figure size given as [width, height] in inches.
sns.set_style('darkgrid')
mpl.rcParams.update({'figure.figsize': [18, 10]})

### Important concepts to remember
- Missing data: `dropna()` and `notnull()`
- Types: `astype()`
- Stratified sampling: `train_test_split(X, y, stratify=y)`

In [3]:
# Load the raw UFO sightings table; the path is relative to the notebook's
# working directory.
ufo = pd.read_csv(filepath_or_buffer='data/ufo_sightings_large.csv')

In [4]:
# Check the column types
# Shows the dtype of every column in `ufo`. Because this is the last
# expression in the cell, the resulting Series is displayed as the cell output.
# Columns reported as `object` are not stored as a numeric or datetime type.
ufo.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [8]:
# Store the sighting duration in seconds as a 64-bit float
ufo["seconds"] = ufo["seconds"].astype("float64")

# Parse the date column into pandas datetime values
ufo["date"] = pd.to_datetime(ufo["date"])

# Confirm the dtypes of the two converted columns
print(ufo.dtypes[["seconds", "date"]])

seconds           float64
date       datetime64[ns]
dtype: object


In [14]:
# Columns that must be populated for a sighting to be kept
required_cols = ["length_of_time", "state", "type"]

# Report how many values are missing in each of those columns
print(ufo[required_cols].isnull().sum())

# Retain only the rows in which every required column is non-null
has_all_required = ufo[required_cols].notnull().all(axis=1)
ufo_no_missing = ufo[has_all_required]

# Show the (rows, columns) shape of the filtered dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [28]:
# NOTE(review): runs the `../gitbsh` executable (located one directory above the
# working directory) through the shell, discarding both stdout and stderr.
# What the script does is not visible from this notebook — presumably a git
# helper; confirm, and consider removing this cell before sharing, since it is
# unrelated to the analysis and fails silently if the script is missing.
!../gitbsh > /dev/null 2>&1