In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the UFO sightings CSV directly from its hosted URL.
UFO_DATA_URL = "https://assets.datacamp.com/production/repositories/1816/datasets/a5ebfe5d2ed194f2668867603b563963af4769e9/ufo_sightings_large.csv"
ufo = pd.read_csv(UFO_DATA_URL)


In [0]:
# Cast 'seconds' to float and parse 'date' into datetimes in a single step.
ufo = ufo.assign(
    seconds=ufo["seconds"].astype("float"),
    date=pd.to_datetime(ufo["date"]),
)



In [0]:
# Drop missing data: count the gaps with isnull(), then filter with notnull().
required_cols = ["length_of_time", "state", "type"]

# Number of missing values in each required column
print(ufo[required_cols].isnull().sum())

# Keep only rows where length_of_time, state, and type are all present
complete_rows = ufo[required_cols].notnull().all(axis=1)
ufo_no_missing = ufo[complete_rows]


In [0]:
# Extract the leading number from the free-text length_of_time column using a regular expression.
_LEADING_DIGITS = re.compile(r"\d+")


def return_minutes(time_string):
    """Return the integer that starts ``time_string``, or ``None``.

    Only digits at the very beginning of the text are recognised (the
    pattern is applied with ``match``), so ``"5 minutes"`` gives ``5`` while
    ``"about 5 minutes"`` gives ``None``. The unit that follows the number
    is not interpreted.

    Parameters
    ----------
    time_string : str or missing value
        Raw duration text. Non-string input (e.g. ``NaN`` or ``None`` from a
        column with missing entries) is treated as "no number found".

    Returns
    -------
    int or None
        The leading number, or ``None`` when there is none.
    """
    # Missing cells reach this function as float NaN / None; re.match would
    # raise TypeError on them, so bail out early instead.
    if not isinstance(time_string, str):
        return None

    leading = _LEADING_DIGITS.match(time_string)
    if leading is None:
        return None
    return int(leading.group(0))
    
# Run the extractor over every length_of_time entry to build the minutes column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)

# Preview the raw text next to the extracted number
print(ufo[["length_of_time", "minutes"]].head())

In [0]:
# Identify features that need standardization.
# Print the variance of the seconds and minutes columns for comparison, then
# log-transform seconds to compress its scale.
duration_variances = ufo[["seconds", "minutes"]].var()
print(duration_variances)
ufo["seconds_log"] = np.log(ufo["seconds"])


In [0]:
# Encode categorical variables: binary encoding for country, one-hot encoding for type.

# 1 when country equals "us", 0 for every other value (missing values included)
ufo["country_enc"] = (ufo["country"] == "us").astype("int64")

# Number of distinct sighting types (a missing value counts as one entry)
print(ufo["type"].nunique(dropna=False))

# One indicator column per sighting type
type_set = pd.get_dummies(ufo["type"])

# Append the indicator columns to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis="columns")

In [0]:
# Feature engineering on the date column: pull out the calendar month and year.
# The vectorised .dt accessor replaces a per-row Python lambda; it requires the
# column to hold datetimes, which the earlier pd.to_datetime conversion provides.
ufo["month"] = ufo["date"].dt.month
ufo["year"] = ufo["date"].dt.year

In [0]:
# Vectorizing text
# Create the tf-idf vectorizer object
vec = TfidfVectorizer()

# TfidfVectorizer raises on NaN documents, so missing descriptions are replaced
# with an empty string; this keeps one matrix row per DataFrame row.
desc_tfidf = vec.fit_transform(ufo["desc"].fillna(""))


In [0]:
# selecting ideal dataset by dropping redundant features
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    """Return the vocabulary indices of the ``top_n`` heaviest words in one row.

    NOTE(review): this helper was called but not defined anywhere in the
    notebook; it is reconstructed here so the cell runs on a fresh kernel.
    Confirm it matches the intended weighting logic.

    Parameters
    ----------
    vocab : dict
        Maps a column index of ``vector`` to its word.
    original_vocab : dict
        Maps a word to its column index.
    vector : sparse matrix
        Row-sliceable sparse matrix whose row slices expose ``indices`` and
        ``data`` (assumes CSR format, as produced by TfidfVectorizer — TODO confirm).
    vector_index : int
        Row to inspect.
    top_n : int
        Maximum number of words to return.

    Returns
    -------
    list
        Column indices of the highest-weighted words in the row, heaviest first.
    """
    row = vector[vector_index]
    ranked = sorted(zip(row.indices, row.data), key=lambda pair: pair[1], reverse=True)
    return [original_vocab[vocab[col]] for col, _ in ranked[:top_n]]


def words_to_filter(vocab, original_vocab, vector, top_n):
    """Collect the top-weighted word indices across every row of ``vector``.

    For each row, the ``top_n`` highest-weighted words are looked up with
    ``return_weights``; the union of all those indices is returned.

    Parameters
    ----------
    vocab : dict
        Maps a column index of ``vector`` to its word.
    original_vocab : dict
        Maps a word to its column index.
    vector : sparse matrix
        Weight matrix with one row per document.
    top_n : int
        Number of words to keep per row.

    Returns
    -------
    set
        Unique word indices that rank in the top ``top_n`` of at least one row.
    """
    return {
        word_index
        for row_index in range(vector.shape[0])
        for word_index in return_weights(vocab, original_vocab, vector, row_index, top_n)
    }
# Correlation between the raw and derived duration features
print(ufo[["seconds", "seconds_log", "minutes"]].corr())

# Columns removed before modeling
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]
ufo_dropped = ufo.drop(columns=to_drop)

# `vocab` was never defined in the notebook. words_to_filter is passed both it and
# vec.vocabulary_ (word -> index), so it is presumably the inverse index -> word
# lookup — NOTE(review): confirm.
vocab = {index: word for word, index in vec.vocabulary_.items()}
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

In [0]:
# After preprocessing, build a k-nearest-neighbors model to predict which country the
# UFO sighting took place in. The original author notes the dataset is imbalanced,
# so the split is stratified on y.

# X, y and knn were never defined in the notebook; they are built here so the cell
# runs on a fresh kernel. NOTE(review): confirm this is the intended feature set.
# Target: the binary country encoding. Features: every remaining numeric/indicator column.
y = ufo_dropped["country_enc"]
X = ufo_dropped.drop(columns=["country_enc"]).select_dtypes(include=["number", "bool"])
# NOTE(review): KNeighborsClassifier rejects NaN/inf values; confirm the feature
# columns are complete before fitting.

# A fixed seed keeps the split, and therefore the score, reproducible
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y, random_state=42)

# Fit knn to the training sets
knn = KNeighborsClassifier()
knn.fit(train_X, train_y)

# Print the score of knn on the test sets
print(knn.score(test_X, test_y))