# ANALYSIS OF THE COMMENTS

In [1]:
from datetime import timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%%time
#raw_data = pd.read_csv('./data/complete.csv')
scrubbed_data = pd.read_csv('../raw_data/scrubbed.csv', low_memory=False)
scrubbed_data.dtypes

CPU times: user 170 ms, sys: 26.3 ms, total: 196 ms
Wall time: 194 ms


datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

In [3]:
scrubbed_data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [4]:
# Clean text

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode


def clean (text:str):
    
    if isinstance(text,str):
                         
        for punctuation in string.punctuation:
            text = text.replace(punctuation, ' ') # Remove Punctuation

        lowercased = text.lower() # Lower Case
        unaccented_string = unidecode.unidecode(lowercased) # remove accents
        tokenized = word_tokenize(unaccented_string) # Tokenize
        words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
        stop_words = set(stopwords.words('english')) # Make stopword list
        without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words

        return without_stopwords
    
    else:
        
        return 'No comments'
    

scrubbed_data['clean_comments'] = scrubbed_data['comments'].apply(clean)

In [5]:
# Percentage of comments

no_comments_ = scrubbed_data[scrubbed_data['clean_comments'] == "No comments"].clean_comments.count()
print(f"Percentage of rows with comments = {round(1-(no_comments_/scrubbed_data.shape[0]),5)*100}%")

Percentage of rows with comments = 99.981%


In [6]:
scrubbed_data['clean_comments'].head()

0    [event, took, place, early, fall, around, occu...
1    [lackland, afb, tx, lights, racing, across, sk...
2    [green, orange, circular, disc, chester, england]
3    [older, brother, twin, sister, leaving, edna, ...
4    [marine, lt, flying, fighter, attack, aircraft...
Name: clean_comments, dtype: object

In [7]:
# Lemmatize text

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text:str):
    
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]   
    return ' '.join([w for w in lemmatized])

scrubbed_data['lemmatize_comments'] = scrubbed_data['clean_comments'].apply(lemmatize_text)

In [8]:
scrubbed_data['lemmatize_comments'].head()

0    event took place early fall around occurred bo...
1    lackland afb tx light racing across sky amp ma...
2           green orange circular disc chester england
3    older brother twin sister leaving edna theater...
4    marine lt flying fighter attack aircraft solo ...
Name: lemmatize_comments, dtype: object

## Let's identify topics mentioned in the comments using LatentDirichletAllocation

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Number of topics (Clusters) to be identified

N_TOPICS_ = 5; N_NGRAMS_ = 20; N_GRAM_MIN_ = 1; N_GRAM_MAX_ = 2

In [22]:
vectorizer = TfidfVectorizer(ngram_range=(N_GRAM_MIN_, N_GRAM_MAX_)).fit(scrubbed_data['lemmatize_comments'])
data_vectorized = vectorizer.transform(scrubbed_data['lemmatize_comments'])

lda_model = LatentDirichletAllocation(n_components=N_TOPICS_).fit(data_vectorized)
lda_vectors = lda_model.transform(data_vectorized)

In [23]:
topic_dict = {}
for idx, topic in enumerate(lda_model.components_):
        topic_dict[f"topic_{idx}"] = [(vectorizer.get_feature_names()[i], topic[i]) \
                                      for i in topic.argsort()[:-N_NGRAMS_ - 1:-1]]

In [24]:
def topic_list (topic_dict, index):
    
    list = []    
    for i in range(len(topic_dict[index])):    
        list.append(topic_dict[index][i][0])    
    return " | ".join(list)

dict_ = {}

for key in topic_dict.keys():    
    dict_[key] = topic_list (topic_dict, key)
    
comments_df = pd.DataFrame.from_dict(dict_, orient='index').reset_index()\
                                                           .rename(columns={'index':'Topic',0:'Content'})

### Select main topic per sighting

In [25]:
scrubbed_data['main_topic'] = pd.DataFrame(lda_vectors).apply(lambda row: f"topic_{row.argmax()}", axis=1)

for i in np.arange(0,N_TOPICS_):
    scrubbed_data[f"topic_{i}"] = lda_vectors [:,i]

In [26]:
scrubbed_data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,clean_comments,lemmatize_comments,main_topic,topic_0,topic_1,topic_2,topic_3,topic_4
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111,"[event, took, place, early, fall, around, occu...",event took place early fall around occurred bo...,topic_1,0.035158,0.858204,0.035896,0.035285,0.035457
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082,"[lackland, afb, tx, lights, racing, across, sk...",lackland afb tx light racing across sky amp ma...,topic_0,0.8505,0.037074,0.037408,0.03712,0.037899
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667,"[green, orange, circular, disc, chester, england]",green orange circular disc chester england,topic_3,0.049689,0.342339,0.049347,0.508103,0.050522
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833,"[older, brother, twin, sister, leaving, edna, ...",older brother twin sister leaving edna theater...,topic_1,0.034514,0.861046,0.034512,0.034558,0.03537
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611,"[marine, lt, flying, fighter, attack, aircraft...",marine lt flying fighter attack aircraft solo ...,topic_2,0.034353,0.034932,0.861635,0.034829,0.034251


In [27]:
features= ['datetime','city','comments','main_topic']
scrubbed_comments_df_ = scrubbed_data[features]

In [28]:
comments_df.head()

Unnamed: 0,Topic,Content
0,topic_0,light | orange | sky | bright | object | orang...
1,topic_1,light | object | sky | bright | quot | like | ...
2,topic_2,light | object | sky | bright | red | white | ...
3,topic_3,light | object | sky | bright | shaped | orang...
4,topic_4,light | sky | object | moving | bright | orang...


In [29]:
scrubbed_comments_df_.head()

Unnamed: 0,datetime,city,comments,main_topic
0,10/10/1949 20:30,san marcos,This event took place in early fall around 194...,topic_1
1,10/10/1949 21:00,lackland afb,1949 Lackland AFB&#44 TX. Lights racing acros...,topic_0
2,10/10/1955 17:00,chester (uk/england),Green/Orange circular disc over Chester&#44 En...,topic_3
3,10/10/1956 21:00,edna,My older brother and twin sister were leaving ...,topic_1
4,10/10/1960 20:00,kaneohe,AS a Marine 1st Lt. flying an FJ4B fighter/att...,topic_2


## Let's create clusters of topics with their corresponding ngrams

### > look at "scrubbed_comments_topics.csv" & "scrubbed_topics.csv"

In [30]:
comments_df.to_csv('../raw_data/scrubbed_topics.csv',header=True,index=False)
scrubbed_comments_df_.to_csv('../raw_data/scrubbed_comments_topics.csv',header=True,index=False)