# ANALYSIS OF THE COMMENTS

In [1]:
from datetime import timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%%time
#raw_data = pd.read_csv('./data/complete.csv')
scrubbed_data = pd.read_csv('../raw_data/scrubbed.csv', low_memory=False)
scrubbed_data.dtypes

CPU times: user 181 ms, sys: 18.8 ms, total: 200 ms
Wall time: 198 ms


datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

In [3]:
scrubbed_data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


## Let's identify topics mentioned in the comments

In [4]:
# Clean text

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode


def clean (text:str):
    
    if isinstance(text,str):
                         
        for punctuation in string.punctuation:
            text = text.replace(punctuation, ' ') # Remove Punctuation

        lowercased = text.lower() # Lower Case
        unaccented_string = unidecode.unidecode(lowercased) # remove accents
        tokenized = word_tokenize(unaccented_string) # Tokenize
#         words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
        words_only = [word for word in tokenized] # Keep numbers
        stop_words = set(stopwords.words('english')) # Make stopword list
        without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words

        return without_stopwords
    
    else:
        
        return 'No comments'
    

scrubbed_data['clean_comments'] = scrubbed_data['comments'].apply(clean)

In [5]:
# Percentage of comments

no_comments_ = scrubbed_data[scrubbed_data['clean_comments'] == "No comments"].clean_comments.count()
print(f"Percentage of rows with comments = {round(1-(no_comments_/scrubbed_data.shape[0]),5)*100}%")

Percentage of rows with comments = 99.981%


In [6]:
scrubbed_data['clean_comments'].head()

0    [event, took, place, early, fall, around, 1949...
1    [1949, lackland, afb, 44, tx, lights, racing, ...
2    [green, orange, circular, disc, chester, 44, e...
3    [older, brother, twin, sister, leaving, edna, ...
4    [marine, 1st, lt, flying, fj4b, fighter, attac...
Name: clean_comments, dtype: object

In [7]:
# Lemmatize text

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text:str):
    
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]   
    return ' '.join([w for w in lemmatized])

scrubbed_data['lemmatize_comments'] = scrubbed_data['clean_comments'].apply(lemmatize_text)

In [8]:
scrubbed_data['lemmatize_comments'].head()

0    event took place early fall around 1949 50 occ...
1    1949 lackland afb 44 tx light racing across sk...
2        green orange circular disc chester 44 england
3    older brother twin sister leaving edna theater...
4    marine 1st lt flying fj4b fighter attack aircr...
Name: lemmatize_comments, dtype: object

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
N_TOPICS_ = 10

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(2, 3)).fit(scrubbed_data['lemmatize_comments'])
data_vectorized = vectorizer.transform(scrubbed_data['lemmatize_comments'])

lda_model = LatentDirichletAllocation(n_components=N_TOPICS_).fit(data_vectorized)
lda_vectors = lda_model.transform(data_vectorized)

In [12]:
topic_dict = {}
for idx, topic in enumerate(lda_model.components_):
        topic_dict[f"topic_{idx}"] = [(vectorizer.get_feature_names()[i], topic[i]) \
                                      for i in topic.argsort()[:-20 - 1:-1]]

In [13]:
def topic_list (topic_dict, index):
    
    list = []
    
    for i in range(len(topic_dict[index])):    
        list.append(topic_dict[index][i][0])
    
    return " | ".join(list)

dict_ = {}

for key in topic_dict.keys():    
    dict_[key] = topic_list (topic_dict, key)
    
comments_df = pd.DataFrame.from_dict(dict_, orient='index').reset_index()\
                                                           .rename(columns={'index':'Topic',0:'Content'})

### Select main topic per sighting

In [14]:
scrubbed_data['main_topic'] = pd.DataFrame(lda_vectors).apply(lambda row: f"topic_{row.argmax()}", axis=1)

for i in np.arange(0,N_TOPICS_):
    scrubbed_data[f"topic_{i}"] = lda_vectors [:,i]

In [15]:
scrubbed_data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,...,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,...,0.015673,0.015673,0.015678,0.015673,0.858932,0.015673,0.015675,0.015674,0.015673,0.015676
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,...,0.016366,0.016356,0.016362,0.016356,0.016356,0.01636,0.016357,0.016357,0.852774,0.016356
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,...,0.023316,0.023318,0.023318,0.023319,0.790135,0.02332,0.023323,0.023318,0.023316,0.023318
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,...,0.016699,0.016699,0.016699,0.016699,0.0167,0.016699,0.016699,0.849709,0.016699,0.016699
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,...,0.872803,0.014133,0.014133,0.014133,0.014133,0.014133,0.014133,0.014133,0.014133,0.014133


In [18]:
features= ['datetime','city','lemmatize_comments','main_topic']
scrubbed_comments_df_ = scrubbed_data[features]

In [19]:
comments_df.head()

Unnamed: 0,Topic,Content
0,topic_0,bright light | nuforc note | across sky | shap...
1,topic_1,white light | bright light | light sky | stran...
2,topic_2,bright light | light sky | white light | flash...
3,topic_3,bright light | white light | nuforc note | ora...
4,topic_4,orange light | white light | shaped object | r...


In [17]:
scrubbed_comments_df_.head()

Unnamed: 0,datetime,city,lemmatize_comments,main_topic
0,10/10/1949 20:30,san marcos,event took place early fall around 1949 50 occ...,topic_4
1,10/10/1949 21:00,lackland afb,1949 lackland afb 44 tx light racing across sk...,topic_8
2,10/10/1955 17:00,chester (uk/england),green orange circular disc chester 44 england,topic_4
3,10/10/1956 21:00,edna,older brother twin sister leaving edna theater...,topic_7
4,10/10/1960 20:00,kaneohe,marine 1st lt flying fj4b fighter attack aircr...,topic_0


## Let's create 10 clusters of topics with 20 ngrams ( bigrams or trigrams ) each

### > look at "scrubbed_comments_topics.csv" & "scrubbed_topics.csv"

In [20]:
comments_df.to_csv('../raw_data/scrubbed_topics.csv',header=True,index=False)
scrubbed_comments_df_.to_csv('../raw_data/scrubbed_comments_topics.csv',header=True,index=False)