In [1]:
import json as json
import numpy as np
import pandas as pd

Read json containing title and description. See parse_metadata.py. Currently only processing title and description to reduce memory consumption

In [2]:
filename='nasa_text2.json'
mytext = pd.read_json(filename)

Peek json contents

In [3]:
mytext.tail()

Unnamed: 0,description,title
32084,Prognostics methodologies determine the health...,Implementation of Prognostic Methodologies to ...
32085,Studying and analyzing the ageing mechanisms o...,DIAGNOSTIC/PROGNOSTIC EXPERIMENTS FOR CAPACITO...
32086,This paper discusses our initial efforts in co...,Prognostic Techniques for Capacitor Degradatio...
32087,This document outlines NASA's IT management an...,OCIO FITARA Common Baseline Implementation Pla...
32088,This is an API for the Earth Polychromatic Ima...,EPIC Daily Blue Marble API


Count number of rows 

In [26]:
num_rows = mytext.title.count()
print num_rows

32089


Since processing all rows in the mytext dataframe kills the jupyter kernel due to large memory consumption, I'm taking only 50 random rows for now 

In [28]:
import random
random_indices = random.sample(xrange(num_rows), 50)
print random_indices

[29397, 13510, 3719, 5484, 22113, 2468, 929, 24203, 11467, 754, 22888, 27960, 19439, 15474, 19383, 10916, 10801, 6153, 22037, 12571, 4534, 23975, 18252, 28541, 16950, 10785, 1284, 18320, 3690, 622, 9952, 674, 16291, 24788, 31365, 28933, 19653, 907, 139, 15680, 15223, 7260, 24489, 29647, 3406, 9174, 22735, 1635, 31572, 1810]


In [68]:
text_sample = mytext.take(random_indices)

Download NLTK stopwords (only need this once)

In [64]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alexisc/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [65]:
from nltk.corpus import stopwords
nltk_stop = stopwords.words('english')

Get top unique words in the 'title' field, not including nltk stop words.

In [69]:
sample_title_count = text_sample['title'].apply(lambda x : pd.value_counts(x.split(" "))).sum(axis=0)

In [91]:
non_stop_keys = sample_title_count[~sample_title_count.keys().isin(nltk_stop)]
print non_stop_keys 

(1.25x1.25L42)     1
(AIRS+AMSU+HSB)    1
(AIRS-only)        1
(C3PO)             1
(eta               1
0.25x0.25          1
1.0x1.0            1
1x1                1
2                  1
2/3x1/2L73)        1
2001-2004          1
3                  1
3-Hourly           1
3D                 2
4                  1
...
air             1
associated      1
colonization    1
coord,          1
deg             3
duration        1
flight          1
km              1
long            1
physical        2
plants          1
quality         1
retrieval       2
space           2
standard        1
Length: 239, dtype: float64


In [92]:
non_stop_keys.sort(ascending=False)

In [93]:
print non_stop_keys

Project     12
Data         6
BOREAS       5
Daily        5
Grid         4
Global       4
V005         3
LBA-ECO      3
Level        3
Water        3
Control      3
V003         3
deg          3
OMI/Aura     3
Soil         3
...
PDS                    1
Outputs                1
Optimization           1
NS001                  1
Observation            1
OSPO                   1
OCTS_L3m_MO_POC_9km    1
OCTS_L3b_SNWI_PIC      1
OCEAN                  1
Non                    1
Nitrogen               1
Nighttime              1
New                    1
Near                   1
(1.25x1.25L42)         1
Length: 239, dtype: float64


Get top unique words in the 'description' field, not including nltk stop words.

In [103]:
sample_desc_count = text_sample['description'].apply(lambda x : pd.value_counts(x.split(" "))).sum(axis=0)

In [105]:
desc_non_stop_keys = sample_desc_count[~sample_desc_count.keys().isin(nltk_stop)]
desc_non_stop_keys.sort(ascending=False)
print desc_non_stop_keys 

            981
data         80
The          68
product      33
\n           28
MODIS        27
This         21
global       19
contains     18
NASA         17
Data         17
These        16
based        16
Earth        16
daily        15
...
documentation     1
documentation,    1
double-quoted     1
drag              1
drastically       1
draw              1
due               1
duration          1
speculation       1
easily            1
eddy              1
spectrum          1
effective,        1
effects           1
deviation         1
Length: 2308, dtype: float64


Remove additional stop words from previous exploration

In [106]:
my_stop_words = ['the','and','of','to','in','a','for','is','The','are','that','will','from','at','on','product','with','\n','be','were','by','This','was','this','or','These','over','other','used','use','files','as','file']

In [107]:
desc_non_stop_keys = desc_non_stop_keys[~desc_non_stop_keys.keys().isin(my_stop_words)]
desc_non_stop_keys.sort(ascending=False)
print desc_non_stop_keys 


              981
data           80
MODIS          27
global         19
contains       18
NASA           17
Data           17
based          16
Earth          16
daily          15
gridded        13
Phase          13
instrument     13
Aqua           12
design         12
...
modifications    1
modify           1
modular,         1
moisture,        1
momentum         1
monitoring       1
monitoring,      1
monoxide         1
months           1
model.           1
moons            1
moving           1
much             1
multi-channel    1
CMOS             1
Length: 2299, dtype: float64
