In [5]:
%load_ext Cython
import json
import pandas as pd
from os import listdir
from os.path import isfile, join
from functools import reduce


def get_paths(directory_path):
    '''
        input: directory that stores twitter data
        output: paths of all twitter json. e.g., 2016-06-21/2016-06-21:18:34:02
    '''
    paths = []
    for file_date in listdir(directory_path):
        if file_date[0]!='.':
            if file_date[0:1] =='2':
                for file_hour in listdir(join(directory_path,file_date)):
                    if file_hour[0]!='.':
                        file_hour = file_hour.split('.json')[0]  
                        paths.append(file_date+'/'+file_hour)
                    else:pass
            else:pass 
        else:pass 
    return paths
    
    
def parse_attr(datajson, attr):
    '''
        input: level0-level1-level2, e.g., 'id', 'retweeted_status-created_at','retweeted_status-user-statuses_count'
        output: attr's list
        attention: attr-'entities-hashtags' has special structure
    '''
    attr_split = attr.split('-')
    
    if len(attr_split)==1:
        return datajson[attr_split[0]]
    elif len(attr_split)==2:   
        # deal with some special cases, e.g., entities-hashtags
        values = datajson[attr_split[0]][attr_split[1]]
        if(attr=='entities-hashtags'):
            hashtags = [hashtag['text'] for hashtag in values]
            if(len(hashtags)!=0):
                return reduce((lambda x,y: x+', '+y),hashtags )
            else:
                return ''
        else:    
            return values   
    elif len(attr_split)==3:
        return datajson[attr_split[0]][attr_split[1]][attr_split[2]]
    
    else:pass
    
    
paths = get_paths('../twitter')
for path in paths:
    with open('../twitter/'+path+'.json', 'r+') as r:
        data = []
        columns = ['id','geo','text',
                   'entities-hashtags',
                   'created_at',
                   'retweeted_status-created_at',
                   'retweeted_status-retweet_count',
                   'retweeted_status-favorite_count',
                   'retweeted_status-user-friends_count',
                   'retweeted_status-user-favourites_count',
                   'retweeted_status-user-statuses_count',
                   'retweeted_status-user-followers_count',
                   'user-friends_count','user-favourites_count','user-statuses_count',
                   'user-geo_enabled','user-followers_count','user-location']
        
        for index, line in enumerate(r.readlines()):
            data.append([]) #i.e., create data[index] = []
            d = json.loads(line) 
            for attr in columns:
                try:
                    attr_value = parse_attr(d,attr)
                except KeyError:
                    attr_value = 'NULL'
                data[index].append(attr_value)
                
                
        # build json that is used for pd.read_json function
        json_for_pd = dict()
        json_for_pd['columns'] = columns
        json_for_pd['index'] = [i for i in range(len(data))]
        json_for_pd['data'] = data
        # write json
        with open('../cleaned_twitter/'+path[:11]+'c-'+path[11:].replace('/','-').replace(':','-')+'.json','w+') as w:
            w.write(json.dumps(json_for_pd))
        print('Finish: {}'.format(path))
        
print('Finish ALL')        


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython
2016-06-21/2016-06-21:18:34:02
Finish: 2016-06-21/2016-06-21:18:34:02
2016-06-21/2016-06-21:16:34:01
Finish: 2016-06-21/2016-06-21:16:34:01
2016-06-21/2016-06-21:21:34:01
Finish: 2016-06-21/2016-06-21:21:34:01
2016-06-21/2016-06-21:22:34:02
Finish: 2016-06-21/2016-06-21:22:34:02
2016-06-21/2016-06-21:07:34:01
Finish: 2016-06-21/2016-06-21:07:34:01
2016-06-21/2016-06-21:09:34:02
Finish: 2016-06-21/2016-06-21:09:34:02
2016-06-21/2016-06-21:13:34:01
Finish: 2016-06-21/2016-06-21:13:34:01
2016-06-21/2016-06-21:23:34:01
Finish: 2016-06-21/2016-06-21:23:34:01
2016-06-21/2016-06-21:14:34:01
Finish: 2016-06-21/2016-06-21:14:34:01
2016-06-21/2016-06-21:08:34:01
Finish: 2016-06-21/2016-06-21:08:34:01
2016-06-21/2016-06-21:19:34:01
Finish: 2016-06-21/2016-06-21:19:34:01
2016-06-21/2016-06-21:20:34:02
Finish: 2016-06-21/2016-06-21:20:34:02
2016-06-21/2016-06-21:05:34:01
Finish: 2016-06-21/2016-06-21:05:34:01
2016-06-21

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_json('../cleaned_twitter/2016-06-21/c-2016-06-21-18-34-02.json', orient='split')

print(df.head(5))



                   id   geo  \
0  745308821631995904  None   
1  745308821690683393  None   
2  745308824224030720  None   
3  745308824408563713  None   
4  745308824731529216  None   

                                                text  \
0  RT @gupta_james: Lily Allen called me and 30 m...   
1  RT @Fight4UK: #Brexit ~ #GO ~ #LeaveEU ~ #Vote...   
2  RT @me4ukip: Another reason to #VoteLeave http...   
3  RT @ohchrisburton: Butterflies only live for t...   
4  RT @Jade_S97: Is there an award for the most v...   

                              entities-hashtags          created_at  \
0                                     VoteLeave 2016-06-21 17:34:04   
1  Brexit, GO, LeaveEU, VoteLeave, BetterOffOut 2016-06-21 17:34:04   
2                                     VoteLeave 2016-06-21 17:34:05   
3                                               2016-06-21 17:34:05   
4                        VoteLeave, TakeControl 2016-06-21 17:34:05   

      retweeted_status-created_at retweeted_statu