# Reading the JSON Files & Data Mining

The tweets were saved in 73 JSON files with sizes ranging from 100 MB to ~19 GB. All the files were split into pieces of 100,000 lines (~700 MB) so that they could be easily processed on my PC.
I process the JSON files individually and save the resulting dataframes in CSV format. These CSV files are significantly smaller (~3-8 MB) and will be concatenated later.

I have categorized the tweets into tweets and retweets and saved them in separate CSV files. Retweet dataframes contain information about both the retweet and the original tweet. The final tweet CSV file is ~570 MB and contains ~1.7 million tweets. The final retweet CSV file is ~1.4 GB and contains ~2.9 million retweets.


In [None]:
import simplejson as json
import pandas as pd
from glob import glob
import numpy as np


# helper: pull the screen names out of a user_mentions list of dictionaries
def userextract(x):
    """Return the 'screen_name' of every mention dict in *x* ([] when *x* is empty)."""
    if len(x) > 0:
        return [mention['screen_name'] for mention in x]
    return []


# helper: pull the hashtag texts out of a hashtags list of dictionaries
def hashextract(x):
    """Return the 'text' of every hashtag dict in *x* ([] when *x* is empty)."""
    if len(x) > 0:
        return [tag['text'] for tag in x]
    return []

# collect the extended flag, user mentions, hashtags and text of a single tweet
def multi_functions(tweet):
    """Return ``[extended, user_mentions, hashtags, text]`` for one tweet dict.

    When the tweet carries an 'extended_tweet' entry, the entities and the
    'full_text' are read from it and ``extended`` is True; otherwise the
    top-level 'entities' and 'text' are used and ``extended`` is False.
    Mentions and hashtags are reduced to plain lists of strings by
    ``userextract`` and ``hashextract``.
    """
    extended = 'extended_tweet' in tweet
    payload = tweet['extended_tweet'] if extended else tweet
    text_key = 'full_text' if extended else 'text'

    entities = payload['entities']
    mentions = userextract(entities['user_mentions'])
    tags = hashextract(entities['hashtags'])
    return [extended, mentions, tags, payload[text_key]]

# using glob to read the file names; sorted because glob returns names in
# arbitrary order, which would make the chunk numbering differ between runs
filenames = sorted(glob('data/blackfriday*.json'))
# or your own list
#filenames = ['blackfriday46_0.json']

# table that keeps track of the recorded and de-duplicated tweet/retweet counts,
# one row per input file (sized from the file list instead of a fixed 1000 rows,
# so it neither overflows with many files nor carries unused empty rows)
tweet_counter = pd.DataFrame(index=np.arange(len(filenames)),
                             columns=['filename', 'recorded_tweets', 'recorded_retweets', 'tweet_counts', 'retweet_count'])

In [None]:
# Column order of the per-file tweet chunks (output/tweets<i>)
_TWEET_COLUMNS = [
    'id', 'created_at', 'language', 'coordinates',
    'extended', 'user_mentions', 'hashtags', 'full_text',
    'user_name', 'verified',
    'user_followers', 'user_friends', 'user_favourites_count',
    'user_listed_count', 'user_statuses_count', 'user_created_at',
    'retweet',
]

# Column order of the per-file retweet chunks (output/retweets<i>)
_RETWEET_COLUMNS = [
    'id', 'created_at', 'language', 'coordinates',
    'extended', 'user_mentions', 'hashtags', 'full_text',
    'user_name',
    'user_followers', 'user_friends', 'user_favourites_count',
    'user_listed_count', 'user_statuses_count', 'user_created_at',
    'retweet',
    'orig_id', 'orig_created_at', 'orig_user', 'verified',
    'orig_user_followers', 'orig_user_friends', 'orig_user_favourites',
    'orig_user_listed', 'orig_user_statuses', 'orig_user_created_at',
]


def _read_tweets(filename):
    """Parse one line-delimited JSON file and return ``(tweets, retweets)``.

    Each line is decoded on its own. Lines that are not valid JSON and records
    without a 'text' key (error lines) are skipped. A record that has a
    'retweeted_status' key goes to the retweets list, every other record to the
    tweets list.
    """
    tweets_data = []
    retweets_data = []
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(filename, "r", encoding="utf-8") as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
            except ValueError:
                # JSON decode errors are ValueError subclasses; anything else
                # (e.g. KeyboardInterrupt) is deliberately not swallowed
                continue
            # a line may decode to something that is not an object at all
            if not isinstance(tweet, dict) or 'text' not in tweet:
                continue
            if 'retweeted_status' in tweet:
                retweets_data.append(tweet)
            else:
                tweets_data.append(tweet)
    return tweets_data, retweets_data


def _shared_columns(tweet, content):
    """Return the leading columns common to tweet and retweet rows.

    *tweet* supplies id, timestamps, language, coordinates and the posting
    user's screen name; *content* is the status handed to ``multi_functions``
    for the extended flag, mentions, hashtags and text.
    """
    extended, user_mentions, hashtags, full_text = multi_functions(content)
    return {
        'id': tweet['id'],
        'created_at': tweet['created_at'],
        'language': tweet['lang'],
        'coordinates': tweet['coordinates'],
        'extended': extended,
        'user_mentions': user_mentions,
        'hashtags': hashtags,
        # the text is stored wrapped in a one-element list (existing CSV format)
        'full_text': [full_text],
        'user_name': tweet['user']['screen_name'],
    }


def _author_stats(user):
    """Return the account statistics columns of the posting *user*."""
    return {
        'user_followers': user['followers_count'],
        'user_friends': user['friends_count'],
        'user_favourites_count': user['favourites_count'],
        'user_listed_count': user['listed_count'],
        'user_statuses_count': user['statuses_count'],
        'user_created_at': user['created_at'],
    }


def _tweet_row(tweet):
    """Return one row (column name -> value) of the tweets dataframe."""
    user = tweet['user']
    row = _shared_columns(tweet, tweet)
    row['verified'] = user['verified'] is True
    row.update(_author_stats(user))
    row['retweet'] = False
    return row


def _retweet_row(tweet):
    """Return one row of the retweets dataframe.

    Text, mentions and hashtags are taken from the original status
    ('retweeted_status'); 'verified' refers to the original status' user.
    """
    original = tweet['retweeted_status']
    orig_user = original['user']
    row = _shared_columns(tweet, original)
    row.update(_author_stats(tweet['user']))
    row.update({
        'retweet': True,
        'orig_id': original['id'],
        'orig_created_at': original['created_at'],
        'orig_user': orig_user['screen_name'],
        'verified': orig_user['verified'] is True,
        'orig_user_followers': orig_user['followers_count'],
        'orig_user_friends': orig_user['friends_count'],
        'orig_user_favourites': orig_user['favourites_count'],
        'orig_user_listed': orig_user['listed_count'],
        'orig_user_statuses': orig_user['statuses_count'],
        'orig_user_created_at': orig_user['created_at'],
    })
    return row


def _process_file(filename, chunk_number):
    """Convert one JSON file into a tweets chunk and a retweets chunk on disk.

    Writes 'output/tweets<chunk_number>' and 'output/retweets<chunk_number>'
    (CSV, without the index) after dropping rows with a duplicate 'id'.

    Returns ``(recorded_tweets, recorded_retweets, kept_tweets, kept_retweets)``:
    the record counts before and after de-duplication. Working inside a
    function lets the parsed records and dataframes be released on return,
    before the next file is loaded.
    """
    tweets_data, retweets_data = _read_tweets(filename)

    # building a tweets dataframe & saving it in csv format
    tweets = pd.DataFrame([_tweet_row(tweet) for tweet in tweets_data],
                          columns=_TWEET_COLUMNS)
    tweets = tweets.drop_duplicates('id')
    tweets.to_csv('output/tweets' + str(chunk_number), index=False)

    # building a retweets dataframe & saving it in csv format
    retweets = pd.DataFrame([_retweet_row(tweet) for tweet in retweets_data],
                            columns=_RETWEET_COLUMNS)
    retweets = retweets.drop_duplicates('id')
    retweets.to_csv('output/retweets' + str(chunk_number), index=False)

    return len(tweets_data), len(retweets_data), len(tweets), len(retweets)


# reading the json files one by one; every file yields one tweets chunk, one
# retweets chunk and one row of tweet_counter
for i, filename in enumerate(filenames):
    recorded_tweets, recorded_retweets, kept_tweets, kept_retweets = _process_file(filename, i)

    tweet_counter.iloc[i, 0] = filename
    tweet_counter.iloc[i, 1] = recorded_tweets
    tweet_counter.iloc[i, 2] = recorded_retweets
    tweet_counter.iloc[i, 3] = kept_tweets
    tweet_counter.iloc[i, 4] = kept_retweets

# saving the tweet counter data
tweet_counter.to_csv('tweet_counter.csv')


# Concatenation
*This section can be run independently of the section above.*

In [None]:
import pandas as pd
from glob import glob

In [None]:
# Collect the per-file tweet chunks. glob returns names in arbitrary order, so
# sort them: drop_duplicates keeps the first occurrence of an id, and a stable
# order makes the kept row reproducible between runs.
filenames_tweets = sorted(glob('output/tweets*'))

if not filenames_tweets:
    # pd.concat would fail on an empty list with a less helpful message
    raise ValueError("no tweet chunk files match 'output/tweets*'; "
                     "run the extraction section first")

list_tweets = [pd.read_csv(filename) for filename in filenames_tweets]

df_tweets = pd.concat(list_tweets, axis=0)
df_tweets = df_tweets.drop_duplicates('id')

# Drop the references to the individual chunks now that they are merged
list_tweets = []
df_tweets.to_csv('df_tweets.csv', index=False)

In [1]:
# Preview the first rows of the concatenated tweets (the frame is named df_tweets)
df_tweets.head()

Unnamed: 0,id,created_at,language,coordinates,extended,user_mentions,hashtags,full_text,user_name,verified,user_followers,user_friends,user_favourites_count,user_listed_count,user_statuses_count,user_created_at,retweet
0,932568009427816449,Mon Nov 20 11:15:14 +0000 2017,es,,True,['Pimkie_ES'],"['descuento', 'BlackFriday', 'ccrosaleda']","['Los días 23, 24 y 25 de Noviembre disfruta d...",ccrosaleda,False,11108,10546,1346,77,9373,Wed Nov 16 09:34:33 +0000 2011,False
1,932568012443602945,Mon Nov 20 11:15:15 +0000 2017,es,,True,[],['BlackFriday2017'],['CHOLLO #BlackFriday2017 🖤 FIFA 18 Edición Es...,soydechollos,False,6001,48,539,145,12642,Thu Apr 23 18:58:21 +0000 2015,False
2,932568026246975488,Mon Nov 20 11:15:18 +0000 2017,en,,False,[],"['AMAZON', 'DEALS', 'Christmas', 'holiday', 't...",['HURRY #AMAZON LIGHTNING #DEALS LIVE &gt; htt...,CouponsFreebie,False,54691,11508,101,667,42325,Fri Oct 24 03:28:08 +0000 2008,False
3,932568026494283776,Mon Nov 20 11:15:18 +0000 2017,en,,True,['blackfriday'],"['BlackFriday', 'CORSETS', 'dress', 'fashion']",['Black Friday Sale- 55% Off\nNayla Brocade Ov...,corsetsqueen,False,216,1592,0,1,1133,Tue Sep 27 00:55:29 +0000 2011,False
4,932568027010404354,Mon Nov 20 11:15:18 +0000 2017,en,,True,[],"['ghd', 'Christmas', 'hair', 'BlackFriday', 'B...",['Black Friday Deals\nSave £20 on the ghd IV S...,TerencePaulShop,False,6929,594,256,30,4721,Mon Nov 19 07:39:50 +0000 2012,False


In [None]:
# Free up the memory if you need to: rebinding df_tweets to an empty frame drops
# this name's reference to the concatenated tweets, which were already written
# to df_tweets.csv
df_tweets = pd.DataFrame()

In [None]:
# Collect the per-file retweet chunks. glob returns names in arbitrary order, so
# sort them: drop_duplicates keeps the first occurrence of an id, and a stable
# order makes the kept row reproducible between runs.
filenames_retweets = sorted(glob('output/retweets*'))

if not filenames_retweets:
    # pd.concat would fail on an empty list with a less helpful message
    raise ValueError("no retweet chunk files match 'output/retweets*'; "
                     "run the extraction section first")

list_retweets = [pd.read_csv(filename) for filename in filenames_retweets]

df_retweets = pd.concat(list_retweets, axis=0)
df_retweets = df_retweets.drop_duplicates('id')

# Drop the references to the individual chunks now that they are merged
list_retweets = []
df_retweets.to_csv('df_retweets.csv', index=False)

In [2]:
# Preview the first rows of the concatenated retweets
df_retweets.head()

Unnamed: 0,id,created_at,language,coordinates,extended,user_mentions,hashtags,full_text,user_name,verified,...,retweet,orig_id,orig_created_at,orig_user,orig_user_followers,orig_user_friends,orig_user_favourites,orig_user_listed,orig_user_statuses,orig_user_created_at
0,932568005795614720,Mon Nov 20 11:15:13 +0000 2017,en,,True,[],"['BlackFriday2017', 'BlackFridayFeeling', 'Bla...",['Racism will never end while cheap and discou...,dalisufakude,False,...,True,932527666804256768,Mon Nov 20 08:34:55 +0000 2017,Amina_Diallo1,44,444,45,0,170,Mon Aug 15 13:31:28 +0000 2011
1,932568007938887680,Mon Nov 20 11:15:14 +0000 2017,es,,True,['DecathlonES'],['BlackFriday'],['Algunos listillos quieren aprovechar el #Bla...,retenex,True,...,True,932540839582162945,Mon Nov 20 09:27:16 +0000 2017,policia,3049485,0,25781,7875,22662,Wed Mar 11 17:02:34 +0000 2009
2,932568008790298630,Mon Nov 20 11:15:14 +0000 2017,fr,,True,"['cheissoux', 'franceinter']","['BlackFriday', 'obsolescence', 'GreenFriday']",['Stop au #BlackFriday &amp; à l\'#obsolescenc...,MathieuRama24,False,...,True,932564925699981312,Mon Nov 20 11:02:59 +0000 2017,Envie_org,1646,626,421,100,1574,Thu Jul 31 11:07:33 +0000 2014
3,932568013274058752,Mon Nov 20 11:15:15 +0000 2017,es,,True,[],"['BlackFriday', 'BlackFriday']","['Ah! pero espera, que esto MOLA MIL!! Sabes q...",bimbayorkshire,False,...,True,932565511086444544,Mon Nov 20 11:05:18 +0000 2017,JaponShop,19033,1932,8722,183,23708,Fri Oct 30 00:14:09 +0000 2009
4,932568013651480577,Mon Nov 20 11:15:15 +0000 2017,en,,True,[],"['BlackFriday2017', 'BlackDiamond', 'HGJG', 'L...",['On #BlackFriday2017 you could win #BlackDiam...,Hannytravels,False,...,True,932564465010331648,Mon Nov 20 11:01:09 +0000 2017,HGJG_VIP,238,135,12,0,127,Wed Apr 05 08:43:29 +0000 2017


In [None]:
# Free up the memory if you need to: rebinding df_retweets to an empty frame
# drops this name's reference to the concatenated retweets, which were already
# written to df_retweets.csv
df_retweets = pd.DataFrame()