In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

import json
import os

In [None]:
pd.__version__

In [None]:
def load_tweets(path):
    """Open the file at ``path`` for reading and return its decoded JSON content."""
    with open(path, "r") as tweet_file:
        return json.load(tweet_file)

Grab Invoca's tweets

In [None]:
# Decode the downloaded tweets from disk.
tweets = load_tweets('tweetDemo.json')
# A parenthesised, single-argument print runs on both Python 2 and Python 3
# and emits the same text the old print statement did.
print("Number of tweets downloaded: %d" % len(tweets))

Tweets arrive as a list of nested dictionaries, i.e. decoded JSON. The nesting is great for attaching context to the data (metadata).

In [None]:
tweets[0]

In [None]:
tweets[0].keys()

In [None]:
# Print the text of the first five tweets. Slicing (rather than indexing
# 0..4) avoids an IndexError when fewer than five tweets were loaded.
for tweet in tweets[:5]:
    print(tweet['full_text'] + '\n')

In [None]:
tweets[0]['entities']

In [None]:
tweets[0]['user']

In [None]:
df = pd.read_json('tweetDemo.json')
df.head(5)

In [None]:
df.info()

In [None]:
overview = df.describe()
overview

In [None]:
df.columns

In [None]:
df.head(1)

In [None]:
df['contributors'].isna().head()

In [None]:
# Largest share of missing values a column may have and still be kept.
nanThreshhold = 0.5

# Fraction of missing entries per column, computed for all columns at once.
missingShare = df.isna().mean()
columnstoKeep = missingShare[missingShare < nanThreshhold].index.tolist()
columnstoKeep

To make code predictable, make a copy of your dataframe if you're going to work on a subset of it.

In [None]:
# Select the columns first and copy afterwards: only the kept subset is
# duplicated, and truncatedDf is an independent frame, so assigning to it
# later cannot write through to df.
truncatedDf = df.loc[:, columnstoKeep].copy()
print(truncatedDf.shape)

truncatedDf.head()

In [None]:
primaryKeys = ['id', 'id_str']

# If every key value occurs exactly once, the mean of the value counts is 1 ...
for keyColumn in primaryKeys:
    assert truncatedDf[keyColumn].value_counts().mean() == 1

# ... or, simply, ask pandas directly.
for keyColumn in primaryKeys:
    assert truncatedDf[keyColumn].is_unique

# If these assert statements get triggered, use df.drop_duplicates() on the primary key columns.

In [None]:
truncatedDf['source'].value_counts()

Use https://pythex.org/ to experiment with regex patterns.

Pandas Series string methods: https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html

In [None]:
# Greedy capture of the text sitting between a '>' and the last '<' that follows it.
sourcePattern = r'>(.*)<'

# expand=False returns the single capture group as a Series instead of a DataFrame.
sourceColumn = truncatedDf['source']
cleanedTag = sourceColumn.str.extract(sourcePattern, expand=False)

cleanedTag.head()

Commentary on pandas' SettingWithCopyWarning: https://www.dataquest.io/blog/settingwithcopywarning/
See also the primary documentation at: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

If you're doing any transformation of the dataframe at all,
- use loc[row label, column] when setting values
- work on copies of your df if you're using a subset.


<img src = 'setCopyWarning.png' >

In [None]:
# Don't do this: truncatedDf['parsedSource'] = cleanedTag

# NOTE: this cell reads the 'source' column and then drops it, so running the
# cell a second time fails until truncatedDf is rebuilt.
parsedSource = truncatedDf.loc[:, 'source'].str.extract(sourcePattern, expand=False)
truncatedDf.loc[:, 'parsedSource'] = parsedSource

# Columns that are no longer needed once the source tag has been parsed.
columnsToDrop = [
    'source', 'id', 'id_str', 'lang',
    'possibly_sensitive', 'is_quote_status',
    'display_text_range', 'truncated', 'retweeted',
]
truncatedDf = truncatedDf.drop(columns=columnsToDrop)
truncatedDf.head()

In [None]:
truncatedDf['parsedSource'].value_counts()

In [None]:
%%timeit
# benefits of query are only pronounced at > 200k rows
truncatedDf.query("parsedSource == 'Twitter Web Client' ")

In [None]:
%%timeit
# this is fine for looking through data or doing calculations
truncatedDf[truncatedDf['parsedSource'] == 'Twitter Web Client']

In [None]:
%%timeit
# this is the safest way to filter for production
truncatedDf.loc[truncatedDf['parsedSource'] == 'Twitter Web Client']

In [None]:
%%timeit
truncatedDf.loc[[src == 'Twitter Web Client' for src in truncatedDf['parsedSource'] ]]

We expect 261 True values when we select for iPhone or Android, since there are 241 + 20 of these values in the value_counts call above.

In [None]:
# Boolean mask: True for rows whose parsed source is one of the two phone clients.
phoneSources = ['Twitter for iPhone', 'Twitter for Android']
phoneOnly = truncatedDf['parsedSource'].isin(phoneSources)
assert phoneOnly.sum() == 261

In [None]:
# Filter first and copy afterwards: only the phone rows are duplicated, and
# `phones` is an independent frame that new columns can safely be added to.
phones = truncatedDf.loc[phoneOnly].copy()
phones.head()

In [None]:
phones = phones.reset_index(drop=True)
phones.head()

Accessing data: loc and iloc only give the same result when the index is 0, 1, 2, ...

Otherwise loc selects rows by their index label, while iloc selects rows by position (always 0, 1, 2, ...).

In [None]:
phones.loc[[1,3,4]]

In [None]:
phones.iloc[[1,3,4]]

In [None]:
# Edges 0..24 give one unit-wide bin per hour of the day.
hourEdges = np.arange(25)

plt.hist(phones.loc[:, 'created_at'].dt.hour, bins=hourEdges)
plt.xticks(hourEdges)
# Label the axes so the figure can be read on its own; the hours come straight
# from created_at, before any time-zone correction.
plt.xlabel('Hour of day (uncorrected created_at)')
plt.ylabel('Number of tweets')
plt.title('Invoca Tweets from Cell Phones by Hour');

This plot doesn't make sense -- we need to convert the datetime objects to our time zone (the Twitter API defaults to UTC -- thanks Google!)

For a review of time series methodology in Pandas, see: https://www.kaggle.com/thebrownviking20/everything-you-can-do-with-a-time-series

In [None]:
# pd.to_datetime(..., utc=True) yields UTC-aware timestamps whether created_at
# is tz-naive or already tz-aware. The earlier astype('datetime64[ns]') +
# tz_localize('UTC') chain only handled the tz-naive case.
createdUtc = pd.to_datetime(phones.loc[:, 'created_at'], utc=True)

# Shift from UTC to US Pacific local time.
phones.loc[:, 'correctedTime'] = createdUtc.dt.tz_convert('PST8PDT')

# Split the local timestamp into the calendar parts used by the plots and
# group-bys further down.
for part in ['hour', 'month', 'day', 'minute']:
    phones.loc[:, part] = getattr(phones.loc[:, 'correctedTime'].dt, part)

In [None]:
# Explicit figure/axes objects instead of the implicit pyplot current figure.
dayFig, dayAx = plt.subplots(figsize=(8, 6))
sns.countplot(x='day', hue='parsedSource', data=phones, ax=dayAx)
dayAx.set_title('Invoca Tweets from Cell Phones by Day of Month');

see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html

In [None]:
pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')

What kind of metadata accompanies these counts?

In [None]:
phones['entities'][10]

We can extract the names of mentioned people and groups by looping over our JSON like any other python dictionary

In [None]:
def extractMentions(entities, remove=('invoca',)):
    """Return the lower-cased display names mentioned in a tweet.

    Parameters
    ----------
    entities : dict
        A tweet's ``entities`` mapping. Its ``'user_mentions'`` entry must be
        a sequence of dicts that each carry a ``'name'`` key.
    remove : iterable of str, or str, optional
        Names to leave out of the result. Matching ignores case. A bare
        string is treated as a single name.

    Returns
    -------
    list of str
        Lower-cased names in their original order, duplicates included.

    Raises
    ------
    KeyError
        If ``entities`` has no ``'user_mentions'`` key or a mention has no
        ``'name'`` key.
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(remove, (str, type(u''))):
        remove = [remove]

    # Names are lower-cased before the comparison, so the exclusions must be
    # lower-cased too; otherwise an entry such as 'Invoca' could never match.
    excluded = [name.lower() for name in remove]

    lowered = (mention['name'].lower() for mention in entities['user_mentions'])
    return [name for name in lowered if name not in excluded]

In [None]:
extractMentions(phones['entities'][10])

In [None]:
phones.loc[:, 'mentions'] = phones.loc[:, 'entities'].apply(extractMentions)
phones.head(2)

In [None]:
# Flatten the per-tweet lists into one big list. The comprehension visits each
# mention once and yields [] when there are no rows, whereas summing a Series
# of lists builds a new, longer list at every addition.
allMentions = [name for tweetMentions in phones['mentions'] for name in tweetMentions]
print(phones['mentions'][:3])
print('')
print(allMentions[:5])

# casting the big list as a series allows us to use pandas functionality
pdMentions = pd.Series(allMentions)
mentionCounts = pdMentions.value_counts()

# Keep only the names mentioned more than twice.
filteredMentions = mentionCounts[mentionCounts > 2]
filteredMentions

In [None]:
# Explicit figure/axes objects instead of the implicit pyplot current figure.
mentionFig, mentionAx = plt.subplots(figsize=(15, 10))

sns.barplot(x=filteredMentions.index, y=filteredMentions, ax=mentionAx)
mentionAx.set_title('Twitter Mentions for Posts Made from Cell Phones')
# Tilt the tick labels so the names stay legible.
plt.xticks(rotation = 45);

Cool. How do we examine the inherent strata within our dataframe? groupby is a great place to start.

(documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html)

In [None]:
phones.groupby(['month']).first()

In [None]:
# Seeded generator so the jittered counts -- and the corr / cov / heatmap
# cells that reuse `lengths` -- come out the same on every run.
jitterRng = np.random.RandomState(20)

# Per month and per column: the group size plus Gaussian noise (scale 20).
lengths = phones.groupby(['month']).agg(lambda x: len(x) + jitterRng.normal(scale=20))
lengths

In [None]:
phones.groupby('hour')['minute'].mean().plot(kind='bar')

In [None]:
# Non-null counts per (month, day), restricted to months 2 and 4, all days,
# all columns. pd.IndexSlice states explicitly which selectors address the
# two-level row index and which address the columns.
phones.groupby(['month', 'day']).count().loc[pd.IndexSlice[[2, 4], :], :]

In [None]:
phones.groupby(['month', 'day']).count().loc[(2, 6)]

In [None]:
# Per month, compute the mean, standard deviation and sum of squares of each column.
# NOTE(review): phones also holds non-numeric columns (e.g. 'entities', 'mentions');
# how these aggregations treat them presumably depends on the pandas version -- confirm.
phones.groupby(['month']).agg([np.mean, np.std,lambda x: np.sum(x**2)])

In [None]:
phones.T

In [None]:
# Non-null counts per (month, day) for months 2 and 4, transposed so the
# (month, day) pairs become columns. pd.IndexSlice states explicitly which
# selectors address the two-level row index and which address the columns.
usefulDF = phones.groupby(['month', 'day']).count().loc[pd.IndexSlice[[2, 4], :], :].T
usefulDF

In [None]:
usefulDF.to_pickle('savedData.pickle')

In [None]:
!ls

In [None]:
# Load 'savedData.pickle' back into a pandas object and display it.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
pd.read_pickle('savedData.pickle')

In [None]:
lengths

In [None]:
lengths.corr()

In [None]:
lengths.cov()

In [None]:
sns.heatmap(lengths.corr())

Thanks to https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html
for profiling sample code.

"conda install line_profiler" in your environment to utilize the code profiler

In [None]:
%load_ext line_profiler

In [None]:
# Hand-built `entities` payload with one URL and two user mentions, used as
# input for the profiler cells.
# Integer literals carry no trailing 'L': that suffix is a syntax error on
# Python 3, and Python 2 promotes large integers to long on its own.
testMention = {
    u'hashtags': [],
    u'symbols': [],
    u'urls': [
        {
            u'display_url': u'mtech.today/2DsAqjr',
            u'expanded_url': u'https://mtech.today/2DsAqjr',
            u'indices': [97, 120],
            u'url': u'https://t.co/rwauvWY2gy',
        },
    ],
    u'user_mentions': [
        {
            u'id': 747815998531768324,
            u'id_str': u'747815998531768324',
            u'indices': [3, 17],
            u'name': u'MarTech Today',
            u'screen_name': u'martech_today',
        },
        {
            u'id': 1067019368,
            u'id_str': u'1067019368',
            u'indices': [83, 96],
            u'name': u'Barry Levine',
            u'screen_name': u'xBarryLevine',
        },
    ],
}

In [None]:
%lprun -f extractMentions extractMentions(testMention)

"conda install memory_profiler" in your environment to analyze memory usage of imported scripts

In [None]:
%load_ext memory_profiler

In [None]:
%%file extractMentions.py
def extractMentionsScript(entities, remove=('invoca',)):
    """Return the lower-cased display names mentioned in a tweet.

    Parameters
    ----------
    entities : dict
        A tweet's ``entities`` mapping. Its ``'user_mentions'`` entry must be
        a sequence of dicts that each carry a ``'name'`` key.
    remove : iterable of str, or str, optional
        Names to leave out of the result. Matching ignores case. A bare
        string is treated as a single name.

    Returns
    -------
    list of str
        Lower-cased names in their original order, duplicates included.

    Raises
    ------
    KeyError
        If ``entities`` has no ``'user_mentions'`` key or a mention has no
        ``'name'`` key.
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(remove, (str, type(u''))):
        remove = [remove]

    # Names are lower-cased before the comparison, so the exclusions must be
    # lower-cased too; otherwise an entry such as 'Invoca' could never match.
    excluded = [name.lower() for name in remove]

    lowered = (mention['name'].lower() for mention in entities['user_mentions'])
    return [name for name in lowered if name not in excluded]

In [None]:
from extractMentions import extractMentionsScript

In [None]:
%mprun -f extractMentionsScript extractMentionsScript(testMention)