In [1]:
import pyspark
from pyspark.sql import SQLContext

In [2]:
try:
    sqlContext = SQLContext(sc)
except:
    sc = pyspark.SparkContext('local[*]')
    sqlContext = SQLContext(sc)

# Choose Filename Here

In [3]:
filename = "file:///home/bdm/twitter.large"
#filename = "twitter.large.1"
dataset = sqlContext.read.json(filename)

In [4]:
from pyspark.sql.functions import *

# Convert short month name to number
month_names = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug", "Sep","Oct","Nov","Dec"]
month_numbers = []
for n in range(1,10):
    month_numbers.append ( "0" + str(n))
    
for n in range(10,13):
    month_numbers.append ( str(n))
    
month_dic = {key:value for key, value in zip(month_names, month_numbers)}

def toDate(stringAsDate): 
    weekday, month, day, time, tz, year = stringAsDate.split()
    return "-".join([year, month_dic.get(month, "NF"), day])

def interaction(tweet):
    return tweet

from pyspark.sql.types import StringType

# This was supposed to mean I could ccall direct from SQL
# Did not work for me
udfToDate = udf(toDate)
dataset.registerTempTable("tweets")

interaction_sql = "retweeted_status.favorite_count > 0 or retweeted_status.retweet_count > 0"

all_tweets = dataset.select( udfToDate(dataset.created_at).alias ("Date"),\
              ).groupBy("Date").count().registerTempTable("allTweets")

# Tried quotes, parentheses and column brackets but can't access count
# registerTempTable solved the problem
dataset.filter(interaction_sql)\
       .select( udfToDate(dataset.created_at).alias ("Date"),\
                dataset.retweeted_status.favorite_count.alias("Favourites"),\
                dataset.retweeted_status.retweet_count.alias("Retweets")\
              )\
       .groupBy("Date").agg({"Favourites": "count", "Retweets": "count"})\
        .withColumnRenamed("count(Favourites)", "Favourites")\
        .withColumnRenamed("count(Retweets)", "Retweets").registerTempTable("allInteractions")

Create two temp tables  
  
allTweets -> tweets grouped and counted by their date  
allInteractions -> similar to allTweets but has follower and retweet data

In [5]:
#print help(dataset.alias)

sqlContext.sql(
  """SELECT year(Date) AS Year, month(Date) AS Month, dayofmonth(Date) AS Day, count AS COUNT
     FROM allTweets
     """).registerTempTable("dailyTweets")


sqlContext.sql(
  """SELECT year(Date) AS Year, month(Date) AS Month, dayofmonth(Date) AS Day,
     Favourites , Retweets
     FROM allInteractions""").registerTempTable("dailyInteractions")

For allTweets and allInteractions splt the date into year, month, and day

*This does not work for dailyInteractions because I could not figure out how to access a column with a function name. In regular SQL you would use [] but that does not seem to work*

# In which month we collected the most interactions?

In [6]:
sqlContext.sql(
  """SELECT Year, Month, SUM(Favourites) + SUM(Retweets) AS Interactions
     FROM dailyInteractions
     Group By Year, Month
     Order By Interactions DESC
     """).show()



#count('count(Favourites)') AS Favourites, count('count(Retweets)') AS Retweets

+----+-----+------------+
|Year|Month|Interactions|
+----+-----+------------+
|2013|    7|         544|
|2013|    5|         304|
|2013|    6|         276|
|2013|    8|         260|
|2013|    3|         254|
|2013|    4|         236|
|2013|    9|         100|
|2013|    2|           2|
+----+-----+------------+



In [7]:
def query( interactions = False, Year=None, Month=None, Day=None ):
    # 8 combinations of inputs of which 3 are legal
    # 4 illegal involve year is None
    # Other is Year + Day
    if not year:
        raise valueError
        
    if not month and day:
        raise valueError
        
    items = zip(["Year","Month","Day"], [Year,Month,Day])
        
    params = ["{0}={1}".format(name,str(value)) for name, value in items if value]
    
    where = ""
    
    if params:
        where = "WHERE " + " AND ".join(params)
        
    table = "SUM(countsdailyTweets" 
    
    if not interactions:
        table = "SUM(COUNT) AS TotalTweets  FROM dailyTweets" 
    else:
        table= "SUM(Favourites) AS TotalFavs , SUM(Retweets) AS TotalRetweets FROM dailyInteractions"
    
    q = """
        SELECT {0}
        {1}
    """.format(table, where)
    
    print q
    sqlContext.sql(q).show()
        
query(Year = 2013 , Month = 3)

query(Year = 2013 , Month = 3, interactions = True)


        
    


        SELECT SUM(COUNT) AS TotalTweets  FROM dailyTweets
        WHERE Year=2013 AND Month=3
    
+-----------+
|TotalTweets|
+-----------+
|      15826|
+-----------+


        SELECT SUM(Favourites) AS TotalFavs , SUM(Retweets) AS TotalRetweets FROM dailyInteractions
        WHERE Year=2013 AND Month=3
    
+---------+-------------+
|TotalFavs|TotalRetweets|
+---------+-------------+
|      127|          127|
+---------+-------------+



• number of tweets per day, per month, per year  
• number of tweets with interactions (favourites or retweets) per day, per month,
per year

Build queries based on date and whether or not you want interactions

# Movie Analysis

In [8]:
# 1 favs and retweets
# 2 days of year
# 3 language

def all_film(tweet):
    name = tweet.entities.urls[0].expanded_url
    weekday, month, day, time, tz, year = tweet.created_at.split()
    return ((name,year,month_dic[month],day), 1)

def film(tweet):
    name = tweet.entities.urls[0].expanded_url
    retweet = tweet.retweet_count
    favs = tweet.favorite_count
    
    if not retweet:
        retweet = 0
        
    if not favs:
        favs = 0
    
    return (name, int(retweet) + int(favs))

def film_by_language(tweet):
    name = tweet.entities.urls[0].expanded_url
    language = tweet.lang
    
    return ((name, language), 1)


def reduce_tuple(tup, other):
    return tuple(a+b for a,b in zip(tup,other))

languages_by_film = dataset.map(film_by_language).countByKey()
all_films = dataset.map(all_film).countByKey()
film_engaement = dataset.map(film).countByKey()

languages_by_film -> just counts film tweets by language
all_films -> has follower and retweets but is not broken down by language

In [9]:
from collections import Counter

def print_items(items, string, limit = None):
    
    items = Counter({x[0][0]:x[1] for x in items})
    
    # I think if limit is None it should take all
    for index ,k,v in enumerate(items.most_common()[:limit]):
        print string.format(index), k, v
    

def films_by_year(date):
    y,m,d = date.split("-")
    filtered = filter(lambda x: x[0][1] == y, all_films.items())
    print_items(filtered, y)

def films_by_month(date):
    y,m,d = date.split("-")
    filtered = filter(lambda x: x[0][1] == y and x[0][2] == m, all_films.items())
    print_items(filtered, " ".join([y,m]))

def films_by_day(date):
    y,m,d = date.split("-")
    filtered = filter(lambda x: x[0][1] == y and x[0][2] == m and x[0][3] == d, all_films.items())
    print_items(filtered, " ".join([y,m,d]))
    
films_by_month("2013-03-08")
    

<type 'list'>
2013 03 http://www.imdb.com/title/tt1583421 34
2013 03 http://www.imdb.com/title/tt0903624 10
2013 03 http://www.imdb.com/title/tt2053463 9
2013 03 http://www.imdb.com/title/tt0443272 9
2013 03 http://www.imdb.com/title/tt2302755 9
2013 03 http://www.imdb.com/title/tt1024648 9
2013 03 http://www.imdb.com/title/tt1045658 9
2013 03 http://www.imdb.com/title/tt1623205 8
2013 03 http://www.imdb.com/title/tt0454876 7
2013 03 http://www.imdb.com/title/tt0434139 6
2013 03 http://www.imdb.com/title/tt1517260 6
2013 03 http://www.imdb.com/title/tt1904996 6
2013 03 http://www.imdb.com/title/tt1907668 6
2013 03 http://www.imdb.com/title/tt0089218 6
2013 03 http://www.imdb.com/title/tt0790628 5
2013 03 http://www.imdb.com/title/tt1564585 5
2013 03 http://www.imdb.com/title/tt1659337 5
2013 03 http://www.imdb.com/title/tt1041829 5
2013 03 http://www.imdb.com/title/tt1446192 5
2013 03 http://www.imdb.com/title/tt1931533 5
2013 03 http://www.imdb.com/title/tt2024432 5
2013 03 http://www

• number of tweets about every single movie, per day and per month
Queries to get films on a given day, month, or year

In [None]:
print_items(all_films.items, "{0} Most Popular Film", limit = 20)

In [10]:
for k,v in sorted(film_engaement.items(), lambda x,y: y[1] - x[1])[:5]:
    print k,v

http://www.imdb.com/title/tt0770828 1738
http://www.imdb.com/title/tt1300854 1738
http://www.imdb.com/title/tt1408101 1324
http://www.imdb.com/title/tt0816711 1256
http://www.imdb.com/title/tt1483013 1248


• amount of engagement a movie receives (favourites or retweets)  
5 films with the most interactions

In [11]:
sorted(filter(lambda x: x[0][1] == 'es', languages_by_film.items()), lambda x,y: y[1]-x[1])[0][0][0]

u'http://www.imdb.com/title/tt0770828'

The most popular spanish movie  
What is the most popular movie in the group of Spanish-speaking users (check for
es in lang field)?

# User Analysis

In [12]:
dataset.select(dataset.user.name.alias("Username"), dataset.user.followers_count.alias("Followers"), \
                    dataset.user.favourites_count.alias("Favourites"),\
                   dataset.user.statuses_count.alias("Statuses"),\
                   udfToDate(dataset.created_at).alias("Date")).registerTempTable("users")

In [13]:
sqlContext.sql("""select * from users""").show()

+-------------------+---------+----------+--------+----------+
|           Username|Followers|Favourites|Statuses|      Date|
+-------------------+---------+----------+--------+----------+
|มิสเตอร์บลูขอบายส์♧|      114|       679|   47133|2013-02-28|
|        Wade Carter|      151|       121|    9281|2013-02-28|
|Miroslav Mikuljanac|      201|       333|    1410|2013-02-28|
|        Zoltan Mora|       81|        23|     853|2013-02-28|
|        Zoltan Mora|       81|        23|     853|2013-02-28|
|          Jan Mares|       75|         0|     655|2013-02-28|
|        Zoltan Mora|       81|        23|     853|2013-02-28|
|     Bader Aljarbou|      137|        76|    8231|2013-02-28|
|      Paul W. Pasia|      215|        15|    1091|2013-02-28|
|             le MAK|      130|       135|   22372|2013-02-28|
|             le MAK|      130|       135|   22372|2013-02-28|
|             le MAK|      130|       135|   22372|2013-02-28|
|Jon Robert Arthur ❤|      843|       523|   23235|2013

number of followers, favourites, statuses, and listings of users

In [14]:
sqlContext.sql("""select u.*
from users u
Join
(
select  t.Username, max(Followers) AS maximum
from   users t
group by t.Username
order by maximum desc limit 1
)   p
On u.Username = p.Username
Order By u.Date DESC
Limit 1""").show()

+-----------+---------+----------+--------+----------+
|   Username|Followers|Favourites|Statuses|      Date|
+-----------+---------+----------+--------+----------+
|Ryan Butler|  1605806|        79|    7588|2013-07-20|
+-----------+---------+----------+--------+----------+



Latest update of the user with the most followers  
What are the most recent stats of a user with the most followers?