# Pull S3

In [None]:
# One-time setup, kept commented out: create the local target folders for the
# raw CSV dumps (shell commands, not Python).
# mkdir ../data/tweets/ethereum/csv
# mkdir ../data/tweets/bitcoin/csv

In [None]:
# One-time download, kept commented out: copy the ethereum prefix of the S3
# bucket into the local CSV folder using the "personal" AWS CLI profile.
# ! aws s3 cp s3://jeroens-bucket/ethereum ../data/tweets/ethereum/csv --recursive --profile personal 

In [None]:
# One-time download, kept commented out: copy the bitcoin prefix of the S3
# bucket into the local CSV folder using the "personal" AWS CLI profile.
# ! aws s3 cp s3://jeroens-bucket/bitcoin ../data/tweets/bitcoin/csv --recursive --profile personal 

# Spark it up!

In [1]:
import os

from pyspark.sql import SparkSession

# `spark.local.dir` is handed to the JVM as-is, and the JVM does not expand
# "~" the way a shell does. Expand it here so Spark's scratch files go to the
# user's home directory rather than a literal "~" folder created under the
# current working directory.
spark_tmp_dir = os.path.expanduser('~/.spark_tmp/')

# Local session with 8 worker threads, used for the CSV -> Parquet conversion.
# NOTE(review): with a local[...] master the executors presumably run inside
# the driver JVM, so `spark.driver.memory` is likely the setting that matters;
# `spark.executor.memory` is kept unchanged -- confirm whether it is needed.
spark = (
    SparkSession.builder
    .master("local[8]")
    .config('spark.executor.memory', '5g')
    .config('spark.driver.memory', '5g')
    .appName("Tweet wrangeling")
    .config('spark.local.dir', spark_tmp_dir)
    .getOrCreate()
)

In [2]:
# Show the session object as the cell output to confirm it was created.
spark

# CSV to Parquet

## Eth Tweets

In [3]:
# Input folders holding the raw Ethereum tweet CSV dumps. The folder names
# look like DD-MM-YYYY dates -- presumably the day each dump was collected.
csv_eth_path_1, csv_eth_path_2, csv_eth_path_3 = (
    "../data/tweets/ethereum/csv/16-06-2018/",
    "../data/tweets/ethereum/csv/17-06-2018/",
    "../data/tweets/ethereum/csv/19-06-2018/",
)

# Output folder for the combined, typed Ethereum tweets.
parquet_eth_path = "../data/tweets/ethereum/parquet/"

In [4]:
# Load each Ethereum dump folder as its own DataFrame. The files are
# ";"-separated and their first line supplies the column names; no schema is
# inferred, so every column is read as a string.
eth_raw_1, eth_raw_2, eth_raw_3 = (
    spark.read.csv(dump_dir, sep=";", header=True)
    for dump_dir in (csv_eth_path_1, csv_eth_path_2, csv_eth_path_3)
)

In [6]:
# Stack the three dumps into one frame and drop rows that are exact
# duplicates of each other.
# Note: DataFrame.union pairs columns by position, not by name, so all
# inputs must share the same column order.
eth_raw = (
    eth_raw_1
    .union(eth_raw_2)
    .union(eth_raw_3)
    .distinct()
)

In [7]:
from pyspark.sql.functions import col, to_timestamp

In [8]:
# Projection for the typed Ethereum frame: parse `date` into a timestamp
# (renamed to `datetime`), cast the two counters to integers, and pass the
# remaining columns through unchanged.
eth_projection = [
    "username",
    to_timestamp("date").alias("datetime"),
    "text",
    col("retweets").cast("INT"),
    col("favorites").cast("INT"),
    "geo",
    "mentions",
    "hashtags",
    "id",
    "permalink",
]
eth = eth_raw.select(*eth_projection)

In [9]:
# Print the column names and types of the raw frame, before any casting.
eth_raw.printSchema()

root
 |-- username: string (nullable = true)
 |-- date: string (nullable = true)
 |-- retweets: string (nullable = true)
 |-- favorites: string (nullable = true)
 |-- text: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)



In [10]:
# Print the schema after the select, to check the timestamp and integer casts.
eth.printSchema()

root
 |-- username: string (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- retweets: integer (nullable = true)
 |-- favorites: integer (nullable = true)
 |-- geo: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)



In [11]:
# Persist the typed Ethereum tweets as Parquet, replacing whatever is
# already stored at the target path.
eth.write.parquet(parquet_eth_path, mode="overwrite")

## BTC Tweets

In [5]:
# Input folders holding the raw Bitcoin tweet CSV dumps. The folder names
# look like DD-MM-YYYY dates -- presumably the day each dump was collected.
csv_btc_path_1, csv_btc_path_2, csv_btc_path_3, csv_btc_path_4 = (
    "../data/tweets/bitcoin/csv/16-06-2018/",
    "../data/tweets/bitcoin/csv/18-06-2018/",
    "../data/tweets/bitcoin/csv/20-06-2018/",
    "../data/tweets/bitcoin/csv/22-06-2018/",
)

# Output folder for the combined, typed Bitcoin tweets.
# NOTE(review): unlike the Ethereum target, this path ends in a dated
# sub-folder although four dumps are defined above -- confirm this is
# intended before changing it, since other readers may depend on it.
parquet_btc_path = "../data/tweets/bitcoin/parquet/16-06-2018/"


In [6]:
# Load each Bitcoin dump folder as its own DataFrame. The files are
# ";"-separated and their first line supplies the column names; no schema is
# inferred, so every column is read as a string.
btc_raw_1, btc_raw_2, btc_raw_3, btc_raw_4 = (
    spark.read.csv(dump_dir, sep=";", header=True)
    for dump_dir in (
        csv_btc_path_1,
        csv_btc_path_2,
        csv_btc_path_3,
        csv_btc_path_4,
    )
)

In [7]:
# Stack the four dumps into one frame and drop rows that are exact
# duplicates of each other.
# Note: DataFrame.union pairs columns by position, not by name, so all
# inputs must share the same column order.
btc_raw = (
    btc_raw_1
    .union(btc_raw_2)
    .union(btc_raw_3)
    .union(btc_raw_4)
    .distinct()
)

In [8]:
from pyspark.sql.functions import col, to_timestamp

In [9]:
# Projection for the typed Bitcoin frame: parse `date` into a timestamp
# (renamed to `datetime`), cast the two counters to integers, and pass the
# remaining columns through unchanged.
btc_projection = [
    "username",
    to_timestamp("date").alias("datetime"),
    "text",
    col("retweets").cast("INT"),
    col("favorites").cast("INT"),
    "geo",
    "mentions",
    "hashtags",
    "id",
    "permalink",
]
btc = btc_raw.select(*btc_projection)

In [10]:
# Print the column names and types of the raw frame, before any casting.
btc_raw.printSchema()

root
 |-- username: string (nullable = true)
 |-- date: string (nullable = true)
 |-- retweets: string (nullable = true)
 |-- favorites: string (nullable = true)
 |-- text: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)



In [11]:
# Print the schema after the select, to check the timestamp and integer casts.
btc.printSchema()

root
 |-- username: string (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- retweets: integer (nullable = true)
 |-- favorites: integer (nullable = true)
 |-- geo: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)



In [12]:
# Bring only five rows to the driver and render them as a pandas DataFrame
# for a quick visual check.
btc.limit(5).toPandas()

Unnamed: 0,username,datetime,text,retweets,favorites,geo,mentions,hashtags,id,permalink
0,charminedal,2018-05-26 23:52:00,Analyst Says Bitcoin May Drop to $5500 Before ...,0,0,,,,1000525067426586624,https://twitter.com/charminedal/status/1000525...
1,GeraldineDuplat,2018-05-26 23:46:00,VIDEO: ¿Cómo funciona BITCOIN ? #Criptomonedas...,0,1,,,#Criptomonedas,1000523706312032258,https://twitter.com/GeraldineDuplat/status/100...
2,EffektivesB,2018-05-26 23:44:00,Woher hat der Bitcoin König seine Bitcoins ? W...,0,0,,,,1000523096091217921,https://twitter.com/EffektivesB/status/1000523...
3,Crypto_Popo,2018-05-26 23:41:00,The New York Stock Exchange Has its Eyes on Tr...,0,0,,,#bitcoin #blockchain #ico #airdrop #crypto #et...,1000522393092321280,https://twitter.com/Crypto_Popo/status/1000522...
4,MiddleOfMayhem,2018-05-26 23:27:00,How would having Bitcoin immediately benefit me?,0,0,,,,1000518930862133248,https://twitter.com/MiddleOfMayhem/status/1000...


In [13]:
# Row count after de-duplication. `count()` is an action, so this runs the
# full read/union/distinct pipeline; `btc` is not cached in this notebook, so
# later actions on it recompute the same work.
btc.count()

26208300

In [14]:
# Persist the typed Bitcoin tweets as Parquet, replacing whatever is
# already stored at the target path.
btc.write.parquet(parquet_btc_path, mode="overwrite")