In [47]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the active SparkContext if one exists, otherwise start a new one.
sc = SparkContext.getOrCreate()
# SQL entry point; the notebook uses it to read CSV files and build DataFrames.
sqlContext = SQLContext(sparkContext=sc)

+---+----+------+-------------------+-----------+-----------+-----------+-----------+----------------+------------------+
|SNo|Name|Symbol|               Date|       High|        Low|       Open|      Close|          Volume|         Marketcap|
+---+----+------+-------------------+-----------+-----------+-----------+-----------+----------------+------------------+
|  1|Aave|  AAVE|2020-10-05 23:59:59|55.11235847|49.78789992|52.67503496|53.21924296|             0.0| 89128128.86084658|
|  2|Aave|  AAVE|2020-10-06 23:59:59|53.40227002|40.73457791|53.29196931|42.40159861|  583091.4597628| 71011441.25451232|
|  3|Aave|  AAVE|2020-10-07 23:59:59|42.40831364|35.97068975|42.39994711|40.08397561| 682834.18632335| 67130036.89981823|
|  4|Aave|  AAVE|2020-10-08 23:59:59|44.90251114|36.69605677|39.88526234|43.76446306|1658816.92260445|220265142.10956782|
|  5|Aave|  AAVE|2020-10-09 23:59:59|47.56953274| 43.2917758|43.76446306|46.81774415|  815537.6607835|235632208.16269898|
+---+----+------+-------

In [9]:
# Combine every per-coin CSV in crypto_data/ into a single CSV file
import os
import glob
import pandas as pd

# Other cells use paths relative to crypto_data/ (e.g. pd.read_csv('crypto_data.csv')),
# so the working directory is switched here. The guard lets this cell be re-run
# after the switch has already happened instead of failing on a missing
# nested "crypto_data" directory.
if os.path.basename(os.getcwd()) != "crypto_data":
    os.chdir("crypto_data")

extension = 'csv'
output_name = "crypto_data.csv"
# Files this notebook itself writes into the directory. They must not be fed
# back into the concatenation on a re-run: that would duplicate every row, and
# crypto10_final.csv comes from a Spark write with no header option.
generated_files = {output_name, "crypto10_final.csv"}

# Sorted so the row order of the combined file does not depend on the
# filesystem's directory listing order.
all_filenames = sorted(
    f for f in glob.glob('*.{}'.format(extension)) if f not in generated_files
)

# combine all files in the list; reset_index() turns each file's own row number
# into a leading "index" column, which shifts every other column one position right
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames]).reset_index()

# export to csv
combined_csv.to_csv(output_name, index=False, encoding='utf-8-sig')

In [103]:
# Load the combined CSV into a Spark DataFrame, taking column names from the header row.
# No schema is supplied; the cells below parse the date strings and cast
# Marketcap with float() themselves.
df = sqlContext.read.csv(path='crypto_data/crypto_data.csv', header=True)

# Selecting the data within a time range
from datetime import datetime
def get_date_time_obj(date_time_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and return the matching datetime.

    A string that does not follow that layout makes strptime raise ValueError.
    """
    return datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
# Inclusive bounds of the analysis window.
min_date = get_date_time_obj('2018-07-01 00:00:00')
max_date = get_date_time_obj('2021-07-01 23:59:59')

# Positions of the fields used below inside each Row of `df`.
# NOTE(review): these match the column order of the combined CSV
# (index, SNo, Name, Symbol, Date, ..., Marketcap) — confirm if that layout changes.
NAME_COL = 2
DATE_COL = 4
MARKETCAP_COL = 10

# Keep only the rows whose timestamp lies inside the window.
# The chained comparison parses each date string once per row instead of twice.
filtered_rdd = df.rdd.filter(
    lambda row: min_date <= get_date_time_obj(row[DATE_COL]) <= max_date
)

# Map to name-marketcap key-value pairs
market_cap_rdd = filtered_rdd.map(
    lambda row: (row[NAME_COL], float(row[MARKETCAP_COL]))
)

# Average market cap per crypto: accumulate (sum, count) per name, divide,
# then take the 10 largest averages. takeOrdered with a negated key returns
# them largest-first without sorting the whole RDD.
top10_by_avg_market_cap = market_cap_rdd.mapValues(lambda cap: (cap, 1)) \
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
    .mapValues(lambda total: total[0] / total[1]) \
    .takeOrdered(10, key=lambda pair: -pair[1])

top10_list = [pair[0] for pair in top10_by_avg_market_cap]

# Filtering out only the top 10 cryptos
selected_crypto_rdd = filtered_rdd.filter(lambda row: row[NAME_COL] in top10_list)

In [104]:
# Further EDA
# Check that all 1097 dates from 1st Jul 2018 to 1st Jul 2021 are present
# by tallying the number of rows for each date string.
rows_keyed_by_date = selected_crypto_rdd.map(lambda row: (row[4], 1))
per_date_totals = rows_keyed_by_date.reduceByKey(lambda left, right: left + right)
value_count_for_each_date = per_date_totals.collect()
print(len(value_count_for_each_date))
# All 1097 dates are present.
# (could consider keeping only the dates that are also present in the stock market data)
# a peek at the counts
value_count_for_each_date[:5]

1097


[('2020-08-21 23:59:59', 9),
 ('2020-08-25 23:59:59', 9),
 ('2020-08-26 23:59:59', 9),
 ('2020-08-31 23:59:59', 9),
 ('2020-09-01 23:59:59', 9)]

In [105]:
# The per-date counts show that some cryptos lack a row for some dates,
# so tally the number of rows per crypto, largest count first.
rows_keyed_by_name = selected_crypto_rdd.map(lambda row: (row[2], 1))
per_name_totals = rows_keyed_by_name.reduceByKey(lambda left, right: left + right)
value_count = per_name_totals.sortBy(lambda pair: (-pair[1])).collect()
for name_and_count in value_count:
    print(name_and_count)

('Binance Coin', 1097)
('Bitcoin', 1097)
('Ethereum', 1097)
('Dogecoin', 1097)
('Cardano', 1097)
('Tether', 1097)
('Litecoin', 1097)
('XRP', 1097)
('Polkadot', 315)
('Uniswap', 287)


In [None]:
# Polkadot and Uniswap have far fewer entries than the other eight currencies, but they cannot be dropped because they are important

In [106]:
# Check for missing values: compare the row count before and after dropping
# every row that contains a null.
print(selected_crypto_rdd.count())
# `df` is rebound here from the raw CSV frame to the selected rows only.
df = sqlContext.createDataFrame(selected_crypto_rdd)
print(df.na.drop().count())
# The counts before and after removing missing values are the same,
# hence there are no missing values.

9378
9378


In [107]:
# Cross-check with pandas: number of missing values in each column of the combined CSV
import pandas as pd
df_pd = pd.read_csv('crypto_data.csv')
df_pd.isnull().sum()

index        0
SNo          0
Name         0
Symbol       0
Date         0
High         0
Low          0
Open         0
Close        0
Volume       0
Marketcap    0
dtype: int64

In [108]:
# Print the current working directory (the notebook calls os.chdir("crypto_data") in another cell)
!pwd

/Users/harry/Desktop/Project/crypto_data


In [109]:
# Save `df` as CSV part files inside the crypto10_final directory.
# mode='overwrite' replaces the output of a previous run; without it the write
# fails when the directory already exists, so this cell could not be re-executed.
df.write.csv('crypto10_final', mode='overwrite')

In [110]:
# Concatenate the part files under ../crypto10_final into one CSV in the current directory.
# NOTE(review): presumably these are the files produced by df.write.csv('crypto10_final');
# that call passes no header option, so the merged file likely has no header row — confirm.
! cat ../crypto10_final/part* > crypto10_final.csv

In [96]:
# Print the current working directory (the notebook calls os.chdir("crypto_data") in another cell)
!pwd

/Users/harry/Desktop/Project/crypto_data
