In [0]:
# Third-party packages that are not preinstalled on the cluster.
!pip install emojis
!pip install nltk
import emojis
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Corpora needed by the stop-word list and by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.types import ArrayType, StringType
# NOTE: this star import rebinds builtins such as `sum`, `min`, `max` and `round`
# to their Spark column-function counterparts for the rest of the notebook.
from pyspark.sql.functions import  *
from pyspark.sql import DataFrame
from collections import Counter
from typing import Iterable
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Read the raw transactions CSV (first row holds the column names) and keep
# the result cached, since the cells below scan it more than once.
data = spark.read.option("header", 'TRUE').csv('/FileStore/tables/All_files-1.csv')
data.cache()
display(data)

user1,user2,transaction_type,datetime,description,story_id
2.48E+18,2.08e+18,payment,2018-08-07T02:11:16,fuk ya,2.54e+18
2.36E+18,2.54e+18,payment,2018-08-07T02:11:16,:venmo_dollar:,2.54e+18
2.28E+18,1.92e+18,payment,2018-08-07T02:11:15,🎉,2.54e+18
2.09E+18,2.02e+18,payment,2018-08-07T02:11:15,Boyz,2.54e+18
2.03E+18,2.08e+18,payment,2018-08-07T02:11:15,🥩,2.54e+18
2.30E+18,2.23e+18,payment,2018-08-07T02:11:16,Internet,2.54e+18
1.84E+18,1.86e+18,payment,2018-08-07T02:11:15,🤗,2.54e+18
2.13E+18,1.41e+18,payment,2018-08-07T02:11:15,🚕🚕🚕,2.54e+18
2.35E+18,2.42e+18,payment,2018-08-07T02:11:16,🤬,2.54e+18
2.31E+18,2.47e+18,payment,2018-08-07T02:11:14,seltzer,2.54e+18


In [0]:
# Spread the rows over more partitions in the hope of better parallelism,
# then show the resulting partition count as the cell output.
TARGET_PARTITIONS = 200
data = data.repartition(TARGET_PARTITIONS)
data.rdd.getNumPartitions()

In [0]:
# Word classification dictionary: a comma-separated file with a header row.
text_dic = spark.read.csv(
    "/FileStore/tables/word_classification_dict-5.csv",
    header="true",
    sep=",",
)
display(text_dic)

People,Food,Event,Activity,Travel,Transportation,Utility,Cash,Illegal/Sarcasm
friend,food,birthday,ball,beach,lyft,bill,atm,addiction
friendship,bbq,christmas,boat,place,uber,cable,bank,drug
baby,bean,happy,bar,la,cab,fee,cash,wangs
boy,latte,bday,book,world,bus,electric,money,weed
girl,breakfast,wedding,club,hotel,car,electricity,buck,anal
help,brunch,xmas,card,trip,gas,internet,wallet,bj
like,burger,holiday,dance,vega,taxi,rent,monies,blowjob
love,burrito,hbd,football,tahoe,ride,wifi,tip,boob
mom,cake,halloween,fun,nyc,rental,utility,dollar,booty
save,cheese,thanksgiving,game,dc,train,tax,payback,blow


In [0]:
# Emoji classification dictionary: a comma-separated file with a header row.
emoji_dic = spark.read.csv(
    "/FileStore/tables/emoji_classification_dictionary-6.csv",
    header="true",
    sep=",",
)
display(emoji_dic)

Event,Travel,Food,Activity,Transportation,People,Utility
🇦🇺,🏔,🍇,👾,🚄,😀,⚡
🇫🇷,⛰,🍈,🕴,🚅,😃,💡
🎂,🌋,🍉,🎪,🚆,😄,🔌
🛍,🗻,🍊,🎭,🚇,😁,📺
🇨🇦,🏕,🍋,🎨,🚈,😆,🔌
🇧🇷,🏖,🍌,🎰,🚉,😅,⚡
🐉,🏜,🍍,🚣,🚊,🤣,💡
🎅,🏝,🍎,🛀,🚝,😂,💸
🇲🇽,🏞,🍏,🎗,🚞,🙂,💦
🇨🇳,🏟,🍐,🎟,🚋,🙃-,💧


In [0]:
def _column_values(frame, column_name):
    """Collect every value of one DataFrame column to the driver as a Python list."""
    return [row[0] for row in frame.select(column_name).collect()]


# Word lists, one per column of the word dictionary.
people = _column_values(text_dic, 'People')
food = _column_values(text_dic, 'Food')
event = _column_values(text_dic, 'Event')
activity = _column_values(text_dic, 'Activity')
travel = _column_values(text_dic, 'Travel')
transportation = _column_values(text_dic, 'Transportation')
utility = _column_values(text_dic, 'Utility')
cash = _column_values(text_dic, 'Cash')
illegal = _column_values(text_dic, 'Illegal/Sarcasm')

# Emoji lists, one per column of the emoji dictionary.
people_emoji = _column_values(emoji_dic, 'People')
food_emoji = _column_values(emoji_dic, 'Food')
event_emoji = _column_values(emoji_dic, 'Event')
activity_emoji = _column_values(emoji_dic, 'Activity')
travel_emoji = _column_values(emoji_dic, 'Travel')
transportation_emoji = _column_values(emoji_dic, 'Transportation')
utility_emoji = _column_values(emoji_dic, 'Utility')

In [0]:
import re

In [0]:
# use the emojis package instead of the provided dictionary, not sure if it's allowed, need to check with professor
@udf
def convert_emojis(text):
    """Turn emojis into plain alias words and keep only letters.

    Steps, in order:
      1. `emojis.decode` rewrites each emoji as its `:alias:` form.
      2. The colons become spaces and underscores are dropped, so a
         multi-word alias collapses into a single token.
      3. Every remaining character that is not an ASCII letter becomes a space.
      4. Runs of whitespace are squeezed to single spaces.

    Args:
        text: the raw description string, or None for a null cell.

    Returns:
        The cleaned string, or None when `text` is None.
    """
    # Spark hands null cells to Python UDFs as None; pass them through
    # instead of failing the whole job inside emojis.decode.
    if text is None:
        return None
    decoded = emojis.decode(text).replace(":", " ").replace("_", "")
    # The regex must run on the decoded string (not on a list, and not on the
    # raw text), otherwise the emoji aliases produced above would be lost.
    letters_only = re.sub("[^a-zA-Z]", " ", decoded)
    return " ".join(letters_only.split())

In [0]:
@udf
def rm_punctuation(text):
  """Return `text` with every character of `string.punctuation` removed.

  Args:
    text: the input string, or None for a null cell.

  Returns:
    The string without punctuation, or None when `text` is None.
  """
  # Null cells arrive as None; return null rather than raising AttributeError.
  if text is None:
    return None
  return text.translate(str.maketrans("", "", string.punctuation))

In [0]:
def lemmatize(text):
    """Lemmatize every element of `text` with NLTK's WordNetLemmatizer.

    `text` is iterated element by element; the result is a new list holding
    the lemmatized form of each element, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in text]


# Spark UDF wrapper, declared to return an array of strings.
lemmatize_udf = udf(lemmatize, ArrayType(StringType()))

In [0]:
# Load the transactions file that already carries the preprocessing and
# classification columns; from here on `data` refers to this frame.
data = spark.read.csv("/FileStore/tables/withColumn___Sheet2.csv", header = 'TRUE')
data.cache()   # cache to speed up following operations
display(data)

user1,user2,transaction_type,datetime,description,is_business,story_id,text_preprocessed,classification,emoji_only
1293545,3555063,payment,2015-06-28T23:51:11.000+0000,🍣🐠🍻,False,559025ff1a3b580f92a5025e,"List(sushi, tropicalfish, beer)",Food,1
6785327,5649602,payment,2016-06-14T08:11:20.000+0000,☎ 😘,False,575f59b8cd03c9af22380385,"List(☎, kissingheart)",Not Classified,0
2514131,1531398,payment,2016-05-08T07:17:53.000+0000,NyQuil x2,False,572e85b1cd03c9af22eb2b82,"List(nyquil, x2)",Not Classified,0
1003716,703145,payment,2015-02-24T02:59:58.000+0000,Food!,False,54eb78aecd03c9af22d7f0c0,List(food),Food,0
483288,314523,charge,2015-04-05T23:28:48.000+0000,BD,False,552162c0ca81793bbe80320b,List(bd),Not Classified,0
3292421,3219776,payment,2015-02-05T06:08:34.000+0000,👽,False,54d2986291bd05aa933ea834,List(alien),Not Classified,1
2014167,4633184,charge,2015-09-28T21:44:19.000+0000,PGW,False,56095243cd03c9af220240d3,List(pgw),Not Classified,0
1151815,1474712,charge,2015-03-31T08:31:02.000+0000,for rent,False,5519f8d75d6cc87743b26b1c,List(rent),Utility,0
481776,324017,charge,2016-02-08T06:29:19.000+0000,Supabowl groceries,False,56b7c53fcd03c9af220e7730,"List(supabowl, grocery)",Food,0
288481,571272,charge,2015-10-06T05:02:58.000+0000,🍴🚿,False,5612f392cd03c9af22819aaf,"List(forkandknife, shower)",Not Classified,1


In [0]:
# 21% of transactions are emoji only
# The CSV was read without schema inference, so `emoji_only` is a string
# column; cast it explicitly before aggregating. `sum` here is the Spark
# column function brought in by the star import, not the Python builtin.
total_transactions = data.count()
# `.first()[0]` pulls the scalar out of the one-row aggregate. `.show()` only
# prints and returns None, which would leave the variable empty.
percent_emoji_only = (
    data
    .select((sum(col("emoji_only").cast("double")) / total_transactions).alias("percent_emoji_only"))
    .first()[0]
)
percent_emoji_only

In [0]:
def get_emoji(text):
  """Return the emojis found in `text` as a list, or None when there are none.

  Args:
    text: the description string, or None for a null cell.

  Returns:
    A list built from the result of `emojis.get(text)` when that result is
    non-empty; otherwise None (which Spark stores as a null array).
  """
  # Null cells arrive as None; keep the column null instead of raising.
  if text is None:
    return None
  found = emojis.get(text)  # evaluated once and reused below
  return list(found) if found else None
get_emoji_udf = udf(get_emoji, ArrayType(StringType()))

# Attach the emojis extracted from each description as the "emojis" column.
data = data.withColumn("emojis", get_emoji_udf(data["description"]))
display(data)



user1,user2,transaction_type,datetime,description,is_business,story_id,text_preprocessed,classification,emoji_only,emojis
1293545,3555063,payment,2015-06-28T23:51:11.000+0000,🍣🐠🍻,False,559025ff1a3b580f92a5025e,"List(sushi, tropicalfish, beer)",Food,1,"List(🐠, 🍻, 🍣)"
6785327,5649602,payment,2016-06-14T08:11:20.000+0000,☎ 😘,False,575f59b8cd03c9af22380385,"List(☎, kissingheart)",Not Classified,0,List(😘)
2514131,1531398,payment,2016-05-08T07:17:53.000+0000,NyQuil x2,False,572e85b1cd03c9af22eb2b82,"List(nyquil, x2)",Not Classified,0,
1003716,703145,payment,2015-02-24T02:59:58.000+0000,Food!,False,54eb78aecd03c9af22d7f0c0,List(food),Food,0,
483288,314523,charge,2015-04-05T23:28:48.000+0000,BD,False,552162c0ca81793bbe80320b,List(bd),Not Classified,0,
3292421,3219776,payment,2015-02-05T06:08:34.000+0000,👽,False,54d2986291bd05aa933ea834,List(alien),Not Classified,1,List(👽)
2014167,4633184,charge,2015-09-28T21:44:19.000+0000,PGW,False,56095243cd03c9af220240d3,List(pgw),Not Classified,0,
1151815,1474712,charge,2015-03-31T08:31:02.000+0000,for rent,False,5519f8d75d6cc87743b26b1c,List(rent),Utility,0,
481776,324017,charge,2016-02-08T06:29:19.000+0000,Supabowl groceries,False,56b7c53fcd03c9af220e7730,"List(supabowl, grocery)",Food,0,
288481,571272,charge,2015-10-06T05:02:58.000+0000,🍴🚿,False,5612f392cd03c9af22819aaf,"List(forkandknife, shower)",Not Classified,1,"List(🍴, 🚿)"


In [0]:
# Bring every non-null emoji list to the driver (one list per transaction) ...
emoji_rows = data.filter(col("emojis").isNotNull()).select("emojis").collect()
bag_of_emojis = [row[0] for row in emoji_rows]
# ... and flatten them into a single list of individual emojis.
bag_of_emojis_flatten = [emoji for item in bag_of_emojis for emoji in item]
bag_of_emojis_flatten

In [0]:
# the top 5 most popular emoji: '💸', '🍕', '🍻', '🎉', '🍷'
# (result noted from a previous run; the cell output is the list of
# (emoji, count) pairs for the five most frequent entries)
Counter(bag_of_emojis_flatten).most_common(5)

In [0]:
def get_emoji_category(emoji):
  """Map an emoji to the first category list that contains it.

  The lists are checked in a fixed order (People, Food, Event, Activity,
  Travel, Transportation, Utility); the first hit wins. An emoji that
  appears in none of them is labelled 'Not Classified'.
  """
  ordered_lookups = (
    ('People', people_emoji),
    ('Food', food_emoji),
    ('Event', event_emoji),
    ('Activity', activity_emoji),
    ('Travel', travel_emoji),
    ('Transportation', transportation_emoji),
    ('Utility', utility_emoji),
  )
  for label, members in ordered_lookups:
    if emoji in members:
      return label
  return 'Not Classified'

In [0]:
# use dictionary to speed up the running time
emoji_category_dic = dict()
categories_in_order = []

for emoji in bag_of_emojis_flatten:
  # Classify each distinct emoji only once; repeats reuse the cached label.
  if emoji not in emoji_category_dic:
    emoji_category_dic[emoji] = get_emoji_category(emoji)
  categories_in_order.append(emoji_category_dic[emoji])

# Drop the emojis that did not fall into any category.
emoji_category_arr = [label for label in categories_in_order if label != "Not Classified"]
emoji_category_arr

In [0]:
# the top three most popular emoji categories are Food, People, Activity
# (result noted from a previous run; unclassified emojis were already removed
# from emoji_category_arr, so they cannot appear in this ranking)
Counter(emoji_category_arr).most_common(3)

In [0]:
# Register `data` as the temp view "datatable" so it can be queried from SQL.
data.createOrReplaceTempView("datatable")

In [0]:
%sql
-- merge user1 and user2: emit one (user, classification) row for each side of
-- every transaction, so a transaction counts for both participants.
-- The table must be named union_table, because that is the name the
-- spending-profile query reads from.
CREATE TABLE union_table
USING HIVE
AS 
  SELECT user1 AS user, classification
  FROM datatable
  UNION ALL
  SELECT user2 AS user, classification
  FROM datatable

In [0]:
# assuming the denominator doesn't include transactions that can't be classified
#
# Builds one `spent_profile` string per user.
#   * Inner query: for every user, the fraction of their classified rows
#     (classification != "Not Classified") that fall in each category. The
#     count_* aliases therefore hold ratios, not raw counts.
#   * Outer query: every non-zero ratio is rendered as "<pct>% <Category>" and
#     the pieces are joined with ", ".
# NOTE(review): a user with no classified rows has a zero denominator; this
#   presumably yields NULL ratios and therefore the blank profiles seen in the
#   output -- confirm.
# NOTE(review): the query reads `union_table`; confirm this matches the name of
#   the table created by the %sql cell that merges user1 and user2.
# NOTE(review): the last category is matched against the label "Illegal",
#   while the word dictionary column is named "Illegal/Sarcasm" -- verify which
#   label the classification column actually contains.
spend_profile_df = sqlContext.sql(
  '''
  SELECT user, CONCAT_WS(", ", 
                    CASE WHEN count_activity != 0 THEN CONCAT(ROUND(count_activity * 100), "% Activity") ELSE NULL END, 
                    CASE WHEN count_people != 0 THEN CONCAT(ROUND(count_people * 100), "% People") ELSE NULL END, 
                    CASE WHEN count_event != 0 THEN CONCAT(ROUND(count_event * 100), "% Event") ELSE NULL END, 
                    CASE WHEN count_travel != 0 THEN CONCAT(ROUND(count_travel * 100), "% Travel") ELSE NULL END, 
                    CASE WHEN count_transportation != 0 THEN CONCAT(ROUND(count_transportation * 100), "% Transportation") ELSE NULL END, 
                    CASE WHEN count_food != 0 THEN CONCAT(ROUND(count_food * 100), "% Food") ELSE NULL END, 
                    CASE WHEN count_utility != 0 THEN CONCAT(ROUND(count_utility * 100), "% Utility") ELSE NULL END, 
                    CASE WHEN count_cash != 0 THEN CONCAT(ROUND(count_cash * 100), "% Cash") ELSE NULL END,
                    CASE WHEN count_illegal != 0 THEN CONCAT(ROUND(count_illegal * 100), "% Illegal") ELSE NULL END
              ) AS spent_profile
  FROM
  (
    SELECT user,
        SUM(CASE WHEN classification == "Activity" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_activity,
        SUM(CASE WHEN classification == "Food" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_food,
        SUM(CASE WHEN classification == "People" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_people,
        SUM(CASE WHEN classification == "Event" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_event,
        SUM(CASE WHEN classification == "Travel" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_travel,
        SUM(CASE WHEN classification == "Transportation" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_transportation,
        SUM(CASE WHEN classification == "Utility" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_utility,
        SUM(CASE WHEN classification == "Cash" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_cash,
        SUM(CASE WHEN classification == "Illegal" THEN 1 ELSE 0 END)/SUM(CASE WHEN classification != "Not Classified" THEN 1 ELSE 0 END) AS count_illegal
    FROM union_table
    GROUP BY user
  )
  '''
)
display(spend_profile_df)

user,spent_profile
1218774,100.0% Transportation
2954885,100.0% Food
974886,100.0% Food
2854200,
2517841,100.0% Food
2073451,100.0% Utility
3015875,
8894083,
3413708,100.0% Utility
6594589,


In [0]:
#2.4.5 version 
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
import findspark
# Must run before the first pyspark import: it reads SPARK_HOME and puts the
# unpacked distribution's Python packages on sys.path. Importing findspark
# alone does nothing.
findspark.init()
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
from pyspark.sql.functions import *
from pyspark.sql.functions import col, column
import re
import pandas as pd
from pyspark.sql.functions import desc, asc


# spark = SparkSession.builder.master("local[*]").getOrCreate()
MAX_MEMORY = '12g'

# Create (or reuse) the 'Venmo' session, giving executor and driver the same
# memory setting.
spark = (
    SparkSession.builder
    .appName('Venmo')
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)


from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/Shared\ drives/
directory=os.getcwd()
directory=directory+"/BAX-423 Big Data Analytics/Venmo/"

