In [None]:
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import sum
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

In [None]:
#Copyright 2022 Google LLC.
#SPDX-License-Identifier: Apache-2.0
# Start (or reuse) the Spark session named 'twitter_data' used by the cells below.
spark = (
    SparkSession.builder
    .appName('twitter_data')
    .getOrCreate()
)

In [None]:
# Cloud Storage bucket that holds the input datasets. The read paths below are
# built as 'gs://' + bucket_name + ..., so fill this in before running them.
bucket_name = ''

In [None]:
# Load every tweets CSV matching the wildcard into one Spark DataFrame,
# using the first row as the header and letting Spark infer column types.
tweets = spark.read.csv(
    'gs://' + bucket_name + '/social_media_data_analytics/01-datasets/tweets_with_latitude_longitude_*.csv',
    header=True,
    inferSchema=True,
)

In [None]:
def trans(x):
    """Normalise a tweet's text before it is fed to the word cloud.

    Drops every whitespace-separated token that contains ``http``, ``@`` or
    ``<``, lower-cases what remains, strips a fixed set of punctuation
    characters and collapses the runs of spaces this leaves behind.

    Args:
        x: Raw tweet text, or ``None`` for a missing value.

    Returns:
        The cleaned text; an empty string when ``x`` is ``None``.
    """
    # A null cell reaches a Python UDF as None; without this guard
    # ``x.split()`` raises AttributeError.
    if x is None:
        return ''
    kept = [
        word for word in x.split()
        if 'http' not in word and '@' not in word and '<' not in word
    ]
    stripped = re.sub('[!@#$:).;,?&]', '', " ".join(kept).lower())
    # Deleting stand-alone punctuation tokens can leave three or more spaces
    # in a row; replacing only exact pairs would still leave doubles behind.
    return re.sub(' {2,}', ' ', stripped)

In [None]:
# Expose trans() as a string-returning Spark UDF and store the cleaned
# version of the "text" column in "text new".
transformUDF = udf(trans, StringType())
tweets = tweets.withColumn("text new", transformUDF(col("text")))

In [None]:
def location(x):
    """Return a location value as text, substituting a single space when missing.

    Args:
        x: Location value, or ``None`` when absent.

    Returns:
        ``' '`` when ``x`` is ``None``, otherwise ``str(x)``.
    """
    # Identity comparison is the idiomatic (and __eq__-proof) test for None.
    return ' ' if x is None else str(x)

In [None]:
# Expose location() as a string-returning Spark UDF and store the
# null-free version of the "location" column in "location new".
locationUDF = udf(location, StringType())
tweets = tweets.withColumn("location new", locationUDF(col("location")))

In [None]:
def source_new(x):
    """Return the first HTML anchor element found in a tweet's source value.

    Args:
        x: Raw source value. It is converted with ``str()`` first, so ``None``
            is treated as the text ``'None'``.

    Returns:
        The complete matched ``<a ...>...</a>`` substring (tags included) when
        one is present, otherwise the stringified input unchanged.
    """
    text = str(x)
    anchor = re.search('(?i)<a([^>]+)>(.+?)</a>', text)
    # Test the match explicitly instead of relying on the AttributeError that
    # calling ``.group`` on ``None`` would raise.
    if anchor is None:
        return text
    return anchor.group(0)

In [None]:
# Expose source_new() as a string-returning Spark UDF and store its result
# for the "source" column in "source new".
sourcenewUDF = udf(source_new, StringType())
tweets = tweets.withColumn("source new", sourcenewUDF(col("source")))

In [None]:
# Total follower count per "source new" value, keeping only the sources whose
# total is above 100000.
tweets = tweets.withColumn('followers_count_new', col('followers_count').cast(IntegerType()))
tweets_by_type = (
    tweets
    .groupBy('source new')
    .sum('followers_count_new')
    # Rename the aggregate instead of casting it to IntegerType: Spark sums
    # integer columns into a 64-bit value, and a 32-bit cast cannot represent
    # totals above 2,147,483,647.
    .withColumnRenamed('sum(followers_count_new)', 'count')
    .filter(col('count').isNotNull())
    .filter(col('count') > 100000)
)

In [None]:
def source_new_2(x):
    """Bucket a source name into one of four known clients or ``'Others'``.

    The known names carry a trailing space, so the comparison is exact and
    whitespace-sensitive.

    Args:
        x: Source name to classify.

    Returns:
        ``x`` itself when it is one of the four known names, else ``'Others'``.
    """
    known_sources = ('Twitter for Android ', 'Instagram ', 'Twitter Web Client ', 'Twitter for iPhone ')
    return x if x in known_sources else 'Others'

In [None]:
# Expose source_new_2() as a string-returning Spark UDF and store the grouped
# source label in "source_new2".
# NOTE(review): this reads the raw "source" column, not "source new" - confirm
# that is intended.
sourcenew2UDF = udf(source_new_2, StringType())
tweets = tweets.withColumn("source_new2", sourcenew2UDF(col("source")))

In [None]:
# Collect the Spark DataFrame to the driver; from here on `tweets` is a
# pandas DataFrame, so this cell cannot be re-run without re-running the
# Spark cells above first.
tweets = tweets.toPandas()

In [None]:
def wordcloud(tweets, col, title="Good Morning Datascience+"):
    """Draw a word cloud from the values of one column.

    Args:
        tweets: Table indexable by column name (``tweets[col]`` must be an
            iterable of values).
        col: Name of the column whose values are joined into the cloud text.
        title: Title shown above the figure; the default preserves the title
            used before this parameter existed.
    """
    # Skip missing values and coerce the rest to text so a stray None or
    # number in the column does not make the join fail.
    text = " ".join(str(value) for value in tweets[col] if value is not None)
    # Named `cloud` so the local does not shadow this function's own name.
    cloud = WordCloud(
        background_color="white",
        stopwords=set(STOPWORDS),
        random_state=2016,  # fixed seed keeps the layout reproducible
    ).generate(text)
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(title)

In [None]:
# Word cloud built from the cleaned tweet text.
wordcloud(tweets, col='text new')

In [None]:
# Word cloud built from the null-free location column.
wordcloud(tweets, col='location new')

In [None]:
# Collect the per-source aggregate to the driver as a pandas DataFrame for plotting.
tweets_by_type = tweets_by_type.toPandas()

In [None]:
# Bar chart of the summed follower count for each "source new" value.
import seaborn as sns
plt.rcParams["figure.figsize"] = (90,30)
# Pass x by keyword: newer seaborn releases accept only `data` positionally,
# so a positional Series would be interpreted as the dataset, not the x values.
sns.barplot(x=tweets_by_type['source new'], y=tweets_by_type['count'])

In [None]:
# Total followers per grouped source label.
# Sum the integer-cast "followers_count_new" column created earlier so the
# total is numeric even if the raw "followers_count" column was not inferred
# as a number. The result is renamed to an empty string by chaining rather
# than mutating in place.
tweets_by_type2 = tweets.groupby(['source_new2'])['followers_count_new'].sum().rename("")
# NOTE(review): `explode` is not used by any visible cell; presumably meant
# for a pie chart - confirm or remove.
explode = (1,0,0,0,0)

In [None]:
# Number of non-null "followers_count" values per grouped source label.
tweets.groupby(by=['source_new2'])['followers_count'].count()

In [None]:
# Load every location CSV matching the wildcard, then rename its "count"
# column to "count_num".
tweets_location = spark.read.csv(
    'gs://' + bucket_name + '/social_media_data_analytics/01-datasets/location_*.csv',
    header=True,
    inferSchema=True,
).withColumnRenamed('count', 'count_num')

In [None]:
# NOTE(review): this wildcard import can rebind built-in names (pyspark.sql.functions
# defines its own `sum`, `min`, `max`, ...); only `col` is needed in this cell.
from pyspark.sql.functions import *
# Sum "count_num" per (latitude, longitude, location), expose the total as
# "count", and keep only rows where all three grouping fields are present.
tweets_agg = (
    tweets_location
    .groupBy(col('latitude'), col('longitude'), col('location'))
    .sum('count_num')
    .withColumnRenamed('sum(count_num)', 'count')
    .filter(
        col('location').isNotNull()
        & col('latitude').isNotNull()
        & col('longitude').isNotNull()
    )
)

In [None]:
# Collect the aggregated locations to the driver as a pandas DataFrame for mapping.
q = tweets_agg.toPandas()

In [None]:
# World map with one marker per aggregated tweet location.
import folium
# Use the built-in OpenStreetMap tiles: "Stamen Terrain" is no longer shipped
# as a built-in tile set in recent folium releases.
m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
# Walk the three columns in lock-step instead of doing a positional row
# lookup three times per marker.
for latitude, longitude, place in zip(q['latitude'], q['longitude'], q['location']):
    # str() guards against a location value that was not read as text.
    popup = folium.Popup(str(place), parse_html=True)
    folium.Marker([latitude, longitude], popup=popup).add_to(m)

In [None]:
# Leave the map as the cell's last expression so the notebook renders it.
m