In [None]:
# import libraries
import random
import re
import string
from datetime import datetime

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import FreqDist, bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from textblob import TextBlob
from wordcloud import WordCloud

# NOTE(review): the cells below also reference gensim (corpora, gensim.models, CoherenceModel),
# pyLDAvis, tweepy and emoji, none of which are imported in the visible notebook - add those
# imports here once the intended packages/versions are confirmed.

In [None]:
# load dataset: read the tweet CSV (path is relative to the working directory) into `data`
# NOTE(review): later cells use data['Date'].dt, which needs a datetime column; read_csv is not
# asked to parse dates here - confirm where 'Date' gets converted.
data = pd.read_csv("TikTokUserTweets.csv")

In [None]:
def plot_wordcloud(string_df):
    """Draw a word cloud built from every row of a text column.

    Args:
        string_df: object exposing ``.tolist()`` (e.g. a pandas Series); each item is
            converted with ``str`` before being added to the text.

    Returns:
        None. The cloud is rendered with matplotlib and shown immediately. When no text
        is left after punctuation removal a notice is printed and nothing is drawn.
    """
    # Characters stripped before the words are counted: ASCII punctuation plus a few
    # accented/symbol characters (presumably mis-decoded bytes left in the tweets).
    punc = '''!()-[]{};:'"\\,=<>./?@#$%^&*_~''' + 'Ã' + '±' + 'ã' + '¼' + 'â' + '»' + '§'

    # Join rows with a space: without a separator the last word of one row is glued to
    # the first word of the next and shows up in the cloud as a bogus combined word.
    twstring = ' '.join(str(item) for item in string_df.tolist())

    # Delete all unwanted characters in a single pass instead of rescanning the whole
    # text once per character.
    twstring = twstring.translate(str.maketrans('', '', punc))

    # WordCloud.generate raises on text without any word; skip the plot instead.
    if not twstring.strip():
        print("plot_wordcloud: no text to plot")
        return

    # generated wordcloud
    tweets_wordcloud = WordCloud(width=400, height=400, background_color='white',
                                 min_font_size=10, collocations=False).generate(twstring)
    plt.imshow(tweets_wordcloud)
    plt.axis("off")
    plt.show()

### Analysis 1: Identify the trending or most commonly used terms and topics across all tweets about TikTok

In [None]:
# Plot wordcloud on all tweets, using the text in the 'Tweets_Cleaned' column
plot_wordcloud(data['Tweets_Cleaned'])

In [None]:
# Topic Modelling
# NOTE(review): corpora.Dictionary / doc2bow expect every document as a list of tokens;
# confirm that 'Tweets_Cleaned' holds token lists and not plain strings.
cleaned_tokens = data['Tweets_Cleaned'].tolist()

id2word = corpora.Dictionary(cleaned_tokens)
# Report the vocabulary size rather than dumping the whole token -> id mapping.
print("Vocabulary size before filtering:", len(id2word))

# Keep only tokens that occur in at least 30 documents.
# NOTE(review): gensim documents `no_above` as a FRACTION of the corpus (0-1); with 30 the
# upper bound can never trigger, so frequent tokens are not filtered. Left unchanged because
# the topic labels used later were written against this model - confirm the intent.
id2word.filter_extremes(no_below=30, no_above=30)
print("Vocabulary size after filtering:", len(id2word))

# Bag-of-words representation of every tweet
corpus = [id2word.doc2bow(text) for text in cleaned_tokens]

# random_state is fixed so the fitted topics are reproducible between runs
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, passes=50,
                                           iterations=50, num_topics=7, random_state=1)

# print_topics returns the topics; as a bare mid-cell expression that return value was
# thrown away, so print each (topic id, top-7 terms) pair explicitly.
for topic_id, topic_terms in ldamodel.print_topics(num_words=7):
    print(topic_id, topic_terms)

coherence_model_lda = CoherenceModel(model=ldamodel, texts=cleaned_tokens,
                                     dictionary=id2word, coherence='u_mass')

# Compute the coherence once and reuse it (it used to be computed and printed twice)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Prepare the pyLDAvis view of the fitted model and write it to 'lda.html'
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, corpus, id2word)
pyLDAvis.save_html(lda_display, 'lda.html')

# For each document, rank its (topic id, weight) pairs by weight and keep the id of the
# top-ranked topic
tp_list = [
    sorted(ldamodel[bow], key=lambda pair: pair[1], reverse=True)[0][0]
    for bow in corpus
]

In [None]:
# Topic id (position in the list) -> descriptive label
replacement_dict = dict(enumerate([
    'Studying content on TikTok and how to get popular',
    'User’s intention and possibly other people offering or requesting help or work on the platform',
    'User’s engagement and opinions',
    'Viral TikTok content, mainly about a girl doing challenge that involves music',
    'Strategizing TikTok posts with inclusion of feedback',
    'Real world and personal topics.',
    'Regarding the data access (most probably by China)',
]))

# Swap each topic id for its label (values without a label pass through unchanged) and
# store the result as the 'Topic' column
tp_list = list(map(lambda topic_id: replacement_dict.get(topic_id, topic_id), tp_list))
data['Topic'] = tp_list

In [None]:
# One word cloud per topic.
# data['Topic'] stores the label strings (topic ids are replaced by the replacement_dict
# values before the column is assigned), so comparing against the integer keys never
# matched and every subset came out empty. Match on the label, and also on the raw id in
# case an id was left unreplaced.
for topic_id, topic_label in replacement_dict.items():
    tweets_token = data.loc[data['Topic'].isin([topic_id, topic_label])]

    print(topic_label)
    if tweets_token.empty:
        # an empty subset has no words to draw
        print("  (no tweets assigned to this topic)")
        continue

    plot_wordcloud(tweets_token['Tweets_Cleaned'])

### Analysis 2a: To identify how the public feels/perceives the brand TikTok

In [None]:
# Label each raw tweet by the sign of its TextBlob polarity score:
# > 0 -> "Positive", < 0 -> "Negative", otherwise "Neutral"
text_blob = []
for tweet in data['Tweets'].tolist():
    polarity = TextBlob(tweet).sentiment.polarity  # evaluate once per tweet
    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    text_blob.append(sentiment)

data["Sentiment"] = text_blob

# Per-sentiment summaries. They used to be bare mid-cell expressions, so nothing was ever
# displayed; print them instead. numeric_only=True keeps mean() from raising on the text
# columns with pandas >= 2.0.
by_sentiment = data.groupby(by='Sentiment')
print(by_sentiment.mean(numeric_only=True))
print(by_sentiment['Sentiment'].count())

# Tweet text, place and sentiment for the non-neutral tweets only. .copy() makes tps an
# independent frame so later writes to it do not raise SettingWithCopyWarning.
tps = data[['Tweets', 'Place', 'Sentiment']]
tps = tps[tps.Sentiment != 'Neutral'].copy()

In [None]:
# Word cloud built only from the rows whose 'Sentiment' is 'Positive'
positive_tweets = data[data['Sentiment'].eq('Positive')]
plot_wordcloud(positive_tweets.loc[:, 'Tweets_Cleaned'])

In [None]:
# Word cloud built only from the rows whose 'Sentiment' is 'Negative'
negative_tweets = data[data['Sentiment'].eq('Negative')]
plot_wordcloud(negative_tweets.loc[:, 'Tweets_Cleaned'])

### Analysis 2b: To identify how major countries feel/perceive this news

In [None]:
# Set the figure size BEFORE plotting: rcParams only affect figures created afterwards, so
# setting it after the plot call left the first chart at the default size.
plt.rcParams["figure.figsize"] = [10, 6]

# Number of tweets per place
groupedforcomparison = tps.groupby(by=['Place'])['Sentiment'].size()

# Places with 15 tweets or fewer are folded into a single "Others" bucket
othercountrylist = groupedforcomparison[groupedforcomparison <= 15].index.tolist()
print(len(othercountrylist), "places merged into 'Others'")

# Write into an explicit copy so the assignment below cannot hit a filtered view of
# another frame (SettingWithCopyWarning).
tps = tps.copy()
tps.loc[tps['Place'].isin(othercountrylist), 'Place'] = 'Others'
print(tps.groupby(by=['Place'])['Sentiment'].size())

# Tweet counts per place (rows) and sentiment (columns)
pbs = tps.pivot_table(index='Place', columns='Sentiment', values='Sentiment', aggfunc='size')
ax = pbs.plot(kind="bar", color=["#ca472f", "#0b84a5"], grid=True, rot=0,
              title="Country's Sentiment \non TikTok")
plt.xticks(rotation=30, horizontalalignment="center")
ax.set_xlabel("Country")
ax.set_ylabel("count")
plt.show()

# Same table expressed as the percentage share of each sentiment within a place
pbs = pbs.copy()
pbs['Negative'] = pbs['Negative'].fillna(0)
pbs['Positive'] = pbs['Positive'].fillna(0)
pbs['Negative'] = pbs['Negative'] / (pbs['Negative'] + pbs['Positive']) * 100
pbs['Positive'] = 100 - pbs['Negative']
ax = pbs.plot(kind="bar", color=["#ca472f", "#0b84a5"], grid=True, rot=0,
              title="Country's Sentiment by Percentage \non TikTok")
plt.xticks(rotation=30, horizontalalignment="center")
ax.set_xlabel("Country")
# the bars are percentages here, not counts
ax.set_ylabel("Percentage (%)")
plt.show()

### Analysis 3: To analyse how the news has impacted @TikTok_US’s engagement rates such as Likes, Retweets, etc.

In [None]:
def get_user_tweets(api, username):
    """Collect every status a tweepy Cursor yields for one user's timeline.

    Args:
        api: client object whose ``user_timeline`` method is handed to ``tweepy.Cursor``.
        username: passed to the timeline call as ``screen_name``.

    Returns:
        list of the status objects, in the order the cursor produced them.
    """
    # The Cursor takes care of paging through the timeline; .items() walks it one status
    # at a time (.pages() would be the page-by-page alternative).
    timeline = tweepy.Cursor(api.user_timeline, screen_name=username)
    return list(timeline.items())

# Fetch the 'tiktok_us' timeline and report how many statuses were returned.
# NOTE(review): `extractor` is not defined anywhere in the visible notebook - presumably an
# authenticated tweepy API client; confirm it is created before this cell runs.
tiktoktweets=get_user_tweets(extractor, 'tiktok_us')
print("Number of tweets extracted: ", len(tiktoktweets))

In [None]:
def _plot_engagement(frame, title, legend_loc):
    """Plot 'Likes_no' and 'Retweets_no' against 'Date' on one 20x12 figure and show it."""
    # The sized figure has to exist before anything is drawn; calling
    # plt.figure(figsize=...) after plt.plot only opens a second, empty figure.
    _, ax = plt.subplots(figsize=(20, 12))
    ax.plot(frame['Date'], frame['Likes_no'], label='Likes')
    ax.plot(frame['Date'], frame['Retweets_no'], label='Retweets')
    ax.legend(loc=legend_loc)
    ax.set_title(title)
    plt.show()


# Parse the dates once so the year filters also work when 'Date' was read from CSV as text
tweet_dates = pd.to_datetime(data['Date'])
tweet_year = tweet_dates.dt.year
engagement = data[['Date', 'Likes_no', 'Retweets_no']].assign(Date=tweet_dates)

# Whole period
timelineTikTokProf = engagement
_plot_engagement(timelineTikTokProf, "TikTok_US Engagement Over 3 Years", "upper right")

# One chart per year. The "2020" view keeps everything dated before 2021, matching the
# original filter.
yearly_views = [
    ("TikTok_US Engagement in 2020", tweet_year < 2021, "upper right"),
    ("TikTok_US Engagement in 2021", tweet_year == 2021, "upper left"),
    ("TikTok_US Engagement in 2022", tweet_year == 2022, "upper right"),
]
for view_title, in_view, legend_loc in yearly_views:
    timelineTikTokProf = engagement[in_view]
    _plot_engagement(timelineTikTokProf, view_title, legend_loc)

In [None]:
# Column layout of the result frames (most_likes_tweets / most_rt_tweets)
TOP_TWEET_COLUMNS = ['Tweets', 'Tweets_ID', 'Date', 'Source', 'Likes_no', 'Retweets_no']


def _plot_metric(frame, metric, title):
    """Plot one engagement column against 'Date' on a 20x12 figure and show it."""
    # Create the sized figure first; plt.figure(figsize=...) after plt.plot only opens a
    # second, empty figure.
    _, ax = plt.subplots(figsize=(20, 12))
    ax.plot(frame['Date'], frame[metric])
    ax.set_title(title)
    plt.show()


def _peel_top_tweets(frame, metric, noun, top_n):
    """Remove the `top_n` highest values of `metric` from `frame`, one value at a time.

    Args:
        frame: subset of `data` (same index labels) holding 'Date' and `metric`.
        metric: name of the engagement column ('Likes_no' or 'Retweets_no').
        noun: word used in the printed message ('like' / 'retweets').
        top_n: how many distinct peak values to remove.

    Returns:
        (picked, remaining): list of frames with the 'Tweets', 'Date' and `metric` columns
        of the removed rows, and `frame` without those rows.
    """
    picked = []
    for rank in range(top_n):
        if frame.empty:
            break  # nothing left to rank
        peak = frame[metric].max()
        print("The ", rank + 1, " tweet with the highest " + noun + " count in this year is:\n ", peak)
        at_peak = frame[metric] == peak
        # Look the rows up by index label so only tweets from this subset are picked;
        # matching on the value across all of `data` could pull in tweets from other years.
        picked.append(data.loc[frame.index[at_peak], ['Tweets', 'Date', metric]])
        frame = frame[~at_peak]
    return picked, frame


# Parse the dates once so the year filters also work when 'Date' was read from CSV as text
tweet_dates = pd.to_datetime(data['Date'])
tweet_year = tweet_dates.dt.year
dated = data.assign(Date=tweet_dates)

# (label, row filter, number of peaks to remove); "2020" keeps everything before 2021,
# matching the original filter.
year_views = [
    ("2020", tweet_year < 2021, 1),
    ("2021", tweet_year == 2021, 3),
    ("2022", tweet_year == 2022, 3),
]

top_tweets = {}
for metric, noun in (('Likes_no', 'like'), ('Retweets_no', 'retweets')):
    collected = []
    for year_label, in_year, top_n in year_views:
        timelineTikTokProf = dated.loc[in_year, ['Date', metric]]
        _plot_metric(timelineTikTokProf, metric, "{} in {}".format(metric, year_label))

        picked, timelineTikTokProf = _peel_top_tweets(timelineTikTokProf, metric, noun, top_n)
        collected.extend(picked)

        # Same series again with the peaks removed, so the rest of the year is readable
        _plot_metric(timelineTikTokProf, metric,
                     "{} in {} (top {} removed)".format(metric, year_label, top_n))

    # DataFrame.append returned a new frame (and no longer exists in pandas 2.0), so the
    # old loops never stored anything. Collect the pieces and concatenate once.
    combined = pd.concat(collected) if collected else pd.DataFrame()
    top_tweets[metric] = combined.reindex(columns=TOP_TWEET_COLUMNS)

most_likes_tweets = top_tweets['Likes_no']
most_rt_tweets = top_tweets['Retweets_no']

In [None]:
def keyword_tweetsandreplies(api,keyword,number_of_tweets):
    """Search for tweets matching a keyword, with retweets filtered out.

    Args:
        api: client object whose ``search_tweets`` method is handed to ``tweepy.Cursor``.
        keyword: search text; ' -filter:retweets' is appended to it to form the query.
        number_of_tweets: maximum number of results taken from the cursor.

    Returns:
        list of the status objects produced by the search. Despite the function name,
        only the search hits themselves are returned; no replies are fetched.
    """
    # A plain keyword search returns original tweets and retweets; the extra operator
    # asks for the originals only.
    query = keyword + ' -filter:retweets'

    # Uses the search endpoint rather than a user timeline
    search = tweepy.Cursor(api.search_tweets, q=query,
                           lang="en", tweet_mode='extended',
                           result_type='mixed')
    return list(search.items(number_of_tweets))


# Search for up to 2400 tweets that mention the article URL
tiktok_newstweets = keyword_tweetsandreplies(
    extractor,
    "https://www.businessinsider.com/tiktok-confirms-us-user-data-accessed-in-china-bytedance-2022-7",
    2400,
)

# Build the DataFrame in one go: one row per tweet, one column per extracted attribute
data = pd.DataFrame({
    'Tweets': [tweet.full_text for tweet in tiktok_newstweets],
    'Tweets_ID': [tweet.id for tweet in tiktok_newstweets],
    'Date': [tweet.created_at for tweet in tiktok_newstweets],
    'Source': [tweet.source for tweet in tiktok_newstweets],
    'Likes_no': [tweet.favorite_count for tweet in tiktok_newstweets],
    'Retweets_no': [tweet.retweet_count for tweet in tiktok_newstweets],
})

# Keep the first occurrence of each tweet text, then persist the result
data = data.drop_duplicates(subset='Tweets')
data.to_csv("TikTokNewsTweets.csv")

# Substitutions applied, in this order, to every token. Compiled once instead of on each
# call, and written as raw strings so no invalid escape sequences are involved.
_NOISE_PATTERNS = (
    re.compile(r'http([!-~]+)?'),        # 'http' and any printable characters that follow
    re.compile(r'//t.co/[A-Za-z0-9]+'),  # remainder of a shortened link
    re.compile(r'(@[A-Za-z0-9_]+)'),     # @mentions
    re.compile(r'[0-9]'),                # digits
    re.compile(r'[^ -~]'),               # anything outside printable ASCII
)
_SPACE_RUN = re.compile(r'\s\s+')


def remove_noise(tweet_tokens, stop_words):
    """Strip links, mentions, digits and non-ASCII characters from tokens.

    Args:
        tweet_tokens: iterable of string tokens from one tweet.
        stop_words: collection of lowercase words to drop.

    Returns:
        list of cleaned, lowercased tokens that are longer than 3 characters, are not a
        substring of ``string.punctuation`` and are not stop words.
    """
    # Set membership is constant time; the stop-word list was scanned for every token.
    stop_lookup = set(stop_words)

    cleaned_tokens = []
    for token in tweet_tokens:
        for pattern in _NOISE_PATTERNS:
            token = pattern.sub('', token)
        # After the printable-ASCII filter no emoji, '\r', '\n' or other non-ASCII
        # character can remain, so the former emoji.get_emoji_regexp() pass (an API that
        # newer emoji releases no longer provide) and the extra strip steps were no-ops
        # and are dropped. Only lowercasing and collapsing runs of spaces is left.
        token = _SPACE_RUN.sub(' ', token.lower())
        if len(token) > 3 and token not in string.punctuation and token not in stop_lookup:
            cleaned_tokens.append(token)
    return cleaned_tokens


# Remove as many uninformative words as possible; which words count as noise depends on
# the problem and its context.
stop_words = stopwords.words('english')
tweets_token = data['Tweets'].apply(word_tokenize).tolist()

# Clean every tokenised tweet; the result is one list of cleaned tokens per tweet
# (no lemmatisation is applied).
cleaned_tokens = [remove_noise(tokens, stop_words) for tokens in tweets_token]

# Re-join the tokens of each tweet into a single space-separated string
tweet_list = [' '.join(str(token) for token in tokens) for tokens in cleaned_tokens]
data['CleanTweet'] = tweet_list

# Join the tweets with a space: without a separator the last word of one tweet is glued to
# the first word of the next.
twstring = ' '.join(str(item) for item in data['CleanTweet'])

# Drop words that dominate this particular search. \b removes only the word itself; the
# previous (?:^|\W)word(?:$|\W) form also swallowed the surrounding spaces, which fused the
# neighbouring words and made it skip a keyword directly following another one.
twstring = re.sub(r'\b(?:data|user|cleaned|confirms|confirm)\b', ' ', twstring)

tweets_wordcloud = WordCloud(width=400, height=400,
                             background_color='white',
                             min_font_size=10, collocations=False).generate(twstring)
plt.imshow(tweets_wordcloud)
plt.axis("off")
plt.show()

# Label each raw tweet by the sign of its TextBlob polarity score:
# > 0 -> "Positive", < 0 -> "Negative", otherwise "Neutral"
text_blob = []
for tweet in data['Tweets'].tolist():
    polarity = TextBlob(tweet).sentiment.polarity  # evaluate once per tweet
    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    text_blob.append(sentiment)

data["Sentiment"] = text_blob

# Per-sentiment summaries. As bare mid-cell expressions the first one was never displayed;
# print both. numeric_only=True keeps mean() from raising on the text columns with
# pandas >= 2.0.
by_sentiment = data.groupby(by='Sentiment')
print(by_sentiment.mean(numeric_only=True))
print(by_sentiment['Sentiment'].count())