# set up

In [1]:
# dataframe
import pandas as pd
import numpy as np
import datetime
import re
import json
from striprtf.striprtf import rtf_to_text

# display
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',None)

# to plot
import matplotlib.pyplot as plt 

# detect language - not really effective
# from langdetect import detect, LangDetectException, detect_langs
# import langid

# NLP
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

# tiktok users by country

In [2]:
tiktok_users = pd.read_csv('tiktok-users-by-country-2024.csv')
tiktok_users.head()

Unnamed: 0,country,TikTokUsers_Users_2024,TikTokUsersCountryTotal2023
0,United States,148000000,113300000.0
1,Indonesia,126800000,109900000.0
2,Pakistan,59630000,16510000.0
3,Nigeria,27390000,
4,Brazil,101800000,82210000.0


In [3]:
tiktok_users = tiktok_users[tiktok_users['country'] != 'Total']
tiktok_users['TikTokUsers_Users_2024'].max()

148000000

In [4]:
def categorize_users(row):
    ranges = [
        (1000, '<1K users'),
        (10_000, '1K-10K users'),
        (100_000, '10K-100K users'),
        (1_000_000, '100K-1M users'),
        (10_000_000, '1M-10M users'),
        (100_000_000, '10M-100M users'),
        (150_000_000, '100M-150M users')
    ]
    
    for limit, label in ranges:
        if row < limit:
            return label
    return '>150.000.000 users'

In [5]:
tiktok_users['range'] = tiktok_users['TikTokUsers_Users_2024'].apply(categorize_users)
tiktok_users.head()

Unnamed: 0,country,TikTokUsers_Users_2024,TikTokUsersCountryTotal2023,range
0,United States,148000000,113300000.0,100M-150M users
1,Indonesia,126800000,109900000.0,100M-150M users
2,Pakistan,59630000,16510000.0,10M-100M users
3,Nigeria,27390000,,10M-100M users
4,Brazil,101800000,82210000.0,100M-150M users


In [6]:
tiktok_users.to_csv('users_countries_2024.csv',index=False)

# clean comment 

## create general dataset

In [None]:
# Original dataset
# review = pd.read_csv('review_2024_en_tiktok.csv')
# select wanted columns
# review = review[['reviewId','content','score','thumbsUpCount','at']]
# review = review[review['content'].notna()]

### remove numbers and symbols

In [8]:
# Create dataset for analyze further
# def clean_text(row):
#     row = re.sub(r'\d+', '', row)  # Remove numbers
#     row = re.sub(r'\W+', ' ', row)  # Remove special characters
#     row = row.lower()  # Convert to lowercase
#     return row

# review['content'] = review['content'].apply(clean_text)
# review.to_csv('review_en.csv', index = False)

### language detect

In [9]:
# this step is proceed on google sheet
# import result data 
comment = pd.read_csv("detect language - Sheet1.csv")

# remove float value
comment = comment[comment['content'].apply(lambda x: not isinstance(x, float))]
# only using comment in English
comment = comment[comment['language'] == 'en']

comment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57222 entries, 3 to 72348
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       57222 non-null  object
 1   content        57222 non-null  object
 2   score          57222 non-null  int64 
 3   thumbsUpCount  57222 non-null  int64 
 4   at             57222 non-null  object
 5   language       57222 non-null  object
dtypes: int64(2), object(4)
memory usage: 3.1+ MB


### tokenized

In [10]:
comment['tokenized_text'] = comment['content'].apply(word_tokenize)
comment['word_count'] = comment['tokenized_text'].apply(len)

### remove stop word

In [11]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]
comment['main_word'] = comment['tokenized_text'].apply(remove_stop_words)

### lemmatizer

In [12]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the function to lemmatize words
def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

# Apply the lemmatization function to the 'filtered_words' column
comment['lemmatized_words'] = comment['main_word'].apply(lemmatize_words)

### sentiment

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Define a function to compute sentiment scores
def get_sentiment_scores(text):
    return sia.polarity_scores(text)

# Apply the sentiment analysis function to the 'text' column
comment['sentiment_calculation'] = comment['content'].apply(get_sentiment_scores)

# If you want to expand the sentiment scores into separate columns:
comment_sentiment = comment['sentiment_calculation'].apply(pd.Series)

# Combine the original DataFrame with the sentiment scores
comment = pd.concat([comment, comment_sentiment], axis=1)

In [14]:
comment.head()

Unnamed: 0,reviewId,content,score,thumbsUpCount,at,language,tokenized_text,word_count,main_word,lemmatized_words,sentiment_calculation,neg,neu,pos,compound
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,5,0,2024-01-15 16:31:21,en,"[a, app, tiktok, is, funny, but, thre, are, bad]",9,"[app, tiktok, funny, thre, bad]","[app, tiktok, funny, thre, bad]","{'neg': 0.374, 'neu': 0.472, 'pos': 0.154, 'compound': -0.5859}",0.374,0.472,0.154,-0.5859
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,5,0,2024-03-02 10:01:36,en,"[a, appreciate, the, tiktok, because, she, make, me, happy]",9,"[appreciate, tiktok, make, happy]","[appreciate, tiktok, make, happy]","{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.7506}",0.0,0.484,0.516,0.7506
8,c30f9c22-6811-401c-a407-66a81ce052fc,a bad medsos they only think about money n a bias media,1,0,2024-02-18 12:42:37,en,"[a, bad, medsos, they, only, think, about, money, n, a, bias, media]",12,"[bad, medsos, think, money, n, bias, media]","[bad, medsos, think, money, n, bias, medium]","{'neg': 0.412, 'neu': 0.588, 'pos': 0.0, 'compound': -0.5994}",0.412,0.588,0.0,-0.5994
10,e5e4ba8e-ff8e-499f-8f00-8a66d22168aa,a bad racist and biased application towards foreign parties which i do not recommend,1,2,2024-06-15 10:52:19,en,"[a, bad, racist, and, biased, application, towards, foreign, parties, which, i, do, not, recommend]",14,"[bad, racist, biased, application, towards, foreign, parties, recommend]","[bad, racist, biased, application, towards, foreign, party, recommend]","{'neg': 0.547, 'neu': 0.327, 'pos': 0.126, 'compound': -0.8406}",0.547,0.327,0.126,-0.8406
16,a35f663e-2089-48fd-acf6-371a7fd74478,a bit disappointed because when you installed again the draft will disappear,2,1,2024-02-24 21:53:27,en,"[a, bit, disappointed, because, when, you, installed, again, the, draft, will, disappear]",12,"[bit, disappointed, installed, draft, disappear]","[bit, disappointed, installed, draft, disappear]","{'neg': 0.357, 'neu': 0.643, 'pos': 0.0, 'compound': -0.6124}",0.357,0.643,0.0,-0.6124


In [15]:
## is this ok?
comment['sentiment_overal'] = (comment['score'] + comment['compound'])/2

In [16]:
def sentiment_catagory(row):
    if row <= 1:
        return 'negative'
    elif row >=2 : 
        return 'positive'
    else:
        return 'neutral'
    
comment['sentiment'] = comment['sentiment_overal'].apply(sentiment_catagory)

comment['sentiment'].value_counts()

sentiment
positive    38811
negative    12843
neutral      5568
Name: count, dtype: int64

In [17]:
comment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57222 entries, 3 to 72348
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   reviewId               57222 non-null  object 
 1   content                57222 non-null  object 
 2   score                  57222 non-null  int64  
 3   thumbsUpCount          57222 non-null  int64  
 4   at                     57222 non-null  object 
 5   language               57222 non-null  object 
 6   tokenized_text         57222 non-null  object 
 7   word_count             57222 non-null  int64  
 8   main_word              57222 non-null  object 
 9   lemmatized_words       57222 non-null  object 
 10  sentiment_calculation  57222 non-null  object 
 11  neg                    57222 non-null  float64
 12  neu                    57222 non-null  float64
 13  pos                    57222 non-null  float64
 14  compound               57222 non-null  float64
 15  sentime

In [18]:
comment.head()

Unnamed: 0,reviewId,content,score,thumbsUpCount,at,language,tokenized_text,word_count,main_word,lemmatized_words,sentiment_calculation,neg,neu,pos,compound,sentiment_overal,sentiment
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,5,0,2024-01-15 16:31:21,en,"[a, app, tiktok, is, funny, but, thre, are, bad]",9,"[app, tiktok, funny, thre, bad]","[app, tiktok, funny, thre, bad]","{'neg': 0.374, 'neu': 0.472, 'pos': 0.154, 'compound': -0.5859}",0.374,0.472,0.154,-0.5859,2.20705,positive
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,5,0,2024-03-02 10:01:36,en,"[a, appreciate, the, tiktok, because, she, make, me, happy]",9,"[appreciate, tiktok, make, happy]","[appreciate, tiktok, make, happy]","{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.7506}",0.0,0.484,0.516,0.7506,2.8753,positive
8,c30f9c22-6811-401c-a407-66a81ce052fc,a bad medsos they only think about money n a bias media,1,0,2024-02-18 12:42:37,en,"[a, bad, medsos, they, only, think, about, money, n, a, bias, media]",12,"[bad, medsos, think, money, n, bias, media]","[bad, medsos, think, money, n, bias, medium]","{'neg': 0.412, 'neu': 0.588, 'pos': 0.0, 'compound': -0.5994}",0.412,0.588,0.0,-0.5994,0.2003,negative
10,e5e4ba8e-ff8e-499f-8f00-8a66d22168aa,a bad racist and biased application towards foreign parties which i do not recommend,1,2,2024-06-15 10:52:19,en,"[a, bad, racist, and, biased, application, towards, foreign, parties, which, i, do, not, recommend]",14,"[bad, racist, biased, application, towards, foreign, parties, recommend]","[bad, racist, biased, application, towards, foreign, party, recommend]","{'neg': 0.547, 'neu': 0.327, 'pos': 0.126, 'compound': -0.8406}",0.547,0.327,0.126,-0.8406,0.0797,negative
16,a35f663e-2089-48fd-acf6-371a7fd74478,a bit disappointed because when you installed again the draft will disappear,2,1,2024-02-24 21:53:27,en,"[a, bit, disappointed, because, when, you, installed, again, the, draft, will, disappear]",12,"[bit, disappointed, installed, draft, disappear]","[bit, disappointed, installed, draft, disappear]","{'neg': 0.357, 'neu': 0.643, 'pos': 0.0, 'compound': -0.6124}",0.357,0.643,0.0,-0.6124,0.6938,negative


### time cleaning

In [19]:
comment['comment_time'] = pd.to_datetime(comment['at'])
comment['comment_date'] = pd.to_datetime(comment['at']).dt.date
comment = comment[comment['comment_date'] <= datetime.date(2024, 7, 31)]
comment['comment_month'] = pd.to_datetime(comment['at']).dt.strftime('%b')
comment['comment_week'] = pd.to_datetime(comment['at']).dt.isocalendar().week
comment['comment_day'] = pd.to_datetime(comment['at']).dt.day
comment['comment_weekday'] = pd.to_datetime(comment['at']).dt.day_name()
comment['comment_time'] = pd.to_datetime(comment['at']).dt.time

In [20]:
def categorize_time_of_day(time):
    return f"{time.hour:02d}:00"

comment['time_of_day'] = comment['comment_time'].apply(categorize_time_of_day)

In [21]:
comment.head()

Unnamed: 0,reviewId,content,score,thumbsUpCount,at,language,tokenized_text,word_count,main_word,lemmatized_words,sentiment_calculation,neg,neu,pos,compound,sentiment_overal,sentiment,comment_time,comment_date,comment_month,comment_week,comment_day,comment_weekday,time_of_day
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,5,0,2024-01-15 16:31:21,en,"[a, app, tiktok, is, funny, but, thre, are, bad]",9,"[app, tiktok, funny, thre, bad]","[app, tiktok, funny, thre, bad]","{'neg': 0.374, 'neu': 0.472, 'pos': 0.154, 'compound': -0.5859}",0.374,0.472,0.154,-0.5859,2.20705,positive,16:31:21,2024-01-15,Jan,3,15,Monday,16:00
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,5,0,2024-03-02 10:01:36,en,"[a, appreciate, the, tiktok, because, she, make, me, happy]",9,"[appreciate, tiktok, make, happy]","[appreciate, tiktok, make, happy]","{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.7506}",0.0,0.484,0.516,0.7506,2.8753,positive,10:01:36,2024-03-02,Mar,9,2,Saturday,10:00
8,c30f9c22-6811-401c-a407-66a81ce052fc,a bad medsos they only think about money n a bias media,1,0,2024-02-18 12:42:37,en,"[a, bad, medsos, they, only, think, about, money, n, a, bias, media]",12,"[bad, medsos, think, money, n, bias, media]","[bad, medsos, think, money, n, bias, medium]","{'neg': 0.412, 'neu': 0.588, 'pos': 0.0, 'compound': -0.5994}",0.412,0.588,0.0,-0.5994,0.2003,negative,12:42:37,2024-02-18,Feb,7,18,Sunday,12:00
10,e5e4ba8e-ff8e-499f-8f00-8a66d22168aa,a bad racist and biased application towards foreign parties which i do not recommend,1,2,2024-06-15 10:52:19,en,"[a, bad, racist, and, biased, application, towards, foreign, parties, which, i, do, not, recommend]",14,"[bad, racist, biased, application, towards, foreign, parties, recommend]","[bad, racist, biased, application, towards, foreign, party, recommend]","{'neg': 0.547, 'neu': 0.327, 'pos': 0.126, 'compound': -0.8406}",0.547,0.327,0.126,-0.8406,0.0797,negative,10:52:19,2024-06-15,Jun,24,15,Saturday,10:00
16,a35f663e-2089-48fd-acf6-371a7fd74478,a bit disappointed because when you installed again the draft will disappear,2,1,2024-02-24 21:53:27,en,"[a, bit, disappointed, because, when, you, installed, again, the, draft, will, disappear]",12,"[bit, disappointed, installed, draft, disappear]","[bit, disappointed, installed, draft, disappear]","{'neg': 0.357, 'neu': 0.643, 'pos': 0.0, 'compound': -0.6124}",0.357,0.643,0.0,-0.6124,0.6938,negative,21:53:27,2024-02-24,Feb,8,24,Saturday,21:00


## dataset

In [22]:
comment = comment[['reviewId','content','word_count','score','thumbsUpCount','lemmatized_words','sentiment_overal','sentiment',
                   'comment_time','comment_date','comment_month','comment_week','comment_day','comment_weekday','time_of_day']]

In [23]:
comment.columns

Index(['reviewId', 'content', 'word_count', 'score', 'thumbsUpCount',
       'lemmatized_words', 'sentiment_overal', 'sentiment', 'comment_time',
       'comment_date', 'comment_month', 'comment_week', 'comment_day',
       'comment_weekday', 'time_of_day'],
      dtype='object')

In [24]:
comment.to_csv('comment.csv',index = False)

## word countribute

### create dataset

In [25]:
all_words = [word for comment in comment['lemmatized_words'] for word in comment]
freq_dist = FreqDist(all_words)

# Display the most common words
print(freq_dist.most_common(10))

# Create a frequency distribution
freq_dist = FreqDist(all_words)

# Convert frequency distribution to DataFrame
all_word_contribute = pd.DataFrame(freq_dist.items(), columns=['Word', 'Frequency'])

# Sort the DataFrame by frequency in descending order
all_word_contribute = all_word_contribute.sort_values(by='Word', ascending=False)

[('app', 13455), ('good', 12583), ('tiktok', 10922), ('nice', 5330), ('like', 5065), ('video', 4856), ('love', 3713), ('account', 3128), ('please', 2939), ('open', 2278)]


In [26]:
def clean_exaggerated_words(word):
    # Replace any character that repeats 2 or more times with just that character
    return re.sub(r'(.)\1{2,}', r'\1', word)

all_word_contribute['Word_non_repeated_letter'] = all_word_contribute['Word'].apply(clean_exaggerated_words)

In [27]:
# cleanning stop words 
stop_words = set(stopwords.words('english'))
all_word_contribute['is_stopword'] = all_word_contribute['Word_non_repeated_letter'].isin(stop_words)
all_word_contribute = all_word_contribute[all_word_contribute['is_stopword'] == False]
all_word_contribute.sort_values('Word', ascending=True, inplace=True)

In [28]:
# only take the word len from 2 letters to 19 letters
all_word_contribute = all_word_contribute[all_word_contribute['Word_non_repeated_letter'].str.len().between(2, 21)]

In [29]:
all_word_contribute.to_csv('all_word_contribute.csv', index=False)

### word contribute dataset

In [30]:
# these file clean on google sheet 
gather_word = pd.read_csv('all_word_countribute - Sheet2.csv')

In [31]:
gather_word.head()

Unnamed: 0,Word,Frequency,Word_non_repeated_letter,is_stopword,Gather_word,Category
0,FALSE,1,FALSE,False,Not for analytics,Not for analytics
1,aa,7,aa,False,Not for analytics,Not for analytics
2,aahhhhhh,1,aah,False,Not for analytics,Not for analytics
3,aalaoal,1,aalaoal,False,Not for analytics,Not for analytics
4,aalis,1,aalis,False,Not for analytics,Not for analytics


In [32]:
# Grouping by 'gather_word' and converting groups to lists
grouped_words = gather_word[gather_word['Gather_word'] != 'Not for analytics'].groupby('Gather_word')['Word_non_repeated_letter'].apply(list).to_dict()
# Printing the result
# for key, value in grouped_words.items():
#     print(f"{key} = {value}")


In [33]:
# Function to create a list of dictionary keys that match words in lemmatized_words
def analyze_words(words, group_dict):
    matches = []
    for key, word_list in group_dict.items():
        if any(word in word_list for word in words):
            matches.append(key)
    return matches

# Apply the function to create the 'word_analyze' column
comment['word_analyze'] = comment['lemmatized_words'].apply(lambda x: analyze_words(x, grouped_words))


In [34]:
comment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54810 entries, 3 to 72348
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reviewId          54810 non-null  object 
 1   content           54810 non-null  object 
 2   word_count        54810 non-null  int64  
 3   score             54810 non-null  int64  
 4   thumbsUpCount     54810 non-null  int64  
 5   lemmatized_words  54810 non-null  object 
 6   sentiment_overal  54810 non-null  float64
 7   sentiment         54810 non-null  object 
 8   comment_time      54810 non-null  object 
 9   comment_date      54810 non-null  object 
 10  comment_month     54810 non-null  object 
 11  comment_week      54810 non-null  UInt32 
 12  comment_day       54810 non-null  int32  
 13  comment_weekday   54810 non-null  object 
 14  time_of_day       54810 non-null  object 
 15  word_analyze      54810 non-null  object 
dtypes: UInt32(1), float64(1), int32(1), int64(3),

In [35]:
comment.head()

Unnamed: 0,reviewId,content,word_count,score,thumbsUpCount,lemmatized_words,sentiment_overal,sentiment,comment_time,comment_date,comment_month,comment_week,comment_day,comment_weekday,time_of_day,word_analyze
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,9,5,0,"[app, tiktok, funny, thre, bad]",2.20705,positive,16:31:21,2024-01-15,Jan,3,15,Monday,16:00,"[bad, fun]"
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,9,5,0,"[appreciate, tiktok, make, happy]",2.8753,positive,10:01:36,2024-03-02,Mar,9,2,Saturday,10:00,"[appreciate, happy]"
8,c30f9c22-6811-401c-a407-66a81ce052fc,a bad medsos they only think about money n a bias media,12,1,0,"[bad, medsos, think, money, n, bias, medium]",0.2003,negative,12:42:37,2024-02-18,Feb,7,18,Sunday,12:00,"[bad, bias, money]"
10,e5e4ba8e-ff8e-499f-8f00-8a66d22168aa,a bad racist and biased application towards foreign parties which i do not recommend,14,1,2,"[bad, racist, biased, application, towards, foreign, party, recommend]",0.0797,negative,10:52:19,2024-06-15,Jun,24,15,Saturday,10:00,"[bad, bias, racism, recommend]"
16,a35f663e-2089-48fd-acf6-371a7fd74478,a bit disappointed because when you installed again the draft will disappear,12,2,1,"[bit, disappointed, installed, draft, disappear]",0.6938,negative,21:53:27,2024-02-24,Feb,8,24,Saturday,21:00,"[disappear, disappoint, draft, installation]"


In [36]:
comment_breakdown_word = comment[['reviewId','content','word_analyze','comment_date','score','sentiment']]

In [37]:
gather_word.head()

Unnamed: 0,Word,Frequency,Word_non_repeated_letter,is_stopword,Gather_word,Category
0,FALSE,1,FALSE,False,Not for analytics,Not for analytics
1,aa,7,aa,False,Not for analytics,Not for analytics
2,aahhhhhh,1,aah,False,Not for analytics,Not for analytics
3,aalaoal,1,aalaoal,False,Not for analytics,Not for analytics
4,aalis,1,aalis,False,Not for analytics,Not for analytics


In [38]:
comment_breakdown_word = comment_breakdown_word.explode('word_analyze').dropna(subset=['word_analyze'])

In [39]:
# create Category dictionary
word_to_category = pd.Series(gather_word['Category'].values, index = gather_word['Gather_word']).to_dict()

# map to dataset
comment_breakdown_word['Category'] = comment_breakdown_word['word_analyze'].map(word_to_category)

In [40]:
comment_breakdown_word.to_csv('comment_breakdown_word.csv',index=False)

In [41]:
comment_breakdown_word.head()

Unnamed: 0,reviewId,content,word_analyze,comment_date,score,sentiment,Category
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,bad,2024-01-15,5,positive,feeling
3,caedee84-09b4-476e-811c-2ff015557f58,a app tiktok is funny but thre are bad,fun,2024-01-15,5,positive,feeling
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,appreciate,2024-03-02,5,positive,feeling
4,b3737e33-4107-4d20-977a-3e796f958160,a appreciate the tiktok because she make me happy,happy,2024-03-02,5,positive,feeling
8,c30f9c22-6811-401c-a407-66a81ce052fc,a bad medsos they only think about money n a bias media,bad,2024-02-18,1,negative,feeling


In [42]:
# word frequency
word_breakdown_frequency = comment_breakdown_word.groupby('word_analyze').size().reset_index(name='word_frequency')
word_breakdown_frequency.head()

Unnamed: 0,word_analyze,word_frequency
0,account,2433
1,action,69
2,addict,244
3,addition,557
4,advertise,243


In [43]:
# word dimention
word_breakdown = comment_breakdown_word.groupby(['word_analyze','comment_date','sentiment']).size().reset_index(name='word_frequency')
word_breakdown['months'] = pd.to_datetime(word_breakdown['comment_date']).dt.month
word_breakdown.head()
# word_breakdown.to_csv('word_breakdown.csv',index=False)

Unnamed: 0,word_analyze,comment_date,sentiment,word_frequency,months
0,account,2024-01-01,negative,6,1
1,account,2024-01-01,neutral,2,1
2,account,2024-01-01,positive,5,1
3,account,2024-01-02,negative,4,1
4,account,2024-01-02,neutral,1,1


# create word_stat
 word, frequency, month that mention, negative, positive, month 1 to 4, month from 5 to 7

In [45]:
word_frequency = word_breakdown.groupby(['word_analyze']).agg({'word_frequency' : 'sum', 'months' : 'nunique'}).reset_index()
word_frequency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   word_analyze    343 non-null    object
 1   word_frequency  343 non-null    int64 
 2   months          343 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 8.2+ KB


In [46]:
word_breakdown_sentiment = word_breakdown[['word_analyze', 'sentiment','word_frequency']].groupby(['word_analyze', 'sentiment']).agg({'word_frequency':'sum'})#.reset_index()
word_sentiment = word_breakdown_sentiment.pivot_table(index= 'word_analyze', columns='sentiment', values='word_frequency', aggfunc='sum').reset_index()
word_sentiment.info()
# word_breakdown_sentiment.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   word_analyze  343 non-null    object 
 1   negative      342 non-null    float64
 2   neutral       342 non-null    float64
 3   positive      342 non-null    float64
dtypes: float64(3), object(1)
memory usage: 10.8+ KB


In [47]:
word_14 = word_breakdown[word_breakdown['months'] <=4][['word_analyze','word_frequency']]
word_14 = word_14.groupby(['word_analyze']).agg('sum').reset_index()
word_14 = word_14.rename(columns={'word_frequency': 'word_frequency_14'})

word_57 = word_breakdown[word_breakdown['months'] >4][['word_analyze','word_frequency']]
word_57 = word_57.groupby(['word_analyze']).agg('sum').reset_index()
word_57 = word_57.rename(columns={'word_frequency': 'word_frequency_57'})

In [48]:
dataframes = [word_frequency, word_sentiment, word_14,word_57]
word_stat = dataframes[0]

for df in dataframes[1:]:
    word_stat = pd.merge(word_stat, df, on='word_analyze', how='outer')

word_stat['word_frequency_14'].fillna(1,inplace=True)

In [49]:
word_stat['trend'] = ((word_stat['word_frequency_57'] - word_stat['word_frequency_14'])/word_stat['word_frequency_14'])

In [50]:
# create Category dictionary
# word_to_category = pd.Series(gather_word['Category'].values, index = gather_word['Gather_word']).to_dict()

# map to dataset
word_stat['category'] = word_stat['word_analyze'].map(word_to_category)

In [51]:
word_stat = word_stat[word_stat['word_frequency'] > 50]

In [52]:
word_stat.head()

Unnamed: 0,word_analyze,word_frequency,months,negative,neutral,positive,word_frequency_14,word_frequency_57,trend,category
0,account,2433,7,1415.0,390.0,628.0,1179.0,1254,0.063613,feature
1,action,69,7,52.0,8.0,9.0,29.0,40,0.37931,request
2,addict,244,7,55.0,48.0,141.0,139.0,105,-0.244604,content
3,addition,557,7,164.0,131.0,262.0,295.0,262,-0.111864,request
4,advertise,243,7,156.0,39.0,48.0,118.0,125,0.059322,feature


In [53]:
word_stat.to_csv('word_stat.csv',index = False)

### word relation

In [93]:
comment[['comment_date','word_analyze']].head()

Unnamed: 0,comment_date,word_analyze
3,2024-01-15,"[bad, fun]"
4,2024-03-02,"[appreciate, happy]"
8,2024-02-18,"[bad, bias, money]"
10,2024-06-15,"[bad, bias, racism, recommend]"
16,2024-02-24,"[disappear, disappoint, draft, installation]"


In [94]:
word_relation = pd.DataFrame(comment[['comment_date','word_analyze']],columns=['comment_date','word_analyze'])

In [95]:
word_relation.head()

Unnamed: 0,comment_date,word_analyze
3,2024-01-15,"[bad, fun]"
4,2024-03-02,"[appreciate, happy]"
8,2024-02-18,"[bad, bias, money]"
10,2024-06-15,"[bad, bias, racism, recommend]"
16,2024-02-24,"[disappear, disappoint, draft, installation]"


In [96]:
from itertools import combinations

# Function to generate word pairs
def generate_word_pairs(row):
    return [(pair[0], pair[1], row['comment_date']) for pair in combinations(row['word_analyze'], 2)]

# Apply to each row to get pairs with dates
word_relation['word_pairs'] = word_relation.apply(generate_word_pairs, axis=1)

# Flatten the pairs into a single list
word_pairs_with_dates = [pair for sublist in word_relation['word_pairs'] for pair in sublist]

# Convert list to DataFrame for counting, including the date
pair_df = pd.DataFrame(word_pairs_with_dates, columns=['word1', 'word2', 'date'])

# Count occurrences of each pair by day
pair_counts_by_day = pair_df.groupby(['word1', 'word2', 'date']).size().reset_index(name='count')



In [98]:
pair_counts_by_day.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255459 entries, 0 to 255458
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   word1   255459 non-null  object
 1   word2   255459 non-null  object
 2   date    255459 non-null  object
 3   count   255459 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 7.8+ MB


In [99]:
pair_counts_by_day.to_csv('word_relation.csv',index=False)