# Exploratory data analysis

In [None]:
!pip install wordcloud
!pip install seaborn
!pip install re

## Import dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns 
import math 
from matplotlib.pyplot import figure
import matplotlib.ticker as tkr
from matplotlib.ticker import FuncFormatter
import re
import string
%matplotlib inline 

## Data Preparation

In [None]:
# Load the keyword dataset from CSV, then discard rows that are exact duplicates
filename = 'Data/all_keywords.csv'
df_data = pd.read_csv(filename)
df_data = df_data.drop_duplicates()

In [None]:
def normalize(str_input):
    """Convert an abbreviated count such as ``'1.2K'`` into a float.

    Args:
        str_input: Value to convert. It is passed through ``str()`` first, so
            plain numbers are accepted as well as strings.

    Returns:
        float: The parsed number. A trailing ``k``/``K`` multiplies it by
        1,000 and a trailing ``m``/``M`` by 1,000,000. Blank input yields
        ``0.0``.

    Raises:
        ValueError: If the text (minus suffix) is not a valid number.
    """
    multipliers = {'k': 1000, 'm': 1000000}

    # Drop surrounding whitespace and thousands separators ("1,234") up front;
    # float() would reject the latter.
    text = str(str_input).strip().replace(',', '')
    if not text:
        # Indexing text[-1] on an empty string would raise IndexError.
        return 0.0

    suffix = text[-1].lower()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)

In [None]:
# Preview the first rows of df_data
df_data.head()

In [None]:
# Keep only rows that have a value in the 'Text' column
df_data = df_data.dropna(subset=['Text'])

In [None]:
# Reduce every timestamp to its calendar date, stored as a datetime value at
# midnight, so rows can later be grouped per day.
df_data['Timestamp'] = pd.to_datetime(pd.to_datetime(df_data['Timestamp']).dt.date)

# Engagement counters: a missing value is treated as '0', then normalize()
# turns the count strings into floats.
for count_col in ['Comments', 'Likes', 'Retweets']:
    df_data[count_col] = df_data[count_col].fillna('0').apply(normalize)

# 1 when the row carries an image link, 0 otherwise. A missing 'Image link'
# (NaN) is treated like the empty list '[]'; comparing NaN with '[]' directly
# would flag such rows as having an image.
df_data['has_image'] = df_data['Image link'].fillna('[]').ne('[]').astype('int64')

In [None]:
# Print column dtypes and non-null counts of the prepared frame
df_data.info()

In [None]:
# Preview the first rows after the column preparation
df_data.head()

## Exploratory data analysis

In [None]:
# Per-keyword totals of retweets, comments, likes and rows with an image.
# The named groupby sum is used instead of passing the builtin `sum` to
# .agg(), which recent pandas versions flag with a FutureWarning.
metric_cols = ['Retweets', 'Comments', 'Likes', 'has_image']
df_kw = df_data.groupby('keyword')[metric_cols].sum().reset_index()


### Class (Keyword) Distribution

In [None]:
# Number of rows per keyword, most frequent first
df_data['keyword'].value_counts()

In [None]:
# Pie chart of the number of rows per keyword
df_data['keyword'].value_counts().plot.pie()

In [None]:
# Non-null counts per keyword; the 'Text' column gives the number of tweets
dd = df_data.groupby('keyword').count().reset_index()

figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="keyword", y="Text", data=dd)

# After reset_index the row label (0, 1, 2, ...) is also the bar's x position
for bar_pos, row in dd.iterrows():
    n_tweets = row['Text']
    ax.text(bar_pos, n_tweets, round(n_tweets, 2), color='black', ha="center")

### Class Distribution Based on Percentage

In [None]:
# Share of each keyword as a percentage of all rows
keyword_counts = df_data['keyword'].value_counts()
df_percent = keyword_counts / keyword_counts.sum() * 100

In [None]:
# Display the percentage share per keyword
df_percent

In [None]:
# Bar chart of the percentage share per keyword
df_percent.plot.bar()

### Number of Retweets

In [None]:
# Thousands-separator tick formatter (defined here but not applied in this cell)
comma_fmt = FuncFormatter(lambda x, p: format(int(x), ','))

figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="keyword", y="Retweets", data=df_kw)

# Annotate each bar with its total in thousands. The value is divided first
# and rounded to two decimals, matching the Comments and Likes charts
# (rounding before dividing left up to three decimals in the label).
for bar_pos, row in df_kw.iterrows():
    label = '{} K'.format(round(row['Retweets'] / 1000, 2))
    ax.text(bar_pos, row['Retweets'], label, color='black', ha="center")

# Show the y axis in thousands as well
ax.yaxis.set_major_formatter(tkr.FuncFormatter(lambda x, pos: '{:,.1f}'.format(x/1000) + 'K'))

### Number of Comments

In [None]:
figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="keyword", y="Comments", data=df_kw)

# Label every bar with its total in thousands (two decimals) plus a ' K' suffix
for bar_pos, row in df_kw.iterrows():
    total = row['Comments']
    ax.text(bar_pos, total, str(round(total/1000, 2)) + ' K', color='black', ha="center")

# Y-axis ticks are shown in thousands too
thousands_fmt = tkr.FuncFormatter(lambda x, pos: '{:,.1f}'.format(x/1000) + 'K')
ax.yaxis.set_major_formatter(thousands_fmt)

### Number of Likes

In [None]:
figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="keyword", y="Likes", data=df_kw)

# Label every bar with its total in thousands (two decimals) plus a ' K' suffix
for bar_pos, row in df_kw.iterrows():
    total = row['Likes']
    ax.text(bar_pos, total, str(round(total/1000, 2)) + ' K', color='black', ha="center")

# Y-axis ticks are shown in thousands too
thousands_fmt = tkr.FuncFormatter(lambda x, pos: '{:,.1f}'.format(x/1000) + 'K')
ax.yaxis.set_major_formatter(thousands_fmt)

### Number of tweets which have images

In [None]:
figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="keyword", y="has_image", data=df_kw)

# Write the per-keyword has_image total above each bar
for bar_pos, row in df_kw.iterrows():
    n_with_image = row['has_image']
    ax.text(bar_pos, n_with_image, round(n_with_image, 2), color='black', ha="center")

### Top Accounts

In [None]:
# How many accounts to show
top = 10

# Rows per account, most active first, as a two-column frame: user / count
df_acc = (
    df_data['UserName']
    .value_counts()
    .rename_axis('user')
    .reset_index(name='count')
)
top_accounts = df_acc[:top]

figure(figsize=(15, 6), dpi=80)
ax = sns.barplot(x="user", y="count", data=top_accounts)

# Write each account's count above its bar
for bar_pos, row in top_accounts.iterrows():
    ax.text(bar_pos, row['count'], round(row['count'], 2), color='black', ha="center")

In [None]:
# Inspect the first rows posted by one specific account (handle is hard-coded)
df_data[df_data['UserName'] == '@adi_rossoneri'].head()

### Daily Trend

In [None]:
# Rows per day: overall, and broken down by keyword
df_trend = df_data.groupby('Timestamp').size().reset_index(name='counts')
df_trend_kw = (
    df_data
    .groupby(['Timestamp', 'keyword'])
    .size()
    .reset_index(name='counts')
)

In [None]:
figure(figsize=(15, 6), dpi=80)

# seaborn only applies `markers=True` to levels of a `style` variable, which
# this plot does not have, so a point marker is requested explicitly to mark
# each day on the line.
ax = sns.lineplot(data=df_trend, x="Timestamp", y="counts", marker='o', dashes=False)

# Print the daily count next to every point
for x, y in zip(df_trend['Timestamp'], df_trend['counts']):
    plt.text(x=x, y=y, s='{:.0f}'.format(y), color='black')

### Daily Trend per Keyword

In [None]:
figure(figsize=(15, 6), dpi=80)

# One line per keyword. `style="keyword"` is what makes `markers=True` take
# effect (seaborn assigns markers per style level); `dashes=False` keeps all
# lines solid.
sns.lineplot(data=df_trend_kw, x="Timestamp", y="counts", hue="keyword", style="keyword",
             markers=True, dashes=False)


### Word Distribution with WordCloud

In [None]:
# Load the stop-word list, one entry per line. The context manager closes the
# file handle as soon as the list is built.
with open('Data/stopword.txt') as stopword_file:
    stopword = [line.rstrip('\n') for line in stopword_file]

In [None]:
def preprocessing_tweet(tweet, stopwords=None):
    """Lower-case a tweet and strip URLs, hashtags, mentions, punctuation and stop words.

    Args:
        tweet: Raw tweet text; converted with ``str()`` before processing.
        stopwords: Optional iterable of words to drop. When omitted, the
            notebook-level ``stopword`` list is used.

    Returns:
        str: The remaining words joined by single spaces.
    """
    if stopwords is None:
        stopwords = stopword
    # A set makes the per-word membership test cheap
    stopwords = set(stopwords)

    text = str(tweet).lower()

    # remove link/url: anchored on the scheme so that ordinary text such as
    # "harga: 5000" is left alone
    text = re.sub(r"https?://\S+", " ", text)

    # remove rt/cc/via markers together with the mention they introduce; this
    # has to run before the generic mention removal below
    text = re.sub(r"\b(?:rt|cc|via)\b:?\s*@\S+", " ", text)

    # remove hashtag (including one-character tags)
    text = re.sub(r"#\S+", " ", text)

    # remove mention @
    text = re.sub(r"@\S+", " ", text)

    # remove punctuation everywhere in the text, not only at both ends; each
    # mark becomes a space so neighbouring words are not glued together
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)

    # remove stopword
    return ' '.join(word for word in text.split() if word not in stopwords)

In [None]:
# Store the preprocessed version of every tweet text in a new 'clean' column
df_data['clean'] = df_data['Text'].apply(preprocessing_tweet)

In [None]:
# Concatenate all cleaned tweets into one space-separated corpus string
clean_tweets = df_data['clean'].tolist()
text = ' '.join(clean_tweets)

In [None]:
# Show only the beginning of the corpus; displaying the whole string would
# flood the notebook output.
text[:1000]

In [None]:
# Word cloud over the whole corpus: white background, at most 100 words,
# font size capped at 100
wordcloud_tweet = WordCloud(
    background_color="white",
    max_words=100,
    max_font_size=100,
).generate(text)

In [None]:
# Display the WordCloud object itself (the image is drawn in the next cell)
wordcloud_tweet

In [None]:
# Draw the overall word cloud on a fresh figure, with the axes hidden
fig_cloud, ax_cloud = plt.subplots()
ax_cloud.imshow(wordcloud_tweet, interpolation="bilinear")
ax_cloud.axis("off")
plt.show()

In [None]:
def plot_keyword_wordcloud(keyword):
    """Draw a word cloud of the cleaned tweets belonging to one keyword.

    Args:
        keyword: Value of the ``keyword`` column used to select rows.

    Returns:
        The generated ``WordCloud``, or ``None`` when the keyword has no
        cleaned text to plot.
    """
    # Join the cleaned tweets the same way the overall corpus is built;
    # Series.to_string() would also embed index labels and column padding.
    keyword_text = ' '.join(df_data.loc[df_data['keyword'] == keyword, 'clean'])
    if not keyword_text.strip():
        # WordCloud.generate() raises when it is given no words at all
        print('No text available for keyword: {}'.format(keyword))
        return None

    cloud = WordCloud().generate(keyword_text)
    figure(figsize=(15, 6), dpi=80)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    return cloud


# One word cloud per keyword, in the same order as before. `wordcloud` keeps
# pointing at the most recently generated cloud.
for kw in ['kolak', 'gorengan', 'kurma', 'es dawet', 'sop buah']:
    wordcloud = plot_keyword_wordcloud(kw)