In [None]:
import os
import re
import path
import pandas as pd
import matplotlib.pyplot as plt
import json

from wordcloud import WordCloud
from nltk.corpus import stopwords

In [None]:
# Notebook-wide matplotlib defaults: figure size, font sizes and line width.
plt.rcParams.update({
    'figure.figsize': (3.33, 5.5),
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'legend.fontsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'lines.linewidth': 3,
})

In [None]:
# Scan every file in the raw-data directory and accumulate two counters:
#   * n_small_corpus - files whose text length is <= PERSPECTIVE_CHAR_LIMIT
#   * n_curse_words  - total number of "[ __ ]" pattern matches
#     (presumably the placeholder a censored word is transcribed as - confirm)
pwd = "../data/00_raw"
filenames = os.listdir(pwd)

PERSPECTIVE_CHAR_LIMIT = 3000

# Raw string: "\[" and "\]" are invalid escape sequences in a regular string
# literal and trigger a SyntaxWarning on recent Python versions. The regular
# expression itself is unchanged. Compiled once instead of once per file.
CENSORED_WORD_PATTERN = re.compile(r"\[ __ \]+")

n_small_corpus = 0
n_curse_words = 0

for file in filenames:
    with open(os.path.join(pwd, file)) as f:
        content = f.read()

    if len(content) <= PERSPECTIVE_CHAR_LIMIT:
        n_small_corpus += 1

    n_curse_words += len(CENSORED_WORD_PATTERN.findall(content))

In [None]:
# Report the corpus-level counters gathered from the raw-data scan.
n_files = len(filenames)

# An empty directory would otherwise raise ZeroDivisionError; report zeros instead.
pct_small_corpus = n_small_corpus / n_files * 100 if n_files else 0.0
avg_curse_words = n_curse_words / n_files if n_files else 0.0

print("Basic Statistics")
print("----------------")
# The threshold is taken from PERSPECTIVE_CHAR_LIMIT (compared with <=) so the
# message cannot drift from the value that was actually used for counting.
print(f"  - Percentage of data with at most {PERSPECTIVE_CHAR_LIMIT} characters is {pct_small_corpus:0.2f}%")
print(f"  - Average number of curse words is {avg_curse_words:0.2f} per file")

---

In [None]:
# Load the download manifest and report how many distinct movies it lists.
df_download_info = pd.read_csv("../data/download_descriptions.csv")

print("Basic Statistics", "----------------", sep="\n")
print(f"  - There are {df_download_info.movie.nunique()} different movies")

---

In [None]:
# Count the entries in the scored (curse words removed) directory;
# each entry is reported as one review video.
pwd = "../data/03_scored/without_curse_words"
filenames = os.listdir(pwd)

n_review_videos = len(filenames)
print(f"  - There are {n_review_videos} different review videos")

---

In [None]:
# Collect the character count of every preprocessed transcript.
pwd = "../data/01_preprocessed/without_curse_words"
filenames = os.listdir(pwd)

corpus_sizes = []
for name in filenames:
    with open(os.path.join(pwd, name)) as handle:
        corpus_sizes.append(len(handle.read()))

In [None]:
# Histogram of the values in corpus_sizes, split into 40 bins.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(corpus_sizes, bins=40)
plt.show()

In [None]:
# Box plot of the values in corpus_sizes.
fig, ax = plt.subplots(figsize=(9, 7))
ax.boxplot(corpus_sizes)
plt.show()

---

In [None]:
# Bar chart of each group's share of the rows in the download manifest.
df = pd.read_csv("../data/download_descriptions.csv")

# Non-null "movie" entries per group; divided below by the total row count.
data = df.groupby("group")["movie"].count()

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(data.index, data.values / len(df))
ax.set_title("Group Proportion")
plt.show()

---

In [None]:
# For every group in the download manifest, concatenate the scored rows of
# all of its videos, keep the 100 highest-scored rows and join their text
# into a single string stored in group_to_content[group].
group_to_content = {}

pwd = "../data/03_scored/without_curse_words"
df = pd.read_csv("../data/download_descriptions.csv")
for group in df.group.unique():
    dfs = []
    n_skipped = 0
    for url in df.loc[df.group == group, "url"]:
        # The video id is the text after "v=" up to the next "&".
        video_id = url.split("v=")[1].split("&")[0]

        try:
            dfs.append(pd.read_csv(os.path.join(pwd, video_id)))
        except (OSError, ValueError):
            # Still best-effort, but no longer a bare `except`: OSError covers
            # missing/unreadable files, ValueError covers pandas' empty-file,
            # parser and decoding errors. KeyboardInterrupt and genuine
            # programming errors now propagate instead of being swallowed.
            n_skipped += 1

    if n_skipped:
        print(f"  - {group}: skipped {n_skipped} unreadable score file(s)")

    if not dfs:
        # pd.concat raises ValueError on an empty list; a group without any
        # readable score file is left out of group_to_content instead.
        continue

    df_group = pd.concat(dfs)

    df_group = df_group.sort_values(by="score", ascending=False)[:100]
    group_to_content[group] = ' '.join(df_group.text.values)

In [None]:
# Draw one word cloud per group on a two-column grid and save the figure as
# wordclouds.pdf inside output_path.
output_path = "../data/imgs"
# savefig does not create missing directories.
os.makedirs(output_path, exist_ok=True)

# Loop-invariant: build the stopword collection once instead of per group.
english_stopwords = set(stopwords.words("english"))

# Size the grid from the number of groups rather than assuming exactly four
# (four groups still give the original 2x2 grid with figsize (10, 6)).
n_cols = 2
n_rows = max(1, (len(group_to_content) + n_cols - 1) // n_cols)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(10, 3 * n_rows), squeeze=False)

# axs.flat walks the grid row by row, matching the original [i//2, i%2] order.
for ax, (group, content) in zip(axs.flat, group_to_content.items()):
    wordcloud = WordCloud(
        stopwords=english_stopwords,
        background_color="white",
        width=1600, height=800
    ).generate(content)

    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f"{group}")

# Hide ticks and frames on every cell, including any left without a word cloud.
for ax in axs.flat:
    ax.set_axis_off()

plt.tight_layout()
plt.savefig(os.path.join(output_path, "wordclouds.pdf"))
plt.show()

---

In [None]:
# For every group in the download manifest, concatenate the scored rows
# (curse words kept) of all of its videos and join their text into a single
# string stored in group_to_content[group].
group_to_content = {}

# Context manager so the JSON file handle is closed deterministically.
with open("../data/bad_words_scored.json") as f:
    curse_words = json.load(f)
# Keep only the words whose value in the JSON file is above 0.2.
# NOTE(review): `curse_words` is reassigned from bad_words.txt in the cell that
# counts curse words, before this list is read anywhere - confirm which
# source is the intended one.
curse_words = [key for key, value in curse_words.items() if value > 0.2]

pwd = "../data/03_scored/with_curse_words"
df = pd.read_csv("../data/download_descriptions.csv")
for group in df.group.unique():
    dfs = []
    n_skipped = 0
    for url in df.loc[df.group == group, "url"]:
        # The video id is the text after "v=" up to the next "&".
        video_id = url.split("v=")[1].split("&")[0]

        try:
            dfs.append(pd.read_csv(os.path.join(pwd, video_id)))
        except (OSError, ValueError):
            # Still best-effort, but no longer a bare `except`: OSError covers
            # missing/unreadable files, ValueError covers pandas' empty-file,
            # parser and decoding errors. KeyboardInterrupt and genuine
            # programming errors now propagate instead of being swallowed.
            n_skipped += 1

    if n_skipped:
        print(f"  - {group}: skipped {n_skipped} unreadable score file(s)")

    if not dfs:
        # pd.concat raises ValueError on an empty list; a group without any
        # readable score file is left out of group_to_content instead.
        continue

    df_group = pd.concat(dfs)
    group_to_content[group] = ' '.join(df_group.text.values)

In [None]:
# Compute, for every group, the fraction of whitespace-separated tokens in its
# text that appear in the bad-words list.
with open("../data/bad_words.txt") as f:
    curse_words = f.read().split()
# Set lookup is O(1) per token; scanning the list for every token is not.
curse_word_set = set(curse_words)

n_curse_per_group = {}

for group, content in group_to_content.items():
    words = content.split()
    n_curses = sum(word in curse_word_set for word in words)
    # Always assign an entry: the original raised KeyError for a group with
    # no curse words at all, and ZeroDivisionError for empty content.
    n_curse_per_group[group] = n_curses / len(words) if words else 0.0

In [None]:
# (group, value) pairs of n_curse_per_group in ascending order of value.
sorted(n_curse_per_group.items(), key=lambda group_and_ratio: group_and_ratio[1])