In [2]:
from redditscraper.database import SQLiteConnection

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jabb0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jabb0/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
conn = SQLiteConnection('../metabase-data/2020.db')
df = conn.read_db_into_pandas()

In [5]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [6]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
tokenizer_stop = lambda row: [w for w in tokenizer.tokenize(row['title']) if w not in stop_words]

In [8]:
df['tokenized_sents'] = df.apply(tokenizer_stop, axis=1)

In [9]:
df.head()

Unnamed: 0,id,created_utc,author,author_fullname,domain,title,url,upvote_ratio,score,removed_by_category,downloaded,download_file,downloaded_at,tokenized_sents
0,fsogsk,1585699212,OptiCharts,t2_59v5qxws,youtu.be,Coronavirus outbreak timeline,https://youtu.be/J9Z12Ts7yRA,0.67,1,moderator,0,,,"[Coronavirus, outbreak, timeline]"
1,fsohei,1585699267,tribunaonlinetq,t2_51kpk3ob,tribunaonline.net,Em Taquaritinga (SP): Publicitário faz declara...,https://www.tribunaonline.net/em-taquaritinga-...,1.0,1,reddit,0,,,"[Em, Taquaritinga, SP, Publicitário, faz, decl..."
2,fsohtz,1585699310,reddaredevil,t2_16sqlp,reuters.com,28 Texas spring-breakers test positive for cor...,https://www.reuters.com/article/us-health-coro...,1.0,7,moderator,0,,,"[28, Texas, spring, breakers, test, positive, ..."
3,fsoi6m,1585699346,arsenal_is_best,t2_nq31avb,thehill.com,Trump says he wouldn't have acted differently ...,https://thehill.com/homenews/administration/49...,1.0,11,moderator,0,,,"[Trump, says, acted, differently, coronavirus,..."
4,fsoizv,1585699431,[deleted],,ndtv.com,US Planning To Impose Ban On Travel From Brazi...,https://www.ndtv.com/world-news/coronavirus-pa...,0.8,31,deleted,0,,,"[US, Planning, To, Impose, Ban, On, Travel, Fr..."


In [10]:
df_interested=df[['tokenized_sents', 'removed_by_category']]

In [11]:
df_interested.head()

Unnamed: 0,tokenized_sents,removed_by_category
0,"[Coronavirus, outbreak, timeline]",moderator
1,"[Em, Taquaritinga, SP, Publicitário, faz, decl...",reddit
2,"[28, Texas, spring, breakers, test, positive, ...",moderator
3,"[Trump, says, acted, differently, coronavirus,...",moderator
4,"[US, Planning, To, Impose, Ban, On, Travel, Fr...",deleted


In [12]:
from collections import Counter

In [36]:
removed_counts = Counter()
non_removed_counts = Counter()

# Do word counting
for _, row in df_interested.iterrows():
    if row["removed_by_category"] is not None:
        for word in row["tokenized_sents"]:
            removed_counts[word] += 1
    else:
        for word in row["tokenized_sents"]:
            non_removed_counts[word] += 1

In [37]:
non_removed_counts.most_common(10)

[('Trump', 16315),
 ('China', 16239),
 ('The', 15012),
 ('says', 14041),
 ('US', 13791),
 ('Hong', 10520),
 ('Kong', 10443),
 ('coronavirus', 10015),
 ('U', 9596),
 ('S', 8884)]

In [38]:
removed_counts.most_common(10)

[('Trump', 28464),
 ('The', 23576),
 ('https', 22435),
 ('coronavirus', 21606),
 ('19', 17580),
 ('China', 17367),
 ('de', 16609),
 ('googul', 16575),
 ('US', 16143),
 ('2020', 15430)]

In [39]:
total = removed_counts + non_removed_counts

In [40]:
removed_counts.subtract(non_removed_counts)

In [41]:
removed_counts.most_common(10)

[('https', 21440),
 ('googul', 16575),
 ('de', 14523),
 ('2020', 13776),
 ('co', 13058),
 ('Trump', 12149),
 ('coronavirus', 11591),
 ('19', 11257),
 ('Coronavirus', 10374),
 ('xyz', 10009)]

In [53]:
more_removed_dict = dict(removed_counts)
total_dict = dict(total)

removed_ratios = {}

for k, v in more_removed_dict.items():
    if total_dict[k] > 100:
        removed_ratios[k] = v / total_dict[k]

In [43]:
# more_removed_dict has positive entries if the word is more removed than not and negative if it is less removed than not
# removed ratios scales this by the total occurences of a word
more_removed_lst = sorted(more_removed_dict.items(), key=lambda x: x[1])

In [45]:
# Least removed
more_removed_lst[:40]

[('Hong', -5432),
 ('Kong', -5416),
 ('2019', -3445),
 ('Iran', -2777),
 ('UK', -2620),
 ('Syria', -2230),
 ('deal', -2203),
 ('Brexit', -2199),
 ('EU', -2162),
 ('climate', -2133),
 ('Turkey', -2023),
 ('government', -1929),
 ('protests', -1700),
 ('UN', -1693),
 ('protesters', -1635),
 ('says', -1613),
 ('Japan', -1592),
 ('Saudi', -1553),
 ('Russia', -1508),
 ('Kashmir', -1487),
 ('trade', -1442),
 ('Israel', -1380),
 ('said', -1306),
 ('Korea', -1277),
 ('minister', -1108),
 ('leader', -1084),
 ('oil', -1073),
 ('military', -1044),
 ('forces', -970),
 ('police', -966),
 ('Germany', -938),
 ('talks', -907),
 ('war', -901),
 ('protest', -900),
 ('attack', -892),
 ('Israeli', -879),
 ('nuclear', -849),
 ('Syrian', -843),
 ('Turkish', -829),
 ('Johnson', -816)]

In [46]:
# Most removed
more_removed_lst[:-40:-1]

[('https', 21440),
 ('googul', 16575),
 ('de', 14523),
 ('2020', 13776),
 ('co', 13058),
 ('Trump', 12149),
 ('coronavirus', 11591),
 ('19', 11257),
 ('Coronavirus', 10374),
 ('xyz', 10009),
 ('The', 8564),
 ('tt', 8344),
 ('ift', 8344),
 ('COVID', 7593),
 ('Crack', 6209),
 ('GOOgul', 5981),
 ('Download', 5783),
 ('Free', 5335),
 ('usa', 5015),
 ('How', 5000),
 ('2KUz4QH', 4965),
 ('la', 4880),
 ('Covid', 4848),
 ('2', 4416),
 ('1', 4322),
 ('A', 4162),
 ('I', 4060),
 ('en', 3985),
 ('Market', 3982),
 ('0', 3781),
 ('Key', 3617),
 ('New', 3564),
 ('Biden', 3459),
 ('5', 3434),
 ('To', 3400),
 ('News', 3340),
 ('3', 3305),
 ('র', 3065),
 ('In', 3027)]

In [54]:
removed_ratios_lst = sorted(removed_ratios.items(), key=lambda x: x[1])

In [55]:
# Least removed
# You can see that these are outliers that are irrelevant
removed_ratios_lst[:40]

[('9538', -1.0),
 ('6588', -1.0),
 ('EncontraBrasil', -1.0),
 ('Stade', -1.0),
 ('FINALS', -1.0),
 ('coupons', -0.9953051643192489),
 ('00pm', -0.9949937421777222),
 ('BST', -0.994475138121547),
 ('丨', -0.9878419452887538),
 ('Macaque', -0.9864864864864865),
 ('گردد', -0.9655172413793104),
 ('00am', -0.9651162790697675),
 ('coupon', -0.9627507163323782),
 ('Equestria', -0.9552238805970149),
 ('QuickBooks', -0.9461883408071748),
 ('Kawhi', -0.9421965317919075),
 ('Pity', -0.9236641221374046),
 ('Pinned', -0.9212121212121213),
 ('EST', -0.9198396793587175),
 ('Sep', -0.9177377892030848),
 ('961', -0.9130434782608695),
 ('Observers', -0.9101123595505618),
 ('DAZN', -0.9041916167664671),
 ('EncontraSaoPaulo', -0.8865979381443299),
 ('Venue', -0.8846597462514417),
 ('YPG', -0.8811881188118812),
 ('Warriors', -0.8737373737373737),
 ('Gulberg', -0.8701298701298701),
 ('Offseason', -0.8596491228070176),
 ('Deus', -0.856353591160221),
 ('Kyrie', -0.855072463768116),
 ('Astros', -0.8522167487684

In [56]:
# Most removed
# You can see that these are outliers that are irrelevant
removed_ratios_lst[-40:]

[('sns', 1.0),
 ('정치', 1.0),
 ('음악', 1.0),
 ('음악중심', 1.0),
 ('Coney', 1.0),
 ('την', 1.0),
 ('Kenosha', 1.0),
 ('agoimage', 1.0),
 ('Checkersaga', 1.0),
 ('义云高大师', 1.0),
 ('第三世多杰羌佛', 1.0),
 ('画作以每尺', 1.0),
 ('万美元成交', 1.0),
 ('万元流标', 1.0),
 ('義雲高大師', 1.0),
 ('獲英頒授', 1.0),
 ('XenForo', 1.0),
 ('οι', 1.0),
 ('με', 1.0),
 ('το', 1.0),
 ('ο', 1.0),
 ('στα', 1.0),
 ('στο', 1.0),
 ('Τι', 1.0),
 ('τον', 1.0),
 ('θα', 1.0),
 ('τα', 1.0),
 ('κρούσματα', 1.0),
 ('Κορονοϊός', 1.0),
 ('Τα', 1.0),
 ('Το', 1.0),
 ('των', 1.0),
 ('στον', 1.0),
 ('Ο', 1.0),
 ('η', 1.0),
 ('Θεσσαλονίκη', 1.0),
 ('τη', 1.0),
 ('σήμερα', 1.0),
 ('a7', 1.0),
 ('KAILASA', 1.0)]