In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from pathlib import Path
from wordcloud import WordCloud, STOPWORDS

In [2]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [3]:
raw = Path('/home3/usfb/analysis/analysis-fake-news/input/reaction/1000-page/20-min/by-reaction-type/LIKE/by-page-id')
page_info_path = Path('/home3/usfb/build/output/page/1000-page-info.csv')
politifact_fake_domain_path = Path('/home3/usfb/analysis/analysis-fake-news/temp/post-match-domain/1000_page_politifact_domain.csv')

In [4]:
pageinfo = pd.read_csv(page_info_path)
politifact_with_fake_domain = pd.read_csv(politifact_fake_domain_path)
page_category = pageinfo[['page_id','category','type']]
politifact_with_fake_domain = politifact_with_fake_domain.join(page_category.set_index('page_id'), on='page_id')

In [5]:
pageinfo_separate_by_cat = pageinfo.groupby('category').mean()
pageinfo_separate_by_cat = pageinfo_separate_by_cat[['total_like','total_comment','total_share','fan_count']]
pageinfo_separate_by_cat['page_num'] = pageinfo.groupby('category').size()

In [6]:
pageinfo_separate_by_type = pageinfo.groupby('type').mean()
pageinfo_separate_by_type = pageinfo_separate_by_type[['total_like','total_comment','total_share','fan_count']]
pageinfo_separate_by_type['page_num'] = pageinfo.groupby('type').size()

In [7]:
politifact_separate_by_cat = politifact_with_fake_domain.groupby('category').mean()
politifact_separate_by_cat = politifact_separate_by_cat[['post_reactions','post_likes','post_comments','post_shares','page_talking_about_count']]
politifact_separate_by_cat['count'] = politifact_with_fake_domain.groupby('category').size()
politifact_separate_by_cat['page_num'] = politifact_with_fake_domain.groupby(['category','page_name']).size().reset_index().groupby('category').size()
politifact_separate_by_cat = politifact_separate_by_cat[politifact_separate_by_cat['count']>90]

In [8]:
politifact_separate_by_type = politifact_with_fake_domain.groupby('type').mean()
politifact_separate_by_type = politifact_separate_by_type[['post_reactions','post_likes','post_comments','post_shares','page_talking_about_count']]
politifact_separate_by_type['count'] = politifact_with_fake_domain.groupby('type').size()
politifact_separate_by_type['page_num'] = politifact_with_fake_domain.groupby(['type','page_name']).size().reset_index().groupby('type').size()

## Frequently used word for post names in different types of fanpage

### All types of fanpage

In [9]:
total_post_name = ""
for i in politifact_with_fake_domain['post_name'][:]:
    total_post_name = total_post_name + str(i) + '\n'
sentences = sent_tokenize(total_post_name)

In [10]:
tokens = word_tokenize(total_post_name)
tokens = [w.lower() for w in tokens]
table = str.maketrans('','',string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

In [11]:
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'n') for word in words]
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'v') for word in words]
print(lemmatized[:100])

['donald', 'trump', 'think', 'kid', 'need', 'good', 'old', 'fashion', 'spank', 'agree', 'vote', 'freedom', 'daily', 'grow', 'nut', 'whoopi', 'hold', 'back', 'torch', 'cowardly', 'gops', 'refuse', 'stand', 'idiot', 'betsy', 'devos', 'video', 'home', 'invasion', 'thug', 'stab', 'man', 'wife', 'blow', 'thug', 'away', 'freedom', 'daily', 'donald', 'trump', 'think', 'kid', 'need', 'good', 'old', 'fashion', 'spank', 'agree', 'vote', 'freedom', 'daily', 'jimmy', 'carter', 'fly', 'dc', 'inauguration', 'something', 'amaze', 'protestors', 'plane', 'media', 'rag', 'city', 'biggest', 'police', 'union', 'award', 'sheriff', 'clarke', 'man', 'year', 'freedom', 'daily', 'liz', 'warren', 'strike', 'trump', 'bill', 'could', 'lead', 'impeachment', 'hell', 'yes', 'congressman', 'call', 'trump', 'racist', 'refuse', 'apologize', 'double', 'hell', 'yes', 'army', 'ranger', 'destroy', 'qb', 'colin', 'kaepernick', 'sit', 'anthem']


In [12]:
x = Counter(lemmatized)
x.most_common()

[('obama', 13990),
 ('video', 13051),
 ('trump', 12528),
 ('politics', 10934),
 ('hillary', 9868),
 ('get', 7863),
 ('watch', 7834),
 ('muslim', 7736),
 ('man', 6356),
 ('see', 5970),
 ('freedom', 5899),
 ('make', 5838),
 ('daily', 5817),
 ('cop', 5623),
 ('say', 5616),
 ('woman', 5540),
 ('black', 5454),
 ('us', 5377),
 ('find', 4885),
 ('go', 4414),
 ('doug', 4229),
 ('break', 4137),
 ('isis', 4113),
 ('new', 4064),
 ('white', 4060),
 ('muslims', 3812),
 ('take', 3480),
 ('shock', 3461),
 ('question', 3365),
 ('attack', 3337),
 ('call', 3336),
 ('girl', 3315),
 ('one', 3270),
 ('america', 3261),
 ('look', 3127),
 ('leave', 3098),
 ('giles', 3023),
 ('herald', 2954),
 ('mom', 2878),
 ('kill', 2869),
 ('police', 2838),
 ('show', 2762),
 ('gun', 2633),
 ('people', 2625),
 ('sick', 2613),
 ('clinton', 2584),
 ('want', 2519),
 ('shoot', 2445),
 ('dear', 2413),
 ('give', 2380),
 ('come', 2339),
 ('news', 2207),
 ('something', 2192),
 ('secret', 2172),
 ('catch', 2158),
 ('school', 2104),
 

### Fanpage type == "FIGURE"

In [13]:
figure_post_name = ""
for i in politifact_with_fake_domain[politifact_with_fake_domain['type']=='figure']['post_name']:
    figure_post_name = figure_post_name + str(i) + '\n'
sentences = sent_tokenize(figure_post_name)

In [14]:
tokens = word_tokenize(figure_post_name)
tokens = [w.lower() for w in tokens]
table = str.maketrans('','',string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

In [15]:
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'n') for word in words]
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'v') for word in words]
print(lemmatized[:100])

['mcdonalds', 'pizza', 'hut', 'kfc', 'refuse', 'muslim', 'demand', 'serve', 'halal', 'meat', 'view', 'chop', 'block', 'cancel', 'fan', 'sick', 'tire', 'video', 'plan', 'parenthood', 'get', 'million', 'tax', 'money', 'spend', 'million', 'hillary', 'campaign', 'court', 'rule', 'sharia', 'civil', 'law', 'muslims', 'america', 'hey', 'soros', 'kiss', 'deplorable', 'hillary', 'police', 'intentionally', 'kill', 'black', 'men', 'france', 'find', 'mosque', 'raid', 'deeply', 'disturb', 'proof', 'trump', 'right', 'us', 'herald', 'clock', 'boy', 'family', 'leave', 'america', 'say', 'good', 'riddance', 'doug', 'giles', 'clashdaily', 'clock', 'boy', 'family', 'leave', 'america', 'say', 'good', 'riddance', 'doug', 'giles', 'clashdaily', 'massive', 'rally', 'islam', 'france', 'never', 'know', 'watch', 'cnn', 'ice', 'hit', 'sanctuary', 'city', 'philadelphia', 'arrest', 'criminal', 'illegal', 'immigrants', 'rosie', 'donnell', 'leave', 'america', 'promise']


In [16]:
x = Counter(lemmatized)
x.most_common()

[('obama', 1814),
 ('video', 1450),
 ('trump', 1156),
 ('hillary', 1151),
 ('isis', 1012),
 ('get', 899),
 ('muslim', 885),
 ('us', 824),
 ('watch', 820),
 ('man', 788),
 ('make', 697),
 ('say', 662),
 ('woman', 657),
 ('black', 609),
 ('herald', 597),
 ('attack', 569),
 ('police', 522),
 ('clinton', 520),
 ('new', 518),
 ('go', 503),
 ('muslims', 495),
 ('find', 494),
 ('take', 485),
 ('cop', 485),
 ('kill', 465),
 ('doug', 450),
 ('white', 448),
 ('break', 441),
 ('call', 440),
 ('giles', 438),
 ('see', 435),
 ('clashdaily', 421),
 ('gun', 415),
 ('shoot', 405),
 ('america', 387),
 ('state', 358),
 ('question', 357),
 ('show', 349),
 ('leave', 342),
 ('school', 339),
 ('one', 324),
 ('shock', 316),
 ('want', 312),
 ('people', 305),
 ('girl', 301),
 ('plan', 291),
 ('try', 281),
 ('give', 275),
 ('judge', 256),
 ('house', 252),
 ('report', 252),
 ('american', 246),
 ('mom', 244),
 ('refugees', 241),
 ('islam', 240),
 ('arrest', 240),
 ('bill', 236),
 ('flag', 233),
 ('like', 230),
 ('

### Fanpage type == "GROUP"

In [17]:
group_post_name = ""
for i in politifact_with_fake_domain[politifact_with_fake_domain['type']=='group']['post_name']:
    group_post_name = group_post_name + str(i) + '\n'
sentences = sent_tokenize(group_post_name)
tokens = word_tokenize(group_post_name)
tokens = [w.lower() for w in tokens]
table = str.maketrans('','',string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'n') for word in words]
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'v') for word in words]
x = Counter(lemmatized)
x.most_common()

[('trump', 3852),
 ('obama', 3144),
 ('hillary', 2515),
 ('muslim', 2452),
 ('daily', 2383),
 ('freedom', 2375),
 ('get', 2235),
 ('watch', 2139),
 ('see', 1837),
 ('cop', 1741),
 ('video', 1698),
 ('man', 1681),
 ('make', 1563),
 ('woman', 1556),
 ('say', 1541),
 ('black', 1522),
 ('break', 1333),
 ('find', 1304),
 ('white', 1242),
 ('doug', 1141),
 ('go', 1132),
 ('us', 1111),
 ('muslims', 1093),
 ('look', 1025),
 ('girl', 1018),
 ('shock', 991),
 ('question', 954),
 ('sick', 899),
 ('take', 879),
 ('one', 874),
 ('mom', 873),
 ('call', 845),
 ('leave', 824),
 ('new', 805),
 ('liberty', 796),
 ('alliance', 776),
 ('attack', 772),
 ('america', 767),
 ('show', 729),
 ('isis', 713),
 ('herald', 692),
 ('giles', 667),
 ('people', 665),
 ('kill', 654),
 ('gun', 638),
 ('give', 638),
 ('want', 636),
 ('dear', 635),
 ('police', 629),
 ('something', 628),
 ('news', 612),
 ('come', 608),
 ('thugs', 572),
 ('word', 571),
 ('catch', 571),
 ('islam', 567),
 ('surprise', 558),
 ('try', 553),
 ('l

### Fanpage type == "MEDIA"

In [18]:
media_post_name = ""
for i in politifact_with_fake_domain[politifact_with_fake_domain['type']=='media']['post_name']:
    media_post_name = media_post_name + str(i) + '\n'
sentences = sent_tokenize(media_post_name)

In [19]:
media_post_name = ""
for i in politifact_with_fake_domain[politifact_with_fake_domain['type']=='media']['post_name']:
    media_post_name = media_post_name + str(i) + '\n'
sentences = sent_tokenize(media_post_name)
tokens = word_tokenize(media_post_name)
tokens = [w.lower() for w in tokens]
table = str.maketrans('','',string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'n') for word in words]
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'v') for word in words]
x = Counter(lemmatized)
x.most_common()

[('politics', 10815),
 ('video', 9861),
 ('obama', 9002),
 ('trump', 7497),
 ('hillary', 6176),
 ('watch', 4869),
 ('get', 4718),
 ('muslim', 4397),
 ('man', 3822),
 ('see', 3690),
 ('make', 3568),
 ('freedom', 3476),
 ('us', 3422),
 ('daily', 3386),
 ('cop', 3385),
 ('say', 3366),
 ('woman', 3277),
 ('black', 3277),
 ('find', 3045),
 ('go', 2772),
 ('new', 2706),
 ('doug', 2633),
 ('isis', 2370),
 ('white', 2360),
 ('break', 2343),
 ('muslims', 2222),
 ('shock', 2144),
 ('take', 2104),
 ('america', 2099),
 ('one', 2072),
 ('question', 2054),
 ('call', 2040),
 ('girl', 1991),
 ('attack', 1980),
 ('leave', 1920),
 ('giles', 1913),
 ('look', 1883),
 ('mom', 1752),
 ('kill', 1735),
 ('show', 1678),
 ('police', 1666),
 ('herald', 1659),
 ('people', 1639),
 ('dear', 1573),
 ('gun', 1569),
 ('want', 1550),
 ('sick', 1544),
 ('clinton', 1532),
 ('come', 1513),
 ('secret', 1505),
 ('shoot', 1489),
 ('alternative', 1473),
 ('give', 1455),
 ('catch', 1394),
 ('news', 1387),
 ('something', 1342),

## Frequently used words for post message

In [20]:
total_post_message = ""
for i in politifact_with_fake_domain['post_message'][:]:
    total_post_message = total_post_message + str(i) + '\n'
sentences = sent_tokenize(total_post_message)
tokens = word_tokenize(total_post_message)
tokens = [w.lower() for w in tokens]
table = str.maketrans('','',string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'n') for word in words]
lemmatized = [wordnet_lemmatizer.lemmatize(word, pos = 'v') for word in words]
x = Counter(lemmatized)
x.most_common()

[('http', 62996),
 ('get', 15685),
 ('one', 14516),
 ('obama', 14452),
 ('go', 13801),
 ('trump', 11283),
 ('make', 10641),
 ('take', 10626),
 ('video', 10519),
 ('say', 9525),
 ('think', 9070),
 ('hillary', 8711),
 ('man', 8320),
 ('see', 8187),
 ('would', 8103),
 ('like', 8032),
 ('come', 7978),
 ('people', 7879),
 ('leave', 7680),
 ('time', 7288),
 ('share', 7280),
 ('however', 6809),
 ('know', 6670),
 ('find', 6660),
 ('woman', 6620),
 ('police', 6164),
 ('muslim', 6062),
 ('new', 5954),
 ('something', 5701),
 ('look', 5587),
 ('enjoy', 5279),
 ('happen', 4992),
 ('call', 4986),
 ('could', 4828),
 ('black', 4791),
 ('want', 4664),
 ('right', 4663),
 ('watch', 4628),
 ('decide', 4573),
 ('show', 4474),
 ('state', 4384),
 ('live', 4252),
 ('way', 4194),
 ('media', 4079),
 ('many', 4064),
 ('clinton', 4032),
 ('america', 4030),
 ('nan', 4023),
 ('us', 3995),
 ('president', 3972),
 ('back', 3943),
 ('never', 3943),
 ('donald', 3931),
 ('shock', 3921),
 ('love', 3907),
 ('white', 3820),