In [1]:
from datetime import datetime
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from string import punctuation

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

from nltk.stem import PorterStemmer
porter = PorterStemmer()

from nltk.probability import FreqDist

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhounanlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhounanlin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Default value (read from 'ins_account_usernames.txt') for account_usernames is a list of:
["NBCNews", 
 "FoxNews", 
 "CNN", 
 "NYTimes",
 "WashTimes", 
 "WSJ", 
 "BBCNews", 
 "Reuters",
 "APNews",
 "Time", 
 "Aljazeeraenglish", 
 "Tabletmag"]

In [2]:
with open('ins_account_usernames.txt', encoding='utf-8') as file:
    content = file.read()
    ins_usernames_list = content.split()
    
ins_usernames_list

['NBCNews',
 'FoxNews',
 'CNN',
 'NYTimes',
 'WashTimes',
 'WSJ',
 'BBCNews',
 'Reuters',
 'APNews',
 'Time',
 'Aljazeeraenglish',
 'Tabletmag']

In [3]:
# stacked_dfs_oct = pd.DataFrame()


# print("Failed to scrape.")
# print("Use backup files instead.")


# for username in ins_usernames_list:
#     df_name = "df_" + username
#     path = 'backup_ins_df_231101_new/' + df_name + ".csv"
#     exec("stacked_dfs_oct=pd.concat([stacked_dfs_oct, pd.read_csv('{}')], ignore_index=True)".format(path))


# stacked_dfs_oct = stacked_dfs_oct[stacked_dfs_oct['time_utc'] < str(datetime(2023, 11, 1))]

# stacked_dfs_oct.to_csv('stacked_ins_dfs_oct.csv', index=False)

In [4]:
# stacked_dfs_nov = pd.DataFrame()


# print("Failed to scrape.")
# print("Use backup files instead.")


# for username in ins_usernames_list:
#     df_name = "df_" + username
#     path = 'latest_ins_csv/' + df_name + ".csv"
#     exec("stacked_dfs_nov=pd.concat([stacked_dfs_nov, pd.read_csv('{}')], ignore_index=True)".format(path))



# stacked_dfs_nov.to_csv('stacked_ins_dfs_nov.csv', index=False)

In [5]:
# no scraping
# load csv from local files

stacked_dfs_oct = pd.read_csv('stacked_ins_dfs_oct.csv')
stacked_dfs_nov = pd.read_csv('stacked_ins_dfs_nov.csv')

In [6]:
stacked_dfs_oct.columns, stacked_dfs_oct.shape

(Index(['owner_username', 'url_code', 'time_utc', 'type', 'caption', 'likes',
        'comments'],
       dtype='object'),
 (3092, 7))

In [7]:
stacked_dfs_nov.columns, stacked_dfs_nov.shape

(Index(['owner_username', 'url_code', 'time_utc', 'type', 'caption', 'likes',
        'comments'],
       dtype='object'),
 (1450, 7))

In [8]:
stacked_dfs_oct['owner_username'].unique(), stacked_dfs_nov['owner_username'].unique()

(array(['NBCNews', 'FoxNews', 'CNN', 'NYTimes', 'WashTimes', 'WSJ',
        'BBCNews', 'Reuters', 'APNews', 'Time', 'Aljazeeraenglish',
        'Tabletmag'], dtype=object),
 array(['NBCNews', 'FoxNews', 'CNN', 'NYTimes', 'WashTimes', 'WSJ',
        'BBCNews', 'Reuters', 'APNews', 'Time', 'Aljazeeraenglish',
        'Tabletmag'], dtype=object))

In [9]:
stacked_dfs = pd.concat([stacked_dfs_oct, stacked_dfs_nov], ignore_index=True)
stacked_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4542 entries, 0 to 4541
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   owner_username  4542 non-null   object
 1   url_code        4542 non-null   object
 2   time_utc        4542 non-null   object
 3   type            4542 non-null   object
 4   caption         4541 non-null   object
 5   likes           4542 non-null   int64 
 6   comments        4542 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 248.5+ KB


In [10]:
# from HW2
def find_freq_tokens(tokens, display_limit):
    
    freq_dict = FreqDist(tokens)
    bigram_freq = dict()
    
    sorted_items = sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)
    top_keys = [item[0] for item in sorted_items[:display_limit]]
    
    return top_keys

In [11]:
def text_to_tokens(text):
    
    lower_str = text.lower()
    no_punc_str = ''.join([character for character in lower_str if character not in punctuation])
    no_digit_str = ''.join([character for character in no_punc_str if not character.isdigit()])
    no_special_charac = ''.join([character for character in no_digit_str if character not in ['\u2060', '”', '“']])
    no_dash = no_special_charac.replace('—', ' ').replace('-', ' ').replace('’', ' ')
    orig_tokens = word_tokenize(no_dash)
    
    tokens = [porter.stem(word) for word in orig_tokens if not word in stop_words]
    
    return tokens

In [12]:
stacked_dfs = stacked_dfs.dropna(subset=['caption'])

In [13]:
all_ins_tokens = []

for index, row in stacked_dfs.iterrows():

    tokens = text_to_tokens(row['caption'])
    all_ins_tokens += tokens

top_tokens = find_freq_tokens(all_ins_tokens, 30)
top_tokens

['link',
 'bio',
 'gaza',
 'israel',
 'isra',
 'said',
 'peopl',
 'palestinian',
 'read',
 'hama',
 'israelgazawar',
 'say',
 'tap',
 'year',
 'us',
 'attack',
 'new',
 'kill',
 'war',
 'one',
 'time',
 'bbcnew',
 'citi',
 'palestin',
 'hospit',
 'al',
 'live',
 'hous',
 'presid',
 'world']

## Pick out the posts that talks about gaza conflict
- method 1: find the posts that contain at least one of the tokens in gaza_tokens_1
- method 2: find the posts that contain at least one of the tokens in gaza_tokens_2, which has two more tokens: "war" and "attack"
- method 3: use a language model to pick

**method 1**

In [14]:
gaza_tokens_1 = ['israel',
 'gaza',
 'isra',
 'hama',
 'palestinian',
 'palestin',
 'israelgazawar',
 'israelhama',
 'israelgaza']


is_about_gaza_1 = []

for index, row in stacked_dfs.iterrows():

    tokens = text_to_tokens(row['caption'])

    found = False
    for g_token in gaza_tokens_1:
        if g_token in tokens:
            found = True
            break

    is_about_gaza_1.append(found)

stacked_dfs['is_about_gaza_1'] = is_about_gaza_1

stacked_dfs[stacked_dfs['is_about_gaza_1']==True]

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1
3,NBCNews,CzCOrYtsJJy,2023-10-30 19:33:22,GraphSidecar,A small but growing band of Arab and Jewish Is...,34640,1076,True
4,NBCNews,CzCHkNFxaZS,2023-10-30 18:31:19,GraphVideo,Thousands of people broke into several U.N war...,8218,1127,True
9,NBCNews,Cy8cIMJrLIs,2023-10-28 13:35:29,GraphSidecar,"In the city of Khan Younis, a group of men dig...",13765,2693,True
10,NBCNews,Cy6VwBBLsEh,2023-10-27 18:01:15,GraphImage,A near-total internet blackout has taken hold ...,7482,1441,True
12,NBCNews,Cy58WXuNa-5,2023-10-27 14:19:14,GraphImage,The destruction of areas of northern Gaza is v...,4058,1296,True
...,...,...,...,...,...,...,...,...
4535,Tabletmag,CzTxFuSMm95,2023-11-06 15:01:08,GraphSidecar,"From the archives: American universities, thei...",2402,37,True
4537,Tabletmag,CzL6tdaszja,2023-11-03 13:51:17,GraphSidecar,Hezbollah leader Hassan Nasrallah is currently...,1552,54,True
4538,Tabletmag,CzJ9F7pPanZ,2023-11-02 19:33:34,GraphSidecar,"Since the Houthis, Yemen’s Islamist political ...",8233,268,True
4540,Tabletmag,CzHR2zsRRvb,2023-11-01 18:37:17,GraphImage,"How did everyone get Hamas wrong, including th...",10808,391,True


In [15]:
all_ins_GAZA_tokens = []

for index, row in stacked_dfs[stacked_dfs['is_about_gaza_1']==True].iterrows():

    tokens = text_to_tokens(row['caption'])
    all_ins_GAZA_tokens += tokens

len(all_ins_GAZA_tokens)

77697

In [16]:
all_ins_GAZA_tokens = []

for index, row in stacked_dfs[stacked_dfs['is_about_gaza_1']==True].iterrows():

    tokens = word_tokenize(row['caption'])
    all_ins_GAZA_tokens += tokens

len(all_ins_GAZA_tokens)

153412

**method 2**

In [17]:
gaza_tokens_2 = ['israel',
 'gaza',
 'isra',
 'hama',
 'palestinian',
 'attack',
 'palestin',
 'war',
 'israelgazawar',
 'israelhama',
 'israelgaza']


is_about_gaza_2 = []

for index, row in stacked_dfs.iterrows():

    tokens = text_to_tokens(row['caption'])

    found = False
    for g_token in gaza_tokens_2:
        if g_token in tokens:
            found = True
            break

    is_about_gaza_2.append(found)

stacked_dfs['is_about_gaza_2'] = is_about_gaza_2

stacked_dfs[stacked_dfs['is_about_gaza_2']==True]

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
3,NBCNews,CzCOrYtsJJy,2023-10-30 19:33:22,GraphSidecar,A small but growing band of Arab and Jewish Is...,34640,1076,True,True
4,NBCNews,CzCHkNFxaZS,2023-10-30 18:31:19,GraphVideo,Thousands of people broke into several U.N war...,8218,1127,True,True
9,NBCNews,Cy8cIMJrLIs,2023-10-28 13:35:29,GraphSidecar,"In the city of Khan Younis, a group of men dig...",13765,2693,True,True
10,NBCNews,Cy6VwBBLsEh,2023-10-27 18:01:15,GraphImage,A near-total internet blackout has taken hold ...,7482,1441,True,True
12,NBCNews,Cy58WXuNa-5,2023-10-27 14:19:14,GraphImage,The destruction of areas of northern Gaza is v...,4058,1296,True,True
...,...,...,...,...,...,...,...,...,...
4535,Tabletmag,CzTxFuSMm95,2023-11-06 15:01:08,GraphSidecar,"From the archives: American universities, thei...",2402,37,True,True
4537,Tabletmag,CzL6tdaszja,2023-11-03 13:51:17,GraphSidecar,Hezbollah leader Hassan Nasrallah is currently...,1552,54,True,True
4538,Tabletmag,CzJ9F7pPanZ,2023-11-02 19:33:34,GraphSidecar,"Since the Houthis, Yemen’s Islamist political ...",8233,268,True,True
4540,Tabletmag,CzHR2zsRRvb,2023-11-01 18:37:17,GraphImage,"How did everyone get Hamas wrong, including th...",10808,391,True,True


In [18]:
df_diff = stacked_dfs[stacked_dfs['is_about_gaza_1'] != stacked_dfs['is_about_gaza_2']]
df_diff

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
55,NBCNews,CybrNO-LmsF,2023-10-15 20:12:15,GraphSidecar,Raised on her family’s farm in conservative ru...,3814,325,False,True
139,FoxNews,CzFHtn5uNrz,2023-10-31 22:30:13,GraphImage,OFF THE GRID: The map is catching attention af...,14203,2119,False,True
142,FoxNews,CzE2jF7Lu8A,2023-10-31 20:00:14,GraphImage,TERROR 'INSPIRATION': FBI Director Christopher...,24035,4235,False,True
147,FoxNews,CzEUUsvpE4U,2023-10-31 15:01:45,GraphVideo,'FUNNY HOW THAT WORKS': Jesse Watters reacts t...,8367,673,False,True
558,FoxNews,CyXrxHyN4Vh,2023-10-14 07:00:11,GraphImage,WARNING SHOT: Fox News host Sean Hannity offer...,15620,1491,False,True
...,...,...,...,...,...,...,...,...,...
3885,BBCNews,CzeyiUVMRvO,2023-11-10 21:46:16,GraphVideo,The US used two F-15 aircrafts to launch a ‘se...,38233,3010,False,True
3903,BBCNews,CzZLB3ktijh,2023-11-08 17:23:57,GraphSidecar,“My teeth caught his eyelid.”\n\nAn Australian...,56913,544,False,True
4002,APNews,Czjjgj8JwEU,2023-11-12 18:10:21,GraphSidecar,Millions of Indians celebrated Diwali on Sunda...,4331,161,False,True
4159,Aljazeeraenglish,CzpjCl-tu72,2023-11-15 02:01:49,GraphImage,Russian and Syrian regime attacks have killed ...,25102,1113,False,True


In [19]:
len(df_diff)

88

In [20]:
for index, row in df_diff.iterrows():

    print(row['caption'])
    print("------------")

Raised on her family’s farm in conservative rural Nebraska in the 1960s and ‘70s, Ashley Swartz took her cues about life from her surroundings: At home, the men worked the farm; at play, boys roughhoused and participated in sports; at church, there was good or there was evil.

Swartz identified more with girls – wanting to play the female roles when playing make-believe and wishing to be considered pretty rather than handsome.

Swartz, like many older transgender Americans, had hoped their younger counterparts would not be faced with the same challenges and despair. But now, watching growing political attacks and legislative efforts targeting trans people, they are fearful about what the future holds for the next generation, particularly those living in politically conservative states.

Read more at the link in bio.

📷️ @madelinecass for @NBCNews
------------
OFF THE GRID: The map is catching attention after some Chinese internet users noticed the country in the middle of a war is nowh

**Method 3**

## Export the stacked dataframe to csv

In [21]:
stacked_dfs.to_csv('stacked_ins_df.csv', index=False)