In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import nltk
from nltk import word_tokenize
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the combined HathiTrust annotated-magazine table used for the issue counts below.
all_df = pd.read_csv("../ht_ef_datasets/combined_full_hathitrust_annotated_magazines_with_htids.csv")

In [3]:
# Distinct start_issue values per cleaned magazine title, largest first.
all_df.groupby('cleaned_magazine_title')['start_issue'].nunique().sort_values(ascending=False)

cleaned_magazine_title
arab_observer                   193
tricontinental                  109
arab_observer_and_the_scribe     69
liberator                        58
the_scribe                       31
lotus                            26
solidarity                       18
afro-asian_bulletin_             17
afro-asian_and_world_affairs     16
afro-asian_peoples                8
Name: start_issue, dtype: int64

In [54]:
# Read the three "gv_processed" corpora from one directory into their own frames.
original_files_dir = "../original_files/"
arab_observer_df = pd.read_csv(
    original_files_dir + "arab_observer_gv_processed.csv")
tricontinental_bulletin_df = pd.read_csv(
    original_files_dir + "tricontinental_bulletin_gv_processed.csv")
black_panther_df = pd.read_csv(
    original_files_dir + "black_panther_gv_processed.csv")

In [55]:
# Distinct issue identifiers per corpus; each frame keeps them in a different column
# (issue / title / date).
arab_observer_df.issue.nunique(), tricontinental_bulletin_df.title.nunique(), black_panther_df.date.nunique()

(245, 61, 354)

In [56]:
# Total number of characters across all Arab Observer text cells.
arab_observer_df['text'].str.len().sum()

42274985.0

In [57]:
def _language_counts(pages, magazine):
    """Return one row per detected language with its row count, tagged with ``magazine``.

    ``rename_axis`` + ``reset_index(name=...)`` fixes the output columns to
    ``language`` / ``language_counts`` regardless of how the installed pandas
    names the result of ``value_counts`` (the naming changed in pandas 2.0, which
    made the previous ``rename(columns={"index": ...})`` mislabel the columns).
    """
    counts = (
        pages['detected_language']
        .value_counts()
        .rename_axis('language')
        .reset_index(name='language_counts')
    )
    counts['magazine'] = magazine
    return counts


black_panther_languages = _language_counts(black_panther_df, 'Black Panther')
tricontinental_bulletin_languages = _language_counts(
    tricontinental_bulletin_df, 'Tricontinental Bulletin')
arab_observer_languages = _language_counts(arab_observer_df, 'Arab Observer')

# Long-format table: one row per (magazine, language).
combined_magazines = pd.concat(
    [black_panther_languages, tricontinental_bulletin_languages, arab_observer_languages])

In [58]:
# NOTE(review): the default below is an absolute path on an external volume, so
# this cell only runs on the machine that has it mounted. Point the
# ISO_639_LANGUAGES_CSV environment variable at a local copy to run elsewhere;
# when the variable is unset the original location is used unchanged.
import os

languages_df = pd.read_csv(os.environ.get(
    "ISO_639_LANGUAGES_CSV",
    "/Volumes/Samsung_T5/VersionsAndValues/data/metadata_files/iso_639_choices_directionality_wikimedia.csv",
))

In [59]:
# Attach language metadata to every (magazine, language) row via the language code.
combined_magazines_languages = combined_magazines.merge(
    languages_df, left_on='language', right_on='code', how='left')

# Hand-written metadata for two codes (presumably absent from languages_df — confirm).
manual_language_rows = {
    "hmn": {'English language name': "Hmong", 'Directionality': "ltr", 'code': "hmn"},
    "fil": {'English language name': "Filipino", 'Directionality': "ltr", 'code': "fil"},
}
for language_code, manual_values in manual_language_rows.items():
    is_language = combined_magazines_languages.language == language_code
    for column_name, value in manual_values.items():
        combined_magazines_languages.loc[is_language, column_name] = value

In [60]:
def _language_color():
    """Colour encoding shared by both language bar charts."""
    return alt.Color(
        'English language name:N',
        legend=alt.Legend(title="Language", columns=3, symbolLimit=0),
        scale=alt.Scale(scheme='category20b'),
    )


# Left panel: every detected language, English included.
with_eng = alt.Chart(combined_magazines_languages).mark_bar().encode(
    x='magazine',
    y='language_counts:Q',
    color=_language_color(),
)

# Right panel: the same bars with the 'en' rows filtered out, plus tooltips.
non_english_rows = combined_magazines_languages[combined_magazines_languages.language != 'en']
no_eng = alt.Chart(non_english_rows).mark_bar().encode(
    x='magazine',
    y=alt.Y(
        'language_counts:Q',
        sort=alt.EncodingSortField(field="language_counts", op="sum", order="descending"),
    ),
    color=_language_color(),
    tooltip=['language', 'language_counts', 'English language name'],
)

with_eng | no_eng

In [61]:
# Normalise the text column of every corpus the same way: missing values become
# empty strings, everything is coerced to str, and newlines collapse to spaces.
# Case is left untouched (the lower-casing step is currently disabled).
for pages in (arab_observer_df, tricontinental_bulletin_df, black_panther_df):
    pages['text'] = pages['text'].fillna('').astype(str).str.replace('\n', ' ')

# Align column names so the three corpora share file_path / issue / page_number.
tricontinental_bulletin_df = tricontinental_bulletin_df.rename(
    columns={'file_name': 'file_path', 'title': 'issue', 'page': 'page_number'})
black_panther_df = black_panther_df.rename(columns={'page': 'page_number'})

# Tag every row with the periodical it belongs to.
arab_observer_df['periodical_name'] = 'Arab Observer'
tricontinental_bulletin_df['periodical_name'] = 'Tricontinental Bulletin'
black_panther_df['periodical_name'] = 'Black Panther'


In [62]:
# (volume, issue number, day, month, year) for the Black Panther PDFs whose dates
# are supplied by hand. The listed dates step by seven days.
# NOTE(review): vol 6 no 13 -> 1 May follows no 12 -> 17 April by two weeks, while
# no 15 -> 8 May is only one week later; confirm that no 13 is not 24 April.
_black_panther_issue_dates = [
    (3, 14, 26, 'July', 1969),
    (6, 11, 10, 'April', 1971),
    (6, 12, 17, 'April', 1971),
    (6, 13, 1, 'May', 1971),
    (6, 15, 8, 'May', 1971),
    (6, 16, 15, 'May', 1971),
    (6, 17, 22, 'May', 1971),
    (6, 18, 29, 'May', 1971),
    (6, 20, 12, 'June', 1971),
    (6, 21, 19, 'June', 1971),
    (6, 22, 26, 'June', 1971),
    (6, 23, 3, 'July', 1971),
    (6, 24, 10, 'July', 1971),
    (6, 25, 17, 'July', 1971),
    (6, 26, 24, 'July', 1971),
    (6, 27, 31, 'July', 1971),
    (6, 28, 7, 'August', 1971),
    (6, 29, 14, 'August', 1971),
    (6, 30, 21, 'August', 1971),
]

# Keyed by PDF path; each value holds the day / month / year parts of the issue date.
missing_dates = {
    'BlackPantherPDFs/vol_{}_no_{}_{}.pdf'.format(volume, number, year): {
        'day': day, 'month': month, 'year': year}
    for volume, number, day, month, year in _black_panther_issue_dates
}

In [63]:
# One row per PDF: the dictionary key becomes the file_path column, the date parts the rest.
missing_df = (
    pd.DataFrame.from_dict(missing_dates, orient='index')
    .rename_axis('file_path')
    .reset_index()
)
missing_df

Unnamed: 0,file_path,day,month,year
0,BlackPantherPDFs/vol_3_no_14_1969.pdf,26,July,1969
1,BlackPantherPDFs/vol_6_no_11_1971.pdf,10,April,1971
2,BlackPantherPDFs/vol_6_no_12_1971.pdf,17,April,1971
3,BlackPantherPDFs/vol_6_no_13_1971.pdf,1,May,1971
4,BlackPantherPDFs/vol_6_no_15_1971.pdf,8,May,1971
5,BlackPantherPDFs/vol_6_no_16_1971.pdf,15,May,1971
6,BlackPantherPDFs/vol_6_no_17_1971.pdf,22,May,1971
7,BlackPantherPDFs/vol_6_no_18_1971.pdf,29,May,1971
8,BlackPantherPDFs/vol_6_no_20_1971.pdf,12,June,1971
9,BlackPantherPDFs/vol_6_no_21_1971.pdf,19,June,1971


In [64]:
# Write the hand-collected date parts onto every Black Panther row of the matching PDF.
# The year is written as well: missing_dates carries it, and the next cell casts
# the year column to int, which fails if any of these rows were left without one.
for _, row in missing_df.iterrows():
    is_issue = black_panther_df.file_path == row['file_path']
    black_panther_df.loc[is_issue, 'day'] = row['day']
    black_panther_df.loc[is_issue, 'month'] = row['month']
    black_panther_df.loc[is_issue, 'year'] = row['year']

In [65]:
# Day and year must be integers so they render without a trailing ".0" below.
for date_part in ('day', 'year'):
    black_panther_df[date_part] = black_panther_df[date_part].astype(int)

# Assemble "<year>-<month name>-<day>" and parse it with the matching format.
year_text = black_panther_df.year.astype(str)
day_text = black_panther_df.day.astype(str)
black_panther_df['cleaned_date'] = year_text + '-' + black_panther_df.month + '-' + day_text
black_panther_df['cleaned_issue_date'] = pd.to_datetime(
    black_panther_df['cleaned_date'], format='%Y-%B-%d')

In [66]:
# Seed the working date column from the raw year column; the following cells
# replace its "Date: m/yyyy" / "Year: yyyy" labels with ISO dates.
tricontinental_bulletin_df['date'] = tricontinental_bulletin_df.year

In [67]:
# (month, year) pairs for every "Date: m/yyyy" label that gets an ISO date.
# NOTE(review): (5, 1996) is also listed in exclude_dates, so those rows are
# filtered out before this mapping is applied; it may be a catalogue typo for
# 5/1966 — confirm.
_month_year_labels = [
    (11, 1968), (1, 1969), (2, 1969), (8, 1969), (7, 1971), (5, 1971), (8, 1968),
    (12, 1971), (7, 1968), (7, 1972), (1, 1971), (2, 1972), (11, 1971), (6, 1968),
    (9, 1968), (10, 1969), (12, 1969), (1, 1970), (2, 1970), (9, 1971), (9, 1972),
    (4, 1972), (3, 1969), (3, 1972), (5, 1966), (3, 1970), (5, 1996), (1, 1972),
]

# Maps each raw label to the first day of that month, e.g. 'Date: 1/1969' -> '1969-01-01'.
dates = {
    'Date: {}/{}'.format(month, year): '{}-{:02d}-01'.format(year, month)
    for month, year in _month_year_labels
}

In [68]:
# Raw year labels whose rows are removed from the Tricontinental Bulletin frame.
exclude_dates = [
    'Year: 1999',
    'Date: 5/1996',
    'Year: 2004',
    'Year: 2000',
]

In [69]:
# Drop rows whose raw year label is in exclude_dates. The explicit copy makes the
# result an independent frame, so the column assignments in later cells write to
# it directly instead of to a slice of the unfiltered data.
tricontinental_bulletin_df = tricontinental_bulletin_df[
    ~tricontinental_bulletin_df.year.isin(exclude_dates)
].copy()


In [70]:
# Translate "Date: m/yyyy" labels to ISO dates. The result is assigned back to the
# column: an inplace replace on `df.date` acts on a temporary Series and is not
# guaranteed to reach the frame (it does not under pandas Copy-on-Write).
tricontinental_bulletin_df['date'] = tricontinental_bulletin_df['date'].replace(dates)


In [71]:
# First run of digits in the issue title, kept as a string (NaN when there is none).
first_digits = r'(\d+)'
tricontinental_bulletin_df['issue_number'] = (
    tricontinental_bulletin_df['issue'].str.extract(first_digits, expand=False)
)


In [72]:
# Manual date fixes: (current date label, issue number, resolved ISO date).
# A label of None selects rows whose date is missing.
# NOTE(review): ('Year: 1969', '15') resolves to a 1968 date — confirm it is not
# meant to be 1969-07-01.
_date_fixes_round_one = [
    ('Year: 1969', '11', '1969-05-01'),
    ('Year: 1971', '21', '1971-02-01'),
    ('Year: 1971', '25', '1971-08-01'),
    ('Year: 1973', '33', '1973-01-01'),
    (None, '1', '1966-04-01'),
    (None, '3', '1966-06-01'),
    (None, '4', '1966-07-01'),
    ('Year: 1968', '7', '1968-01-01'),
    ('Year: 1969', '15', '1968-07-01'),
]

# Applied in order; masks are recomputed each time because earlier fixes change `date`.
for date_label, issue_no, resolved_date in _date_fixes_round_one:
    if date_label is None:
        date_matches = tricontinental_bulletin_df.date.isna()
    else:
        date_matches = tricontinental_bulletin_df.date == date_label
    issue_matches = tricontinental_bulletin_df.issue_number == issue_no
    tricontinental_bulletin_df.loc[date_matches & issue_matches, 'date'] = resolved_date


In [73]:
# Second batch of manual date fixes: (current date label, issue number, resolved ISO date).
# An issue number of None selects rows where no issue number was extracted.
# NOTE(review): the two 'Year: 1970' entries resolve to 1969 dates, and issues 25
# (fixed in the previous cell) and 26 both resolve to 1971-08-01 — confirm.
_date_fixes_round_two = [
    ('Year: 1967', '11', '1967-02-01'),
    ('Year: 1967', None, '1967-04-01'),
    ('Year: 1969', '10', '1969-04-01'),
    ('Year: 1970', '18', '1969-12-01'),
    ('Year: 1970', '17', '1969-11-01'),
    ('Year: 1971', '26', '1971-08-01'),
    ('Year: 1971', '23', '1971-04-01'),
    ('Year: 1972', '29', '1972-05-01'),
    ('Year: 1972', '31', '1972-06-01'),
    ('Year: 1972', '74', '1972-12-01'),
    ('Year: 1973', '82', '1973-02-01'),
    ('Year: 1973', '84', '1973-04-01'),
    ('Year: 1975', None, '1975-01-01'),
    ('Year: 1976', '49', '1976-01-01'),
    ('Year: 1976', '101', '1976-12-01'),
    ('Year: 1977', '104', '1977-01-01'),
    ('Year: 1977', '107', '1977-12-01'),
    ('Year: 1980', '67', '1980-01-01'),
    ('Year: 1980', '69', '1980-12-01'),
]

# Applied in order; masks are recomputed each time because earlier fixes change `date`.
for date_label, issue_no, resolved_date in _date_fixes_round_two:
    date_matches = tricontinental_bulletin_df.date == date_label
    if issue_no is None:
        issue_matches = tricontinental_bulletin_df.issue_number.isna()
    else:
        issue_matches = tricontinental_bulletin_df.issue_number == issue_no
    tricontinental_bulletin_df.loc[date_matches & issue_matches, 'date'] = resolved_date


In [74]:
# subset_df = tricontinental_bulletin_df[tricontinental_bulletin_df.date.str.contains('Year', na=False)][['date', 'issue_number']].drop_duplicates()

# subset_df['extracted_year'] = subset_df.date.str.extract(r'(\d+)')
# subset_df['extracted_year'] = subset_df['extracted_year'].astype(int)
# subset_df.sort_values(by=['extracted_year', 'issue_number'])

In [75]:
# Parse the issue column as a date; values that cannot be parsed become NaT.
parsed_issue_dates = pd.to_datetime(arab_observer_df.issue, errors='coerce')
arab_observer_df['cleaned_issue_date'] = parsed_issue_dates


In [76]:
# Parse the resolved date strings; labels that are still unresolved become NaT.
tricontinental_bulletin_df['cleaned_issue_date'] = pd.to_datetime(
    tricontinental_bulletin_df.date,
    errors='coerce',
)


In [77]:
# Search terms; later joined with "|" into one alternation pattern for str.contains.
revolutionary_terms = [
    "third world",
    "revolution",
    "liberation",
    "imperialism",
]

In [89]:
# Tokenise the text of each corpus with NLTK and record the token count per row.
for pages in (arab_observer_df, tricontinental_bulletin_df, black_panther_df):
    pages['tokenized_text'] = pages['text'].apply(nltk.word_tokenize)
    pages['tokenized_length'] = pages['tokenized_text'].str.len()



In [90]:
# Work on copies so the count columns added below do not touch the source frames.
counts_ao, counts_tb, counts_bp = (
    pages.copy()
    for pages in (arab_observer_df, tricontinental_bulletin_df, black_panther_df)
)


In [91]:
import re

In [92]:
# Occurrences of the substring 'struggle' per row (case-sensitive; the text was
# not lower-cased).
for counts in (counts_ao, counts_tb, counts_bp):
    counts['struggle_counts'] = counts['text'].str.count('struggle')


In [93]:
# Sum the 'struggle' counts per Arab Observer issue date, oldest issue first.
issue_keys = ['periodical_name', 'cleaned_issue_date']
grouped_ao = (
    counts_ao[issue_keys + ['struggle_counts', 'page_number']]
    .groupby(issue_keys)['struggle_counts']
    .sum()
    .reset_index()
    .sort_values(by='cleaned_issue_date', ascending=True)
)

In [94]:
# Same per-issue sum of 'struggle' counts for the other two corpora.
issue_keys = ['periodical_name', 'cleaned_issue_date']
grouped_tb = (
    counts_tb[issue_keys + ['struggle_counts', 'page_number']]
    .groupby(issue_keys)['struggle_counts']
    .sum()
    .reset_index()
    .sort_values(by=['cleaned_issue_date'], ascending=True)
)
grouped_bp = (
    counts_bp[issue_keys + ['struggle_counts', 'page_number']]
    .groupby(issue_keys)['struggle_counts']
    .sum()
    .reset_index()
    .sort_values(by='cleaned_issue_date', ascending=True)
)

In [96]:
# Total token count per issue date for each corpus (denominator of the ratio below).
issue_keys = ['periodical_name', 'cleaned_issue_date']
total_ao, total_tb, total_bp = (
    counts[issue_keys + ['tokenized_length', 'page_number']]
    .groupby(issue_keys)['tokenized_length']
    .sum()
    .reset_index()
    .sort_values(by='cleaned_issue_date', ascending=True)
    for counts in (counts_ao, counts_tb, counts_bp)
)

In [98]:
# Put token totals and 'struggle' counts side by side, one row per issue.
issue_keys = ['periodical_name', 'cleaned_issue_date']
merged_ao = total_ao.merge(grouped_ao, on=issue_keys, how='inner')
merged_tb = total_tb.merge(grouped_tb, on=issue_keys, how='inner')
merged_bp = total_bp.merge(grouped_bp, on=issue_keys, how='inner')

In [99]:
# Share of 'struggle' occurrences relative to the number of tokens in each issue.
for merged in (merged_ao, merged_tb, merged_bp):
    merged['struggle_ratio'] = merged['struggle_counts'] / merged['tokenized_length']

In [101]:
# Stack the per-issue ratios of all three periodicals for plotting.
grouped_df = pd.concat([merged_ao, merged_tb, merged_bp])

# One thin bar per issue date, coloured by periodical.
ratio_bars = alt.Chart(grouped_df).mark_bar(size=1)
ratio_bars.encode(
    x='cleaned_issue_date:T',
    y='struggle_ratio:Q',
    color='periodical_name:N',
)

In [102]:
# Rows whose text matches at least one of the revolutionary terms (case-sensitive).
any_revolutionary_term = '|'.join(revolutionary_terms)
tw_ao_df = arab_observer_df[arab_observer_df.text.str.contains(any_revolutionary_term)]
tw_tb_df = tricontinental_bulletin_df[
    tricontinental_bulletin_df.text.str.contains(any_revolutionary_term)]
tw_bp_df = black_panther_df[black_panther_df.text.str.contains(any_revolutionary_term)]


In [103]:
# Matching-row counts: Tricontinental, Arab Observer from 1963 on, Arab Observer
# up to 1963, Black Panther.
# NOTE(review): both Arab Observer comparisons include 1963, so rows from that
# year are counted in both figures — confirm whether a strict split was intended.
len(tw_tb_df), len(tw_ao_df[tw_ao_df.cleaned_issue_date.dt.year >= 1963]), len(tw_ao_df[tw_ao_df.cleaned_issue_date.dt.year <= 1963]) , len(tw_bp_df)


(1758, 1575, 1439, 3417)

In [104]:
# Re-point the tw_* subsets at rows whose text contains 'bloc' (replaces the
# revolutionary-term subsets defined above).
bloc_pattern = 'bloc'
tw_ao_df = arab_observer_df[arab_observer_df.text.str.contains(bloc_pattern)]
tw_tb_df = tricontinental_bulletin_df[
    tricontinental_bulletin_df.text.str.contains(bloc_pattern)]
tw_bp_df = black_panther_df[black_panther_df.text.str.contains(bloc_pattern)]


In [105]:
# Arab Observer rows mentioning 'bloc' or 'alignment' versus a 15% random control
# sample of the remaining rows; both keep only rows longer than 100 tokens.
mentions_bloc = arab_observer_df.text.str.contains('bloc|alignment')
is_long_page = arab_observer_df.tokenized_length > 100

# .copy() makes ao_bloc an independent frame so the label column added later is
# written to it directly rather than to a slice of arab_observer_df.
ao_bloc = arab_observer_df[mentions_bloc & is_long_page].copy()

# A fixed random_state makes the control sample (and every model trained on it)
# reproducible across kernel restarts; the sample size is unchanged.
ao_non_bloc = arab_observer_df[~mentions_bloc & is_long_page].sample(
    frac=0.15, random_state=42)
len(ao_bloc), len(ao_non_bloc)

(1245, 1410)

In [None]:
# Split Tricontinental rows longer than 100 tokens by whether the text contains 'revolution'.
mentions_revolution = tricontinental_bulletin_df.text.str.contains('revolution')
is_long_page = tricontinental_bulletin_df.tokenized_length > 100
tri_rev = tricontinental_bulletin_df[mentions_revolution & is_long_page]
tri_nonrev = tricontinental_bulletin_df[(mentions_revolution == False) & is_long_page]

In [None]:
# Keep only rows with more than 100 tokens in the 'bloc' subsets.
tw_tb_df = tw_tb_df[tw_tb_df.tokenized_length > 100]
tw_ao_df = tw_ao_df[tw_ao_df.tokenized_length > 100]

In [None]:
# Row counts of the filtered Arab Observer and Tricontinental subsets.
len(tw_ao_df), len(tw_tb_df)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# tw_ao_df = tw_ao_df[tw_ao_df.cleaned_issue_date.dt.year <= 1963]


In [None]:
# Human-readable class label for each training subset; both control sets share 'Control'.
labelled_subsets = (
    (ao_bloc, 'Contains Non-Aligned Blocs'),
    (ao_non_bloc, 'Control'),
    (tri_rev, 'Contains Revolution'),
    (tri_nonrev, 'Control'),
)
for subset, class_label in labelled_subsets:
    subset['classify'] = class_label

In [None]:
def create_corpora(first_df, second_df, class_field):
    """Build the labelled training frame and the corpus to be classified.

    Parameters
    ----------
    first_df, second_df : pandas.DataFrame
        Labelled subsets; each must contain ``issue``, ``page_number``, ``text``,
        ``periodical_name``, ``cleaned_issue_date`` and ``classify``.
    class_field : str
        Value of ``classify`` that is encoded as ``class_number`` 1; every other
        row gets 0.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(full_tw_df, full_corpus_df)``: the two subsets stacked with an integer
        ``class_number`` column, and the rows of the module-level
        ``arab_observer_df`` and ``tricontinental_bulletin_df`` that have more
        than 100 tokens. The second frame does not depend on the arguments.
    """
    labelled_columns = ['issue', 'page_number', 'text', 'periodical_name',
                        'cleaned_issue_date', 'classify']
    corpus_columns = ['issue', 'page_number', 'text', 'periodical_name']

    full_tw_df = pd.concat(
        [first_df[labelled_columns], second_df[labelled_columns]], axis=0)
    # 1 where the label equals class_field, 0 everywhere else.
    full_tw_df['class_number'] = (full_tw_df.classify == class_field).astype(int)

    long_pages = [
        corpus.loc[corpus.tokenized_length > 100, corpus_columns]
        for corpus in (arab_observer_df, tricontinental_bulletin_df)
    ]
    full_corpus_df = pd.concat(long_pages)
    return full_tw_df, full_corpus_df

In [None]:
# Build the training frames. Passing 'Control' as class_field makes the control
# rows class_number 1 and the bloc / revolution rows class_number 0.
ao_subset_df, ao_corpus_df = create_corpora(ao_bloc, ao_non_bloc, 'Control')
tri_subset_df, tri_corpus_df = create_corpora(tri_rev, tri_nonrev, 'Control')

In [None]:
def train_model(df, file_name, full_corpus_df, max_features=1000):
    """Fit, evaluate and persist a TF-IDF + logistic-regression text classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled rows. Must provide ``text`` (documents), ``class_number``
        (integer target) and ``classify`` (human-readable class label).
    file_name : str
        Path prefix for every artefact written here: ``_confusionmatrix.png``,
        ``_saved_logit_model.pkl`` and ``_saved_tfidf_model.pkl`` are appended to
        it. It is also forwarded to ``get_most_informative_features`` and
        ``classify_corpus``.
    full_corpus_df : pandas.DataFrame
        Corpus handed to ``classify_corpus`` together with the fitted models.
    max_features : int, default 1000
        Vocabulary cap passed to ``TfidfVectorizer``.

    Returns
    -------
    None
        Metrics are printed; the confusion-matrix figure and both models are
        written to disk.

    Notes
    -----
    Relies on names that must already be in scope when this is called:
    ``shuffle``, ``TfidfVectorizer``, ``stopwords``, ``train_test_split``,
    ``SMOTE``, ``LogisticRegression``, ``metrics``, ``KFold``,
    ``cross_val_score``, ``sns``, ``joblib``, ``get_most_informative_features``
    and ``classify_corpus``.
    """
    df = shuffle(df)
    y = df['class_number']

    # One row per class, ordered by class id. The same ordering labels the report
    # and the heatmap; taking labels from the shuffled rows instead (as
    # df['classify'].unique() did) could attach them to the wrong class.
    category_id_df = df[['classify', 'class_number']
                        ].drop_duplicates().sort_values('class_number')
    class_labels = category_id_df.classify.values

    # NOTE(review): the vectorizer is fitted on every document before the
    # train/validation/test split, so held-out text influences the vocabulary
    # and IDF weights.
    tfidf_model = TfidfVectorizer(max_df=.9, min_df=1, use_idf=True, norm=None, stop_words=stopwords.words(
        'english'), ngram_range=(1, 2), max_features=max_features)
    features_nd = tfidf_model.fit_transform(df.text.tolist()).toarray()

    # 70/30 train/test split, then a further 70/30 train/validation split.
    training_features, test_features, training_target, test_target = train_test_split(
        features_nd, y, test_size=0.3)
    x_train, x_val, y_train, y_val = train_test_split(
        training_features, training_target, test_size=0.3, random_state=12)

    # Oversample the training portion only; validation and test keep their real balance.
    sm = SMOTE(sampling_strategy='auto')
    x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

    log_model = LogisticRegression()
    log_model = log_model.fit(X=x_train_res, y=y_train_res)
    y_pred = log_model.predict(x_val)
    test_pred = log_model.predict(test_features)  # predicted once, reused below

    print('Validation Results')
    print(log_model.score(x_val, y_val))
    print(metrics.recall_score(y_val, y_pred, average=None))
    print("Precision:", metrics.precision_score(y_val, y_pred, average=None))
    print('\nTest Results')
    print(log_model.score(test_features, test_target))
    print(metrics.recall_score(test_target, test_pred, average=None))
    print("Precision:", metrics.precision_score(
        test_target, test_pred, average=None))

    kfold = KFold(n_splits=10, random_state=7, shuffle=True)
    results = cross_val_score(log_model, x_train_res,
                              y_train_res, scoring='accuracy', cv=kfold)
    print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

    print(metrics.classification_report(
        y_val, y_pred, target_names=class_labels))

    conf_mat = metrics.confusion_matrix(y_val, y_pred)
    print(conf_mat)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    fig.savefig(file_name + '_confusionmatrix.png')

    # Export feature weights and corpus predictions, then persist both fitted models.
    get_most_informative_features(tfidf_model, log_model, file_name)
    classify_corpus(full_corpus_df, tfidf_model, log_model, file_name)
    joblib.dump(log_model, file_name+'_saved_logit_model.pkl')
    joblib.dump(tfidf_model, file_name+'_saved_tfidf_model.pkl')


In [None]:
# train_model(full_tw_df, '../compute_magazines/models_data/model_large_struggle_no_params', full_corpus_df, max_features=10000)

In [None]:
# Train the Arab Observer bloc-vs-control model with a 10,000-term vocabulary cap.
train_model(ao_subset_df, '../compute_magazines/models_data/model_large_ao_bloc', ao_corpus_df, max_features=10000)

In [None]:
# Train the Tricontinental revolution-vs-control model with a 10,000-term vocabulary cap.
train_model(tri_subset_df, '../compute_magazines/models_data/model_large_tri_revolution', tri_corpus_df, max_features=10000)

In [None]:
# Load the per-model feature tables (presumably written by
# get_most_informative_features during training — verify).
tri_features = pd.read_csv('../compute_magazines/models_data/model_large_tri_revolution_features.csv')
ao_features = pd.read_csv('../compute_magazines/models_data/model_large_ao_bloc_features.csv')

In [None]:
# Negative coef_0 rows are treated as the bloc side, positive coef_1 rows as the
# control side. Both are ranked by absolute weight, strongest first.
ao_bloc_features = (
    ao_features.loc[ao_features.coef_0 < 0, ['coef_0', 'feature_0']]
    .rename(columns={'coef_0': 'coef', 'feature_0': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=False)
)
ao_nonbloc_features = (
    ao_features.loc[ao_features.coef_1 > 0, ['coef_1', 'feature_1']]
    .rename(columns={'coef_1': 'coef', 'feature_1': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=False)
)

In [None]:
# Ten largest-magnitude features on the bloc side of the Arab Observer model.
ao_bloc_features.head(10)

In [None]:
# Ten largest-magnitude features on the control side of the Arab Observer model.
ao_nonbloc_features.head(10)

In [None]:
# Negative coef_0 rows are treated as the revolution side, positive coef_1 rows as
# the control side. Both are ranked by absolute weight, strongest first.
tri_rev_features = (
    tri_features.loc[tri_features.coef_0 < 0, ['coef_0', 'feature_0']]
    .rename(columns={'coef_0': 'coef', 'feature_0': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=False)
)
tri_nonrev_features = (
    tri_features.loc[tri_features.coef_1 > 0, ['coef_1', 'feature_1']]
    .rename(columns={'coef_1': 'coef', 'feature_1': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=False)
)

In [None]:
# Ten largest-magnitude features on the revolution side of the Tricontinental model.
tri_rev_features.head(10)

In [None]:
# Load the annotated Freedomways dataset (1961-1985 according to the file name).
freedomways_df = pd.read_csv("../annotated_datasets/freedomways_1961_1985_annotated.csv")

In [None]:
# Parse start_issue as a date; values that cannot be parsed become NaT.
freedomways_df['cleaned_issue_date'] = pd.to_datetime(freedomways_df['start_issue'], errors='coerce')

In [None]:
# Expose the 'lowercase' column as 'text' so it matches the other corpora's column name.
freedomways_df = freedomways_df.rename(columns={'lowercase': 'text'})

In [None]:
# Normalise missing / non-string text the same way as the other corpora before
# tokenising: word_tokenize raises on NaN, so a single empty row would otherwise
# abort the whole cell.
freedomways_df['text'] = freedomways_df['text'].fillna('').astype(str)
freedomways_df['tokenized_text'] = freedomways_df['text'].apply(word_tokenize)
freedomways_df['tokenized_length'] = freedomways_df['tokenized_text'].str.len()

In [None]:
# load model

# Load the "struggle_no_params" classifier and its TF-IDF vectorizer from disk.
# NOTE(review): the train_model call that would produce these files is commented
# out earlier in the notebook, so they must already exist from a previous run.
loaded_logit_model = joblib.load('../compute_magazines/models_data/model_large_struggle_no_params_saved_logit_model.pkl')
loaded_tfidf_model = joblib.load('../compute_magazines/models_data/model_large_struggle_no_params_saved_tfidf_model.pkl')

In [None]:
# load model

# Arab Observer bloc model and its vectorizer, as saved by train_model.
ao_loaded_logit_model = joblib.load('../compute_magazines/models_data/model_large_ao_bloc_saved_logit_model.pkl')
ao_loaded_tfidf_model = joblib.load('../compute_magazines/models_data/model_large_ao_bloc_saved_tfidf_model.pkl')
# Tricontinental revolution model and its vectorizer, as saved by train_model.

tri_loaded_logit_model = joblib.load('../compute_magazines/models_data/model_large_tri_revolution_saved_logit_model.pkl')
tri_loaded_tfidf_model = joblib.load('../compute_magazines/models_data/model_large_tri_revolution_saved_tfidf_model.pkl')

In [None]:
# Cross-application: classify Arab Observer rows (>100 tokens) with the Tricontinental model.
classify_corpus(arab_observer_df[arab_observer_df.tokenized_length > 100], tri_loaded_tfidf_model, tri_loaded_logit_model, '../compute_magazines/models_data/ao_tri_revolution')

In [None]:
# Cross-application: classify Tricontinental rows (>100 tokens) with the Arab Observer model.
classify_corpus(tricontinental_bulletin_df[tricontinental_bulletin_df.tokenized_length > 100], ao_loaded_tfidf_model, ao_loaded_logit_model, '../compute_magazines/models_data/tri_ao_bloc')

In [None]:
# Classify Freedomways rows (>100 tokens) with the "struggle_no_params" model.
classify_corpus(freedomways_df[freedomways_df.tokenized_length > 100], loaded_tfidf_model, loaded_logit_model, '../compute_magazines/models_data/freedomways_struggle_no_params')

In [None]:
# Feature table for the "struggle" model.
# NOTE(review): the models loaded above use the prefix
# 'model_large_struggle_no_params', while this reads 'model_large_struggle_features.csv'
# (no '_no_params') — confirm this is the intended file and not a stale export.
features_df = pd.read_csv(
    "../compute_magazines/models_data/model_large_struggle_features.csv")


In [None]:
# Read back the Freedomways predictions written by the classify_corpus call above,
# which uses the prefix 'freedomways_struggle_no_params'. The previous path omitted
# '_no_params' and therefore pointed at a different (stale) file. This assumes
# classify_corpus appends '_classified_corpus.csv' to its prefix, as the other
# read-back cells in this notebook do.
freedomways_classified = pd.read_csv("../compute_magazines/models_data/freedomways_struggle_no_params_classified_corpus.csv")

In [None]:
# Split the struggle-model features by side (negative coef_0 / positive coef_1) and
# rank by absolute weight. Unlike the earlier rankings, these sort ascending, so
# the weakest features come first.
tw_ao_features = (
    features_df.loc[features_df.coef_0 < 0, ['coef_0', 'feature_0']]
    .rename(columns={'coef_0': 'coef', 'feature_0': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=True)
)
tw_tb_features = (
    features_df.loc[features_df.coef_1 > 0, ['coef_1', 'feature_1']]
    .rename(columns={'coef_1': 'coef', 'feature_1': 'feature'})
    .assign(coef=lambda ranked: ranked['coef'].abs())
    .sort_values('coef', ascending=True)
)

In [None]:
# Tag each feature table with the class it represents before they are combined.
tw_ao_features['classify'] = 'Arab Observer Third World'
tw_tb_features['classify'] = 'Tricontinental Bulletin Third World'

In [None]:
# First twenty rows; with the ascending sort these are the smallest absolute weights.
tw_ao_features.head(20)


In [None]:
# First twenty rows; with the ascending sort these are the smallest absolute weights.
tw_tb_features.head(20)


In [None]:
# Inspect the columns available on the Arab Observer frame.
arab_observer_df.columns

In [None]:
# Arab Observer rows whose text contains 'progressive' (case-sensitive).
# NOTE(review): this displays the whole filtered frame; consider .head() to keep
# the saved notebook small.
arab_observer_df[arab_observer_df.text.str.contains('progressive')]

In [None]:
# Stack both labelled feature tables into one frame.
combined_features = pd.concat([tw_ao_features, tw_tb_features])

In [None]:
# Predictions of the "struggle_no_params" model over the combined corpus.
classified_corpus = pd.read_csv(
    "../compute_magazines/models_data/model_large_struggle_no_params_classified_corpus.csv")


In [None]:
# Cross-model predictions written by the classify_corpus calls above:
# Arab Observer rows scored by the Tricontinental model, and vice versa.
ao_classified_corpus = pd.read_csv("../compute_magazines/models_data/ao_tri_revolution_classified_corpus.csv")
tri_classified_corpus = pd.read_csv("../compute_magazines/models_data/tri_ao_bloc_classified_corpus.csv")

In [None]:
# Arab Observer predictions, with page_number as an integer so it can serve as a
# merge key against arab_observer_df.
is_arab_observer = classified_corpus.periodical_name == "Arab Observer"
subset_ao = classified_corpus[is_arab_observer].assign(
    page_number=lambda rows: rows['page_number'].astype(int))

In [None]:
# Tricontinental Bulletin predictions; page_number keeps the dtype it was read with
# (the integer cast below is disabled).
subset_tb = classified_corpus[classified_corpus.periodical_name == "Tricontinental Bulletin"]
# subset_tb['page_number'] = subset_tb['page_number'].astype(int)

In [None]:
# Attach predictions to every Tricontinental row longer than 100 tokens, keyed on
# (page_number, issue); rows without a prediction are kept because the join is 'left'.
long_tb_pages = tricontinental_bulletin_df.loc[
    tricontinental_bulletin_df.tokenized_length > 100,
    ['page_number', 'issue', 'cleaned_issue_date'],
]
dates_tb = long_tb_pages.merge(subset_tb, on=['page_number', 'issue'], how='left')

In [None]:
# Attach predictions to every Arab Observer row longer than 100 tokens, keyed on
# (page_number, issue); rows without a prediction are kept because the join is 'left'.
long_ao_pages = arab_observer_df.loc[
    arab_observer_df.tokenized_length > 100,
    ['page_number', 'issue', 'cleaned_issue_date'],
]
dates_ao = long_ao_pages.merge(subset_ao, on=['page_number', 'issue'], how='left')

In [None]:
# classified_corpus[(classified_corpus['prediction'] == 1) & (classified_corpus['periodical_name'] == 'Arab Observer')].sort_values(by=['prediction_proba_1'], ascending=True).head(20)

In [None]:
# classified_corpus[(classified_corpus['prediction'] == 0) & (classified_corpus['periodical_name']
#                                                             == 'Tricontinental Bulletin')].sort_values(by=['prediction_proba_0'], ascending=False).head(20)


In [None]:
import altair as alt

In [None]:
# Lift Altair's row limit so the full classified frames can be charted.
alt.data_transformers.disable_max_rows()

In [None]:
def _prediction_chart(data, title):
    """Bar chart of row counts per issue date, coloured by predicted class."""
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('cleaned_issue_date:T', title='Date'),
        y=alt.Y('count()', title='Number of Articles'),
        color=alt.Color('prediction:N', title='Prediction'),
    ).properties(
        title=title
    )


chart = _prediction_chart(dates_ao, 'Predictions for Arab Observer')
chart1 = _prediction_chart(dates_tb, 'Predictions for Tricontinental Bulletin')
chart2 = _prediction_chart(freedomways_classified, 'Predictions for Freedomways')

# Stack the three periodicals vertically for comparison.
alt.vconcat(chart, chart1, chart2)

In [None]:
# Same chart layout for the cross-model predictions: each corpus scored by the
# model trained on the other periodical.
cross_model_panels = (
    (ao_classified_corpus, 'Predictions for Arab Observer'),
    (tri_classified_corpus, 'Predictions for Tricontinental Bulletin'),
)
chart, chart1 = (
    alt.Chart(corpus).mark_bar().encode(
        x=alt.X('cleaned_issue_date:T', title='Date'),
        y=alt.Y('count()', title='Number of Articles'),
        color=alt.Color('prediction:N', title='Prediction'),
    ).properties(title=panel_title)
    for corpus, panel_title in cross_model_panels
)

alt.hconcat(chart, chart1)


In [None]:
# Ten Arab Observer rows with the highest prediction_proba_0 under the Tricontinental model.
ao_classified_corpus.sort_values(by=['prediction_proba_0'], ascending=False).head(10)

In [None]:
# Thirty Tricontinental rows with the highest prediction_proba_0 under the Arab Observer model.
tri_classified_corpus.sort_values(by=['prediction_proba_0'], ascending=False)[['issue', 'text', 'page_number', 'date', 'prediction_proba_0']].head(30)

In [None]:
# Same ranking restricted to rows dated after 1978-01-01 that were predicted as class 0.
tri_classified_corpus[(tri_classified_corpus.cleaned_issue_date > '1978-01-01') & (tri_classified_corpus.prediction == 0)].sort_values(by=['prediction_proba_0'], ascending=False)[['issue', 'text', 'page_number', 'date', 'prediction_proba_0']].head(30)