In [None]:
# Uncomment to pip install packages if necessary

# pip install pandas
# pip install krippendorff
# pip install numpy
# pip install statsmodels
# pip install matplotlib
# pip install plotly
# pip install nltk
# pip install pyarrow

In [None]:
# Import necessary packages

import pandas as pd  # for data manipulation and analysis
import pickle  # for de-serializing Python object structures
from datetime import date, datetime # for working with dates
import numpy as np  # for numerical computing
import statsmodels.formula.api as sm  # for statistical modeling
from statsmodels.tsa.stattools import adfuller # Augmented Dickey-Fuller test function
import random # for random sampling
import itertools # for efficient looping and list processing

import matplotlib.pyplot as plt  # for plotting graphs
import matplotlib.gridspec as gridspec  # for creating complex layouts of subplots
import matplotlib.ticker as ticker  # for configuring tick locators and formatters
import matplotlib.dates as mdates  # for working with dates in Matplotlib
import matplotlib.cbook as cbook  # for utilities for working with Matplotlib
from matplotlib.ticker import PercentFormatter, FuncFormatter  # formatters for ticks
from matplotlib.patches import Rectangle  # for drawing rectangles in plots

import plotly.graph_objects as go  # for map visualization
from nltk.corpus import stopwords  # for retrieving stopwords in Russian
import krippendorff  # Import the krippendorff library
from krippendorff import alpha # to calculate krippendorff's alpha

import os # to work with the libraries
import pyarrow.feather as feather # to work with feather files 

In [None]:
# Paths to the data

# Both input files are looked up in the directory the notebook is run from
current_directory = os.getcwd()

# Full paths to the pickled corpus and to the csv file with the validation results
corpus_path, validation_path = (
    os.path.join(current_directory, file_name)
    for file_name in ("corpus.pkl", "validation.csv")
)

In [None]:
# Defining Russian stopwords

# The nltk corpus reader is imported under an alias so that binding the name
# `stopwords` to the word list does not shadow the reader itself. Without the
# alias, running this cell a second time fails with
# "AttributeError: 'list' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# List of Russian stopwords
stopwords = nltk_stopwords.words('russian')

# Defining punctuation marks
# Raw string literal: the backslash is one of the characters in the set, not
# the start of an escape sequence (a plain literal would contain the invalid
# escape "\," and emit a warning on recent Python versions)
punc = r'''!()-[]{};":'"\,<>./?@#$%^&*_~,–'''

# Start dates of the presidential terms, in chronological order
election_dates = [
    date(2000, 5, 7),  # beginning of the 1st presidential term
    date(2004, 5, 8),  # beginning of the 2nd presidential term
    date(2008, 5, 8),  # Medvedev's term - beginning
    date(2012, 5, 7),  # beginning of the 3rd presidential term
    date(2018, 5, 7),  # beginning of the 4th presidential term
]

# Individual names for the same five dates
el1, el2, el3, el4, el5 = election_dates

In [None]:
# Reading the dataset 
# The corpus is loaded from the pickle file located at corpus_path.
# NOTE: unpickling can execute arbitrary code, so the file should only be
# loaded when it comes from a trusted source.

df = pd.read_pickle(corpus_path)

In [None]:
# First entries in the dataframe (shown as the output of the cell)

df.head()

In [None]:
# Columns: names of all columns available in the dataframe

print(df.columns)

In [None]:
# Overview of the columns

print("Column names:\n", df.columns,'\n')

# Column 'dates'
# The column provides the date of a news episode
# The code below shows that the period of coverage is from 1999-12-31 to 2022-02-23
# (the earliest and latest dates are read with min()/max(), which avoids
# sorting the whole dataframe just to look at its first and last rows)
print("First date:", df['dates'].min())
print("Last date:", df['dates'].max(),'\n')

# Column 'text'
# The column provides the text of a news episode, including its title
# Number of episodes in the dataset
print(f"{df.shape[0]} episodes",'\n')
# The episode at position 12345 is printed as an arbitrary example
print("Example of an episode:\n", df['text'].iloc[12345],'\n')

# Column 'country'
# The column provides a country-topic assigned by Newsmap
print("Country-topics can have the following values:\n", sorted(set(df['country'])),'\n')

# Column 'sentiment_labels'
# The column provides a sentiment label assigned by Rubert-tiny
print("Three value of sentiment labels:\n", set(df['sentiment_labels']),'\n')

# Column 'sentiment_score'
# The column provides a sentiment score assigned by Rubert-tiny
# The mean score is reported separately for each sentiment label
print("Group means of sentiment scores:\n", df.groupby('sentiment_labels')['sentiment_score'].mean())

In [None]:
# Check if column 'dates' in the main dataframe is in increasing order 
# (the flag is stored in dates_increasing and printed)

dates_increasing = df['dates'].is_monotonic_increasing
print("Are the dates in increasing order?", dates_increasing)

In [None]:
# Creating additional columns necessary for the analysis

# (Execution time: around 15 seconds)

# Calendar year of each news episode, taken from the 'dates' column
df['year'] = [episode_date.year for episode_date in df['dates']]

# Indicator column: 1 if the text contains the string 'Путин', 0 otherwise
# (case-sensitive substring match)
df['putin'] = [1 if 'Путин' in text else 0 for text in df['text']]
df['putin'] = df['putin'].astype(int)

# Length of each news episode, in whitespace-separated words
df['textlength'] = [len(text.split()) for text in df['text']]

# Presidential terms
# The column holds the integers 1-4 and the string 'Medvedev'. It is converted
# to object dtype before any string is written: assigning a string into an
# int64 column relies on silent upcasting, which recent pandas versions
# deprecate.
df['term'] = 1
df['term'] = df['term'].astype(object)

# Episodes dated strictly after the start of a term receive that term's label.
# The boundaries are applied in chronological order, so each later term
# overwrites the label written by the earlier ones.
for term_start, term_label in [(el2, 2), (el3, 'Medvedev'), (el4, 3), (el5, 4)]:
    df.loc[df['dates'] > term_start, 'term'] = term_label

In [None]:
# Read the csv file with the validation results
v = pd.read_csv(validation_path)

# The file is split by the value of its 'sample' column into small dataframes
# that are used later in the script:
#   sa  - sample to calculate the accuracy of both Rubert-tiny and Newsmap
#   pr  - sample used to estimate the precision of Rubert Tiny
#   pn  - sample used to estimate the precision of Newsmap
#   ad  - sample used to estimate the accuracy of the dictionaries
#   pdi - sample used to estimate the precision of the dictionaries
sa, pr, pn, ad, pdi = (
    v[v['sample'] == sample_name]
    for sample_name in (
        'accuracy_ml',
        'precision_rubert',
        'precision_newsmap',
        'accuracy_dict',
        'precision_dict',
    )
)

In [None]:
# Section: Abstract

# Text in the abstract: 
# " Using 385,981 news transcripts " 

# Number of transcripts (rows) in the corpus, to compare with the number quoted above
print(f"Using {df.shape[0]} news transcripts")

In [None]:
# Section: Introduction 

# Text in the article: 
# " On average, 16% of news about events in Russia and 7% of stories about foreign affairs and events abroad
#   refer to the ruler, which amounts to 9% and 3% of total reports from Channel One, respectively. " 

# Stories that involve 'Putin' (Путин)
df1 = df[df['putin'] == 1]

# All stories, split by country label: Russia vs. everything else
df_ru, df_f = df[df['country'] == 'ru'], df[df['country'] != 'ru']

# Stories that involve Putin, split the same way
putin_ru, putin_foreign = df1[df1['country'] == 'ru'], df1[df1['country'] != 'ru']

# 16% - share of the stories about Russia that involve Putin
percentage_putin_ru = round((len(putin_ru) / len(df_ru)) * 100)
print(f"{percentage_putin_ru}% of news about events in Russia")

# 7% - share of the stories not labelled as Russia that involve Putin
percentage_putin_foreign = round((len(putin_foreign) / len(df_f)) * 100)
print(f"{percentage_putin_foreign}% of stories about foreign affairs")

# 9% - the same Russia/Putin stories as a share of the whole corpus
percentage_total_ru = round((len(putin_ru) / len(df)) * 100)
print(f"which amounts to {percentage_total_ru}%")

# 3% - the same foreign/Putin stories as a share of the whole corpus
percentage_total_foreign = round((len(putin_foreign) / len(df)) * 100)
print(f"and {percentage_total_foreign}% of total reports from Channel One")

In [None]:
# Section: Introduction 

# Text in the article:
# " At the same time, I document a steady and significant increase—from 3% in 2012 to 18% in 2021
#   —in the share of news about foreign affairs and events originating abroad that mentioned the ruler. " 

# Yearly share of foreign news (country label other than 'ru') that mentions Putin
# Years 2012 to 2021, inclusive
years = list(range(2012, 2022))

for year in years:
    # All stories of the current year
    df1 = df.loc[df['year'] == year]

    # Foreign stories of that year, and those among them that mention Putin
    f = df1.loc[df1['country'] != 'ru']
    fp = f.loc[f['putin'] == 1]

    # Rounded percentage of the year's foreign stories that mention Putin
    percentage_foreign_with_putin = round((len(fp) / len(f)) * 100)

    print(f"and {percentage_foreign_with_putin}% was the share of news reports about foreign affairs and events abroad that mentioned the ruler in {year}")


In [None]:
# Section "Data and Estimation Strategy"

# Text in the article: 
# " My analysis relies on a corpus comprised of 385,981 transcripts of news reports
#   transmitted on Channel One between December 31,1999, the day when Putin became an acting
#   President of Russia, and February 23, 2022, the day before Russia’s full-scale invasion of Ukraine "

# Number of transcripts in the corpus
print(df.shape[0])
# Earliest and latest date in the corpus, read with min()/max() rather than
# by sorting the whole dataframe twice
print("First date:", df['dates'].min())
print("Last date:", df['dates'].max(),'\n')

In [None]:
# Section "Data and Estimation Strategy"

# Text in the article: 
# " Stories that mentioned Vladimir Putin were identified using keyword search "

# This refers to column 'putin' that has already been defined above.
# Code from above is copy-pasted here

# Create a column that equals one if the text mentions Putin
# df['putin'] = [1 if 'Путин' in text else 0 for text in df['text']]
# df['putin'] = df['putin'].astype(int)

In [None]:
# Section "Data and Estimation Strategy"

# Text in the article: 
# " The stories are all in Russian and, on average, are 235 (SD  = 231) words long "

# Word counts per story
word_counts = df['textlength']

# Mean and standard deviation of the word counts, rounded to whole words
rounded_mean = round(word_counts.mean())
rounded_std = round(word_counts.std())

print(f"The stories are all in Russian and, on average, are {rounded_mean} (SD  = {rounded_std}) words long")

In [None]:
# Section "Data and Estimation Strategy"

# Text in the article:
# " To verify transcripts’ accuracy, I randomly selected 39 episodes (0.01% of the data) 
#   from the website and cross-referenced the video content with the text "

# Draw 39 episodes at random; the fixed random_state makes the draw repeatable
random_selection = df.sample(n = 39, random_state = 42)  

# procedure: open the website of channel one for the exact date
# https://www.1tv.ru/news/issue/year-month-day, watch the episode, compare to the ['text']

In [None]:
# Section "Validation of the Classification Results"

# Text in the article: 
# " Two annotators, fluent in Russian ....
# ...al unlabeled samples from the corpus (n=600)"

# Combined size of three validation samples:
#   - the sample to calculate the accuracy of both Rubert-tiny and Newsmap (sa)
#   - the sample used to estimate the precision of Rubert Tiny (pr)
#   - the sample used to estimate the precision of Newsmap (pn)
print(sum(part.shape[0] for part in (sa, pr, pn)))

In [None]:
# Section "Validation of the Classification Results"

# Text in the article: 
# " Each sample was comprised from 100 stories that were randomly selected from the corpus,
#   aiming to estimate the accuracy of Rubert-tiny and Newsmap ."

# Sample to calculate accuracy of both Rubert-tiny and Newsmap

# Set a random seed to ensure the reproducibility of samples
random.seed(42)
# Check if the dataset is sorted so that column 'dates' in the main dataframe is in increasing order 
dates_increasing = df['dates'].is_monotonic_increasing
print("The dates in increasing order?", dates_increasing)

# Draw 100 distinct row numbers from 0 to (number of rows - 1)
random_numbers = random.sample(range(0, df.shape[0]), 100)

# Select the drawn rows and keep only the columns needed for the annotation.
# NOTE(review): .loc looks the numbers up as index labels, so this presumably
# relies on df having the default 0..n-1 index - confirm.
# The explicit copy makes the sample an independent dataframe, so adding the
# 'labels' column below does not raise SettingWithCopyWarning.
sample = df.loc[random_numbers, ['dates', 'text', 'sentiment_labels', 'country']].copy()

# Numeric version of the sentiment label: 1 for 'positive', -1 for 'negative',
# 0 for everything else
sample['labels'] = 0
sample.loc[sample['sentiment_labels'] == 'positive', 'labels'] = 1
sample.loc[sample['sentiment_labels'] == 'negative', 'labels'] = -1

# The code below can be used to save the sample to .csv
# sample.to_csv("insert a path here")
# the output file is called sample_accuracy.csv

# Count of stories 
print(sample.shape[0])

# Check if the text is the same as in validation file
print("Same as validation file?", list(sa['text']) == list(sample['text']))

In [None]:
# Section "Validation of the Classification Results"

# Text in the article: 
#  " and 300 stories that were randomly
#    selected from the subsets of the corpus for each sentiment-class, aiming to
#    estimate the precision for “positive,” “negative,” and “neutral” classes" 

# Sample used to estimate the precision of Rubert Tiny

# Set a random seed to ensure the reproducibility of samples
# (the seed is reset here, so the draws below do not depend on earlier cells)
random.seed(42)
# Check if the dataset is sorted so that column 'dates' in the main dataframe is in increasing order 
dates_increasing = df['dates'].is_monotonic_increasing
print("Are the dates in increasing order?", dates_increasing)


def sample_sentiment(df, sentiment, label, sample_size = 100):
    """
    Draws a random sample of stories that carry a given sentiment label.

    The rows are drawn with the global `random` module, so the result depends
    on the current state of that generator (seed it beforehand to make the
    draw repeatable).

    Parameters:
    df (DataFrame): The input DataFrame; must contain the columns 'dates',
        'text', 'sentiment_labels' and 'country'. It is not modified.
    sentiment (str): The sentiment label to filter by ('negative', 'neutral', 'positive').
    label (int): The value written to the 'labels' column of the sampled entries.
    sample_size (int): The number of entries to draw (without replacement).

    Returns:
    DataFrame: The sampled entries with the columns 'dates', 'text',
        'sentiment_labels', 'country' and 'labels'. The index holds the
        positions of the drawn rows within the filtered subset, in draw order.

    Raises:
    ValueError: If sample_size exceeds the number of entries with the given sentiment.
    """
    # Renumber the matching rows 0..k-1 so that the drawn numbers can be used as labels
    filtered_df = df[df['sentiment_labels'] == sentiment].reset_index(drop=True)
    random_numbers = random.sample(range(filtered_df.shape[0]), sample_size)
    # Rows and columns are selected in a single step and copied, so the
    # assignment of 'labels' below works on an independent frame and does not
    # raise SettingWithCopyWarning
    sample_df = filtered_df.loc[random_numbers, ['dates', 'text', 'sentiment_labels', 'country']].copy()
    sample_df['labels'] = label
    return sample_df

# Sample from each sentiment category (default size: 100 stories per category)
# The three draws share one random generator, so their order matters
sample_negative = sample_sentiment(df, 'negative', -1)
sample_neutral = sample_sentiment(df, 'neutral', 0)
sample_positive = sample_sentiment(df, 'positive', 1)

# Stack the three samples vertically and renumber the rows from 0
sample = pd.concat([sample_negative, sample_neutral, sample_positive], ignore_index = True)

# The code below can be used to save the sample to .csv
# sample.to_csv("insert a path here")
# the output file is called precision_rubert.csv

# Count of stories 
print(sample.shape[0])

# Check if the text is the same as in validation file
print("Same as validation file?", list(pr['text']) == list(sample['text']))

In [None]:
# Section "Validation of the Classification Results"

# Text in the article:
# " Based on the labeling, the accuracy score of Rubert-tiny amounted to 81%" 

# (Krippendorff's alpha is also reported in A5 in the Appendix)

# Accuracy of Rubert Tiny and Krippendorff's alpha

# Accuracy: share of rows in the accuracy sample whose 'value' equals 'coder1'
print("Accuracy of Rubert-tiny", sa['value'].eq(sa['coder1']).mean())

# Labels given by the two coders, as integers
data = {coder: sa[coder].astype(int) for coder in ('coder1', 'coder2')}

rd = pd.DataFrame(data)

# Reliability data for Krippendorff's alpha: one list of labels per coder
reliability_data = [rd[coder].tolist() for coder in rd.columns]

# Krippendorff's alpha for nominal labels, rounded to two decimals
alpha = round(krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal'), 2)
print(f"Krippendorff's alpha: {alpha}")

In [None]:
# Section "Validation of the Classification Results"

# Text in the article:
# " while the precision amounted to 81% for “negative,”
#  82% for “neutral,” and 95% for “positive” classes" 

# Precision of Rubert Tiny and Krippendorff's alpha

# Precision per sentiment category: within the rows of the precision sample
# that carry the category's label, the share whose 'value' equals 'coder1'
for sentiment in ('negative', 'neutral', 'positive'):
    subset = pr.loc[pr['sentiment_labels'] == sentiment]
    precision = subset['value'].eq(subset['coder1']).mean()
    print("Precision, {}: {}".format(sentiment, precision))

# Labels given by the two coders, as integers
data = {coder: pr[coder].astype(int) for coder in ('coder1', 'coder2')}

rd = pd.DataFrame(data)

# Reliability data for Krippendorff's alpha: one list of labels per coder
reliability_data = [rd[coder].tolist() for coder in rd.columns]

# Krippendorff's alpha for nominal labels, rounded to two decimals
alpha = round(krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal'), 2)
print(f"Krippendorff's alpha: {alpha}")

In [None]:
# Section "Validation of the Classification Results"

# Text in the article:
# " The estimate of the accuracy score for Newsmap amounted to 89% " 

# Share of rows in the accuracy sample whose 'country' equals 'c1'
# NOTE(review): the sentiment cells compare against a column named 'coder1',
# while this cell uses 'c1' - presumably the annotator's country label; confirm
# against the columns of validation.csv
print("Accuracy of Newsmap", (sa['country'] == sa['c1']).mean())

In [None]:
# Text in the article:
# " while the precision estimates for the labels “Russia,” “US,” “Ukraine,”
#    “United Kingdom” were 100%, 84%, 94%, and 86% respectively"

# Rows of the Newsmap precision sample, one subset per country-label
ru, us, ua, gb = (pn[pn['country'] == code] for code in ('ru', 'us', 'ua', 'gb'))

# Function to calculate and print precision
def print_precision(country_code, df):
    """
    Prints the precision of Newsmap for one country-label.

    Parameters:
    country_code (str): The country code; printed in upper case.
    df (DataFrame): The rows to evaluate; must contain the columns 'value' and 'coder1'.

    The precision is the share of rows in which 'value' equals 'coder1'.
    Nothing is returned.
    """
    matches = df['value'].eq(df['coder1'])
    print("Precision of Newsmap, {}: {}".format(country_code.upper(), matches.mean()))

# Print precision for each country-label
for country_code, country_sample in (('ru', ru), ('us', us), ('ua', ua), ('gb', gb)):
    print_precision(country_code, country_sample)

# Labels given by the two coders for the whole Newsmap precision sample, as integers
data = {coder: pn[coder].astype(int) for coder in ('coder1', 'coder2')}

dat = pd.DataFrame(data)

# Reliability data for Krippendorff's alpha: one list of labels per coder
reliability_data = [dat[coder].tolist() for coder in dat.columns]

# Krippendorff's alpha for nominal labels, rounded to two decimals
alpha = round(krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal'), 2)
print(f"Krippendorff's alpha: {alpha}")

In [None]:
# Figure 1. Corpus from Channel One

# Data for Figure 1 

# Days within the period under consideration
# One row per calendar day from '1999-12-31' to '2022-02-24'. The days are
# stored as plain date objects in the index, named 'dates'; the dataframe has
# no columns and only provides the full calendar for the table built below.
all_days = pd.DataFrame(
    {"dates": [day.date() for day in pd.date_range(start = '1999-12-31', end = '2022-02-24')]}
).set_index("dates")

# Episodes per day
# A column of ones is added to 'df' so that summing it per date gives the
# daily number of episodes
df['episodes_daily'] = 1  
counts_all = pd.DataFrame(df.groupby(by = "dates")['episodes_daily'].sum())

# Episodes per day about RU -- domestic news
# The rows labelled 'ru' are counted per date directly. No helper column is
# written onto the filtered rows, which avoids SettingWithCopyWarning.
counts_daily_ru = (
    df.loc[df['country'] == 'ru']
    .groupby(by = "dates")
    .size()
    .to_frame('episodes_daily_ru')
)

# Put the calendar and the two daily counts side by side, aligned on the date
dat = pd.concat([all_days, counts_all, counts_daily_ru], axis = 1)

# Days without episodes get a count of 0
dat = dat.fillna(0)

# Share of daily episodes related to Russia ('episodes_daily_ru')
# compared to all daily episodes ('episodes_daily')
dat['episodes_daily_ru_share'] = dat['episodes_daily_ru'] / dat['episodes_daily']

# For the pie chart
# Number of rows in 'df' for each of the country labels 'ru', 'us', 'ua' and 'gb'.
# The codes are compared exactly, so they must not contain surrounding
# whitespace (a code such as ' gb' would never match and the count would be 0).
ru, us, ua, gb = (
    int((df['country'] == code).sum())
    for code in ('ru', 'us', 'ua', 'gb')
)

# Number of rows with any other country label
other = df.shape[0] - (ru + us + ua + gb)

# Country labels and their respective episode counts, in the same order
countries = ['ru', 'us', 'ua', 'gb', 'other']
episodes = [ru, us, ua, gb, other]

# Summary dataframe with 'countries' and 'episodes' columns
summary = pd.DataFrame({'countries': countries, 'episodes': episodes})

In [None]:
# Drawing Figure 1. Corpus from Channel One

# Create a figure with a specific size and white background
fig = plt.figure(figsize = (15, 5), facecolor = 'white')

# Update font family for the plot
plt.rcParams.update({'font.family' : 'Times New Roman'})

# Layout on a 2 x 6 grid: the two time series are stacked in the five
# left-hand columns, the pie chart takes the last column over both rows
ax1 = plt.subplot2grid((2, 6), (0, 0), colspan = 5, rowspan = 1)
ax2 = plt.subplot2grid((2, 6), (0, 5), colspan = 1, rowspan = 2)
ax3 = plt.subplot2grid((2, 6), (1, 0), colspan = 5, rowspan = 1)

# --- Top panel: daily counts of news reports in the corpus
# Actual data
ax1.plot(dat['episodes_daily'], color = 'black')  
# Rolling mean over 50 days
ax1.plot(dat['episodes_daily'].rolling(window = 50).mean(), color = 'lightgray', linewidth = 1)
# The x-axis ticks are removed here; the dates are shown on the bottom panel only
ax1.set_xticks([])
ax1.set_title("News reports in the corpus (daily counts)", fontsize = 17)  
ax1.tick_params(axis = 'y', labelsize = 13)  

# --- Right panel: distribution of episodes by country label (one colour per wedge)
ax2.pie(summary['episodes'],
        colors = ['dimgray', 'darkgrey', 'silver', 'lightgray', 'whitesmoke'],
        labels = ['RU', 'US', 'UA', 'GB', 'Other'],
        textprops = {'fontsize': 14},
        labeldistance = 1.17,
        radius = 1.2)
ax2.set_title("Country labels \n in the corpus \n ", fontsize = 16)

# --- Bottom panel: rolling mean of the daily share of episodes related to Russia
ax3.plot(dat['episodes_daily_ru_share'].rolling(window = 50).mean(), color = 'dimgray', linewidth = 2)
# One major tick per year, placed in December
ax3.xaxis.set_major_locator(mdates.MonthLocator(bymonth = 12))
# Show the shares on the y-axis as whole percentages
ax3.yaxis.set_major_formatter(plt.FuncFormatter(lambda val, _: '{:.0%}'.format(val)))
ax3.tick_params(axis = 'y', labelsize = 13)
ax3.tick_params(axis = 'x', rotation = 70, labelsize = 13)  
ax3.set_title("Domestic news reports in the corpus (daily shares)", fontsize = 17)

# Vertical dashed lines at the dates in election_dates, on both time-series panels
for panel in (ax1, ax3):
    for term_start in election_dates:
        panel.axvline(x = term_start, color = 'dimgray', linestyle = "dashed", linewidth = 1)

# Create a new figure with white background for the legend
plt.figure(facecolor = 'white')

# Dummy lines that only provide the legend entries
plt.plot([0, 0], color = 'black', label = "Episodes, daily", linewidth = 7)
plt.plot([0, 0], color = 'lightgray', label = 'Episodes, daily (a moving average, n = 50)', linewidth = 7)
plt.plot([0, 0], color = 'dimgray', label = 'Share of daily newsflow (a moving average, n = 50)', linewidth = 7)
plt.plot([0, 0], color = 'black', linestyle = "dashed", label='New presidential term', linewidth = 3)

# Add legend with specified properties and position
plt.legend(prop = {'size': 35}, bbox_to_anchor = (1.1, 1.05))

# Show the figures
plt.show()

In [None]:
# Table 1 "Sentiment analysis: classification results"

# 'domestic' equals 1 for stories with the country label 'ru' and 0 otherwise
df['domestic'] = (df['country'] == 'ru').astype(int)

# Get unique years
unique_years = df['year'].unique()

# 'year_dummies' holds the year as a string with the prefix 'y' (e.g. 'y2000'),
# so that the formulas below treat it as a categorical variable. The column is
# built in one vectorised step instead of one pass over the dataframe per year.
df['year_dummies'] = 'y' + df['year'].astype(str)

# Models 1-3: OLS regressions of the sentiment score, fitted with the HC3 covariance type

# Table 1, Model 1
print ('\n =============================== Model 1 =============================== \n')
result = sm.ols(formula="sentiment_score~putin ", data=df).fit(cov_type='HC3')  
print(result.summary())

# Table 1, Model 2
print ('\n =============================== Model 2 =============================== \n')
result = sm.ols(formula="sentiment_score~putin+domestic ", data=df).fit(cov_type='HC3')  
print(result.summary())

# Table 1, Model 3
print ('\n =============================== Model 3 =============================== \n')
result = sm.ols(formula="sentiment_score~putin+domestic+year_dummies", data=df).fit(cov_type='HC3')  
print(result.summary())

# Models 4-6: logit regressions of the 'putin' indicator on the sentiment label

# Table 1, Model 4
print ('\n =============================== Model 4 =============================== \n')
result = sm.logit(formula="putin ~ sentiment_labels", data=df).fit()
print(result.summary())

# Table 1, Model 5
print ('\n =============================== Model 5 =============================== \n')
result = sm.logit(formula="putin ~ sentiment_labels+domestic+year_dummies", data=df).fit()
print(result.summary())

# Table 1, Model 6
# Same specification as Model 5, fitted without the stories labelled 'neutral'
print ('\n =============================== Model 6 =============================== \n')
df1 = df[df['sentiment_labels']!='neutral']
result = sm.logit(formula="putin ~ sentiment_labels+ domestic + year_dummies", data=df1).fit()
print(result.summary())

In [None]:
# Data for figures 2, 3, 4 and 5 and Table 2

# Days within the period under consideration
# One row per calendar day from '1999-12-31' to '2022-02-24', with the days
# (plain date objects) as the index named "dates" and no columns
all_days = pd.DataFrame(
    {"dates": [day.date() for day in pd.date_range(start = '1999-12-31', end = '2022-02-24')]}
).set_index("dates")

# Episodes per day (sum of a column of ones per date)
df['episodes_daily'] = 1
counts_all = pd.DataFrame(df.groupby(by = "dates")['episodes_daily'].sum())

# Counts - mentions of Putin, per day
putin = df.groupby(by = "dates")['putin'].sum().to_frame("putin_episodes_daily")

# Rows split by country label: everything except 'ru' vs. 'ru'
foreign_rows = df[df['country'] != 'ru']
domestic_rows = df[df['country'] == 'ru']

# Foreign episodes per day
foreign = foreign_rows.groupby(by = "dates")['episodes_daily'].sum().to_frame("foreign_episodes_daily")
# Domestic episodes per day
domestic = domestic_rows.groupby(by = "dates")['episodes_daily'].sum().to_frame("domestic_episodes_daily")

# Foreign episodes that mention Putin, per day
foreign_putin = foreign_rows.groupby(by = "dates")['putin'].sum().to_frame("foreign_putin_daily")
# Domestic episodes that mention Putin, per day
domestic_putin = domestic_rows.groupby(by = "dates")['putin'].sum().to_frame("domestic_putin_daily")

# This dataframe will be used in the tests and figures below
# (calendar plus all daily counts, aligned on the date)
dat = pd.concat(
    [all_days, counts_all, putin, foreign, domestic, foreign_putin, domestic_putin],
    axis = 1,
)

# Daily shares derived from the counts
dat['putin_daily_share'] = dat['putin_episodes_daily'] / dat['episodes_daily']
dat['putin_foreign_share'] = dat['foreign_putin_daily'] / dat['foreign_episodes_daily']
dat['putin_domestic_share'] = dat['domestic_putin_daily'] / dat['domestic_episodes_daily']
dat['share_of_foreign_in_putin'] =  dat['foreign_putin_daily'] / dat['putin_episodes_daily']

# Missing values (in the counts as well as in the shares) are set to 0
dat = dat.fillna(0)

In [None]:
# Figure 2. Sentiment Estimates

# Sentiment scores of all stories, indexed by date
dat1 = df[['dates', 'sentiment_score']]
dat1 = dat1.set_index('dates')

# Sentiment scores of the stories where 'putin' equals 1, indexed by date
dat2 = df[df['putin'] == 1]
dat2 = dat2[['dates', 'sentiment_score']]
dat2 = dat2.set_index('dates')

# Two stacked panels, set figure size and background color
fig, ax = plt.subplots(nrows = 2, ncols = 1, figsize = (15, 6), facecolor = 'white')

# Each panel shows a moving average of the score over a fixed number of
# stories: 50 in the top panel, 1,000 in the bottom panel.
# Black: all stories; grey: stories that mention Putin.
for panel, window in zip(ax, (50, 1000)):
    panel.plot(dat1['sentiment_score'].rolling(window = window).mean(), color = 'black', linewidth = 2)
    panel.plot(dat2['sentiment_score'].rolling(window = window).mean(), color = 'darkgrey', linewidth = 2)

# Formatting shared by both panels
for panel in ax:
    # One major tick per year, placed in December
    panel.xaxis.set_major_locator(mdates.MonthLocator(bymonth = 12))
    panel.tick_params(axis = 'x', rotation = 45, labelsize = 16)
    panel.tick_params(axis = 'y', labelsize = 16)
    # Two decimals on the y-axis labels
    panel.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.2f}'))

# Top panel: title, y-label, and no x-axis tick labels
ax[0].set_title('Sentiment estimates of news stories (Rubert-tiny)', fontsize = 20)
ax[0].set_ylabel('A moving average\n (n=50)', fontsize = 16)
ax[0].set_xticklabels([])  # Remove x-axis tick labels

# Bottom panel: y-label and vertical dashed lines at the dates in election_dates
ax[1].set_ylabel('A moving average\n (n=1,000)', fontsize = 16)
for term_start in election_dates:
    ax[1].axvline(x = term_start, color = 'black', linestyle = "dashed", linewidth = 1)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Legend for Figure 2
# The legend is drawn in a separate figure from the plot itself

# Create a new figure with white background for the legend
plt.figure(facecolor = 'white')

# Plot lines with specific colors and labels for the legend
# (the lines themselves carry no data; they only provide the legend entries)
plt.plot([0, 0], color = 'black', label = "All stories", linewidth = 2)
plt.plot([0, 0], color = 'darkgrey', label = "Stories that involve Putin", linewidth = 2)
plt.plot([0, 0], color = 'black', linestyle = "dashed", label = 'New presidential term', linewidth = 2)

# Add legend with specified properties and position
plt.legend(prop={'size' : 15}, bbox_to_anchor=(2, 2))

# Adjust layout and display the legend
plt.tight_layout()
plt.show()

In [None]:
# Histogram at the bottom of Figure 2

# Stories where 'putin' equals 1
df1 = df[df['putin'] == 1]

# Numbers of stories shown in the titles of the histograms
print(df.shape[0])
print(df1.shape[0]) 

# Create a figure with 1 row and 2 columns for the histograms
fig, axs = plt.subplots(1, 2, figsize = (7, 2))

# Histograms of 'sentiment_score': all stories (left), stories that mention Putin (right)
axs[0].hist(df['sentiment_score'], bins = 100, color = 'black', label = 'Histogram 1')
axs[1].hist(df1['sentiment_score'], bins = 100, color = 'gray', label = 'Histogram 2')

# Titles: the sample sizes are taken from the data (with thousands separators),
# so they always match the dataframes that are actually plotted
axs[0].set_title(f'All stories (n={df.shape[0]:,})')
axs[1].set_title(f'Stories that involve Vladimir Putin (n={df1.shape[0]:,})')
axs[0].set_ylabel('Frequency')
# Hide y-axis labels for the second histogram
axs[1].tick_params(axis = 'y', labelleft = False) 
# Thousands separators on the y-axis labels of the first histogram
axs[0].get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ',')))  
axs[0].set_xlabel('Sentiment score')
axs[1].set_xlabel('Sentiment score')

# Adjust layout and display the histograms
plt.tight_layout()
plt.show()

In [None]:
# Text in the article:

# " Overall, 23% of the stories in the corpus were labeled as negative, 
#   52% as neutral, and 25% as positive. The related shares for the stories 
#   that involve Putin are 9%, 56%, and 34%. "

def _label_share(frame, label):
    """Share of rows in `frame` whose 'sentiment_labels' equals `label`, rounded to 2 decimals."""
    matching = frame[frame['sentiment_labels'] == label]
    return round(matching.shape[0] / frame.shape[0], 2)

# Shares over the whole corpus
print('Overall: \n')
print('Negative:', _label_share(df, 'negative'))
print('Neutral', _label_share(df, 'neutral'))
print('Positive', _label_share(df, 'positive'), '\n')

# Shares over the stories that involve Putin only
print('Involve Putin:')

dfp = df[df['putin']==1] # subset to only stories that involve 'Putin'
print('Negative:', _label_share(dfp, 'negative'))
print('Neutral', _label_share(dfp, 'neutral'))
print('Positive', _label_share(dfp, 'positive'), '\n')

In [None]:
# Figure 3. The daily share of news reports that mention Vladimir Putin

# Single wide panel on a white background
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 2), facecolor = 'white')

# Dashed vertical markers, one per entry of election_dates
for election_day in election_dates:
    ax.axvline(x = election_day, color = 'black', linestyle = "dashed", linewidth = 1)

# Axis formatting: percentage y-axis, x ticks placed by the month locator
ax.yaxis.set_major_formatter(PercentFormatter(1))
ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth = (0, 12)))
ax.tick_params(axis = 'x', rotation = 45, labelsize = 16)
ax.tick_params(axis = 'y', labelsize = 13)

# Title
ax.set_title("The daily share of news reports that mention Vladimir Putin", fontsize = 17)

# This formatter replaces the PercentFormatter set above (whole-number percentages)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.set_ylabel('A moving average \n (n=50)', fontsize = 16)

# 50-observation moving average of the daily share
putin_share_ma = dat['putin_daily_share'].rolling(window = 50).mean()
ax.plot(putin_share_ma, color = 'black', linewidth = 2)

# Note about the Medvedev presidency, positioned in data coordinates
note_box = dict(boxstyle = "round", fc = 'white', color = 'white')
plt.annotate("Dmitry Medvedev \n serves as the president,\n May 2008–May 2012",
             xy = (dat.index[5050], 0), xytext = (dat.index[3200], 0.18),
             size = 14, bbox = note_box)

# The same note again, positioned in axes coordinates.
# NOTE(review): this duplicates the annotation above; confirm both are intended.
props = dict(facecolor = 'white', edgecolor = 'white', alpha = 0.8)
ax.text(0.4, 0.7, 'Dmitry Medvedev \n serves as the president, \n May 2008—May 2012',
        ha = 'left', va = 'center', transform = ax.transAxes, fontsize = 14, bbox = props)

# Display the plot
plt.show()

In [None]:
# Table 2. Daily references to Putin

# Copy the index values into a regular 'dates' column so they can be compared
# with the term boundaries below
dat['dates'] = dat.index.values

# Label every day with the term it falls into.
# The column mixes integers (1, 2, 3, 4) with the string 'Medvedev', so it is
# created with object dtype up front: writing a string into an int64 column
# relies on silent upcasting, which pandas has deprecated.
dat['term'] = pd.Series(1, index = dat.index, dtype = object)

# Boundaries are applied in ascending order (el2 .. el5), so a later boundary
# overwrites the label written by an earlier one.
# NOTE(review): el2-el5 are defined outside this cell; presumably the dates on
# which each new term starts -- confirm.
term_boundaries = [(el2, 2), (el3, 'Medvedev'), (el4, 3), (el5, 4)]
for boundary, label in term_boundaries:
    dat.loc[dat['dates'] > boundary, 'term'] = label

# Model 1: daily share regressed on the term label, HC3 robust covariance
print ('\n =============================== Model 1 =============================== \n')
result = sm.ols(formula = "putin_daily_share ~ term", data = dat).fit(cov_type = 'HC3') 
print(result.params)
print(result.summary())

# Model 2: adds foreign_episodes_daily as a regressor
print ('\n =============================== Model 2 =============================== \n')
result = sm.ols(formula="putin_daily_share ~ term + foreign_episodes_daily", data = dat).fit(cov_type = 'HC3') 
print(result.params)
print(result.summary())

# Augmented Dickey-Fuller test on the daily share series
print ('\n =============================== ADF test =============================== \n')
result = adfuller(dat['putin_daily_share'])
adf_statistic = result[0]
p_value = result[1]
critical_values = result[4]

# Display the test statistic, p-value and critical values
print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')
print('Critical Values:')
for key, value in critical_values.items():
    print(f'   {key}: {value}')

In [None]:
# Text in the article (subsection 'H2 (Frequency)'):

# " On average, Vladimir Putin was mentioned in 13% of the news episodes during his
#   first presidential term (2000–2004),
#   10% during the second (2004–2008),
#   6% during his premiership (2008–2012),
#   13% during the third (2012–2018)
#   and in 20% during the period between
#   the beginning of his fourth presidential term (2018) and February 23, 2022 "

dfp = df[df['putin']==1] # only stories that mention Putin

def proportion(term):
    """
    Share of stories in a given term that mention Putin.

    Counts the rows of dfp (stories that mention Putin) and of df (all stories)
    whose 'term' equals `term`, and returns their ratio rounded to 2 decimals.
    """
    # Number of Putin stories in the given term
    dfp_term_count = dfp[dfp['term'] == term].shape[0]

    # Number of all stories in the given term
    df_term_count = df[df['term'] == term].shape[0]

    # Ratio rounded to 2 decimal places
    return round(dfp_term_count / df_term_count, 2)

print('First presidential term:', proportion(1))
print('Second presidential term:', proportion(2))
print('Premiership:', proportion('Medvedev'))
# The third term is labelled 3 (this line previously re-used the first term's label)
print('Third presidential term:', proportion(3))
print('The period between the beginning of his fourth presidential term (2018) and February 23, 2022:', proportion(4))

In [None]:
# Text in the article (subsection 'H2 (Frequency)'):
# Footnote: 

# " The stories mentioning Putin are, on average, 50.31% longer than all the other stories "
# dfp is not defined in this cell; it is taken from the notebook state.
# NOTE(review): the baseline is the mean length over ALL rows of df, not only
# the rows outside dfp -- confirm this matches the wording of the footnote.
mean_length_putin = dfp['textlength'].mean()
mean_length_all = df['textlength'].mean()
relative_difference = (mean_length_putin - mean_length_all) / mean_length_all * 100
print('The difference is',
      round(relative_difference, 2),
      '%\n')

# "(the difference is statistically significant at 0.01 level) "
# OLS of text length on the Putin indicator with HC3 robust covariance
result = sm.ols(formula = "textlength ~ putin", data = df).fit(cov_type='HC3') 
print(result.params)
print(result.summary())

In [None]:
# Text in the article (subsection H3 (Country-topics):

# " The ruler was covered by 3% of the news about foreign affairs and events
#   abroad in 2012, 5% in 2013, 8% in 2014, 9% in 2015, 10% in 2016,
#   12% in 2017, 14% in 2018, 2019, and 2020, 18% in 2021, and 24% in the first two months of 2022. "

def share(year):
    """
    Percentage of the foreign stories of a given year that involve Putin.

    A story is treated as foreign when its 'country' is anything other than 'ru'.
    The result is rounded to a whole number.
    """
    # All foreign stories of the year
    foreign_in_year = df[(df['year'] == year) & (df['country'] != 'ru')]
    f = foreign_in_year.shape[0]

    # Those of them that involve Putin
    fp = foreign_in_year[foreign_in_year['putin'] == 1].shape[0]

    return round(fp / f * 100)

# Print the shares for each year from 2012 to 2022
for year in range(2012, 2023):
    print(year, ':', share(year), '%')

In [None]:
# Figure 4: single wide panel on a white background

fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 3), facecolor = 'white')

# 50-observation moving averages: domestic share in black, foreign share in gray
share_series = (('putin_domestic_share', 'black'), ('putin_foreign_share', 'gray'))
for column, colour in share_series:
    ax.plot(dat[column].rolling(window = 50).mean(), color = colour, linewidth = 2)

# Dashed vertical markers, one per entry of election_dates
for election_day in election_dates:
    ax.axvline(x = election_day, color = 'black', linestyle = "dashed", linewidth = 1)

# Axis formatting: percentage y-axis, x ticks placed by the month locator
ax.yaxis.set_major_formatter(PercentFormatter(1))
ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth = (0, 12)))
ax.tick_params(axis = 'x', rotation = 45, labelsize = 16)
ax.tick_params(axis = 'y', labelsize = 13)

# This formatter replaces the PercentFormatter set above (whole-number percentages)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# Title and the note about the Medvedev presidency
ax.set_title('Daily shares of all domestic and foreign news stories that mention Vladimir Putin', fontsize = 18)
note_box = dict(boxstyle = "round", fc = 'white', color = 'white')
ax.annotate("Dmitry Medvedev \n serves as the president,\n May 2008–May 2012",
             xy = (dat.index[5050], 0), xytext = (dat.index[3200], 0.18),
             size = 14, bbox = note_box)

# y-axis label, then tighten the layout
ax.set_ylabel('A moving average\n (n=50)', fontsize = 16)
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# Legend for Figure 4

# The legend is rendered on its own white figure; the plotted lines are
# placeholders that only provide the legend handles.
plt.figure(facecolor = 'white')

for colour, caption in (('black', "Domestic"), ('gray', "Foreign")):
    plt.plot([0, 0], color = colour, label = caption, linewidth = 7)

# Place the legend box with the requested font size and anchor
plt.legend(bbox_to_anchor = (2, 2), prop = {'size': 35})

# Tighten the layout and render the legend
plt.tight_layout()
plt.show()

In [None]:
# Figure 5: The share of foreign news in all the stories that mention Vladimir Putin

# Single wide panel on a white background
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 3), facecolor = 'white')

# 50-observation moving average of the share of foreign news among Putin stories
ax.plot(dat['share_of_foreign_in_putin'].rolling(window = 50).mean(), color = 'black', linewidth = 2)

# Note about the Medvedev presidency
plt.annotate("Dmitry Medvedev \n serves as the president,\n May 2008–May 2012",
             xy = (dat.index[5000], 0.45), xytext = (dat.index[3200], 0.38),
             size = 14, bbox = dict(boxstyle = "round", fc = 'white', color = 'white'))

# Dashed vertical markers, one per entry of election_dates
for i in election_dates:
    ax.axvline(x = i, color = 'black', linestyle = "dashed", linewidth = 1)

# Axis formatting. Each property is set exactly once: earlier versions of this
# cell also installed PercentFormatter(1) and PercentFormatter(2) and a y label
# size of 13, all of which were overwritten before the figure was drawn.
ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth = (0, 12)))
ax.tick_params(axis = 'x', rotation = 45, labelsize = 16)
ax.tick_params(axis = 'y', labelsize = 16)
# y values are fractions; show them as whole-number percentages
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# Set title, ylabel, and display the plot
ax.set_title('The share of foreign news in all the stories that mention Vladimir Putin', fontsize = 20)
ax.set_ylabel('A moving average\n (n=50)', fontsize = 16)
plt.show()

In [None]:
# Text in the article (subsection H3 (Country-topics):

# "However, importantly, every year from 2000 to the first two months of 2022,
#  Vladimir Putin has been more frequently mentioned in reports that cover domestic
#  events than in stories about foreign affairs and news originating abroad.


def more_frequently (year):
    """
    Return True when, in the given year, more Putin stories are domestic
    ('country' equal to 'ru') than foreign ('country' different from 'ru').
    """
    # Stories of the year that involve Putin
    putin_in_year = df[(df['year'] == year) & (df['putin'] == 1)]

    # Split them into foreign and domestic counts
    fp = putin_in_year[putin_in_year['country'] != 'ru'].shape[0]
    dp = putin_in_year[putin_in_year['country'] == 'ru'].shape[0]

    return dp > fp

# Run the check for every year from 2000 to 2022
for year in range(2000, 2023):
    print(year, ':', more_frequently(year))

In [None]:
# Text in the article (subsection H3 (Country-topics):

# " On average, news about foreign affairs and events amounted to 28% of all 
#   reports that covered the ruler during his acting presidency and the first
#   term in power (2000–2004), 24% during his second presidential term (2004–2008),
#   13% during his premiership (2008–2012),
#   35% during his third presidential term (2012–2018),
#   and 27% during the period between the beginning of
#   his fourth presidential term (2018)
#   and February 23, 2022. " 


def prop_foreign (term):
    """
    Percentage of the Putin stories of a given term that are foreign news.

    A story is treated as foreign when its 'country' is anything other than 'ru'.
    The result is rounded to a whole number.
    """
    # Stories that mentioned Putin in the given term
    putin_in_term = df[(df['term'] == term) & (df['putin'] == 1)]
    p = putin_in_term.shape[0]
    # Foreign stories among them
    fp = putin_in_term[putin_in_term['country'] != 'ru'].shape[0]

    return round(fp / p * 100)

# Print the terms in chronological order. Iterating over set(df['term']) gives
# an arbitrary order because the labels mix integers with the string 'Medvedev'.
chronological_terms = [1, 2, 'Medvedev', 3, 4]
present_terms = list(df['term'].unique())
ordered_terms = [t for t in chronological_terms if t in present_terms]
# Any label outside the known five is still reported, after the known ones
ordered_terms += [t for t in present_terms if t not in chronological_terms]

for term in ordered_terms:
    print('Term', term, prop_foreign(term), '%')

In [None]:
# Text in the article (subsection H3 (Country-topics):
# " The coverage was primarily focused on Ukraine and the United States. "

# Country labels of the foreign stories (country other than 'ru') that involve Putin
is_foreign_putin = (df['country'] != 'ru') & (df['putin'] == 1)
p = df[is_foreign_putin]['country']

# Five most frequent country labels, most frequent first
country_frequencies = p.value_counts().sort_values(ascending=False)
print(country_frequencies.head(5))

In [None]:
# Data for Figure 6

# Mapping from the two-letter country labels produced by Newsmap to the
# three-letter location codes passed to Plotly.
# Keeping each pair side by side (instead of two parallel lists) makes
# misaligned entries visible. Two entries were corrected this way:
#   'se' (Sweden)      -> 'SWE'  (was 'SEN', which duplicated Senegal)
#   'kp' (North Korea) -> 'PRK'  (was 'KOR', the code of South Korea)
newsmap_to_plotly = {
    'ae': 'ARE', 'af': 'AFG', 'am': 'ARM', 'ar': 'ARG', 'at': 'AUT', 'au': 'AUS',
    'az': 'AZE', 'be': 'BEL', 'bg': 'BGR', 'br': 'BRA', 'by': 'BLR', 'ca': 'CAN',
    'ch': 'CHE', 'cn': 'CHN', 'co': 'COL', 'cu': 'CUB', 'cy': 'CYP', 'cz': 'CZE',
    'de': 'DEU', 'dz': 'DZA', 'ec': 'ECU', 'eg': 'EGY', 'es': 'ESP', 'fi': 'FIN',
    'fr': 'FRA', 'gb': 'GBR', 'ge': 'GEO', 'gr': 'GRC', 'hu': 'HUN', 'id': 'IDN',
    'il': 'ISR', 'iq': 'IRQ', 'ir': 'IRN', 'it': 'ITA', 'jp': 'JPN', 'kg': 'KGZ',
    'kp': 'PRK', 'kz': 'KAZ', 'lt': 'LTU', 'lv': 'LVA', 'md': 'MDA', 'ml': 'MLI',
    'mt': 'MLT', 'mx': 'MEX', 'ng': 'NGA', 'nl': 'NLD', 'nz': 'NZL', 'ph': 'PHL',
    'pk': 'PAK', 'pl': 'POL', 'qa': 'QAT', 'rs': 'SRB', 'sa': 'SAU', 'sg': 'SGP',
    'sk': 'SVK', 'ss': 'SSD', 'sy': 'SYR', 'tj': 'TJK', 'tm': 'TKM', 'tr': 'TUR',
    'ua': 'UKR', 'us': 'USA', 'uz': 'UZB', 've': 'VEN', 'vn': 'VNM', 'za': 'ZAF',
    'al': 'ALB', 'ba': 'BIH', 'bh': 'BHR', 'bo': 'BOL', 'cl': 'CHL', 'ee': 'EST',
    'gn': 'GIN', 'gq': 'GNQ', 'hr': 'HRV', 'in': 'IND', 'is': 'ISL', 'jo': 'JOR',
    'lb': 'LBN', 'lu': 'LUX', 'ly': 'LBY', 'ma': 'MAR', 'mc': 'MCO', 'me': 'MNE',
    'mn': 'MNG', 'my': 'MYS', 'mz': 'MOZ', 'no': 'NOR', 'np': 'NPL', 'pt': 'PRT',
    'py': 'PRY', 'ro': 'ROU', 'se': 'SWE', 'si': 'SVN', 'sn': 'SEN', 'tg': 'TGO',
    'th': 'THA', 'ug': 'UGA', 'uy': 'URY',
}

# Parallel lists, in the same order as the mapping above
newsmap_code = list(newsmap_to_plotly.keys())
plotly_code = list(newsmap_to_plotly.values())

# Foreign stories (country other than 'ru') that involve Putin
df1 = df[df['country'] != 'ru']
df1 = df1[df1['putin'] == 1]

# Number of such stories per Newsmap label; labels that never occur get 0
country_counts = df1['country'].value_counts()
cnt = [int(country_counts.get(code, 0)) for code in newsmap_code]

# Assemble the table used by the Plotly map
data = {'newsmap_code': newsmap_code,
        'plotly_code': plotly_code,
        'cnt': cnt}

plotlymap = pd.DataFrame(data)

In [None]:
# N of episodes - for the legend
# df1 is taken from the notebook state (the data cell for Figure 6 builds it)
n_episodes = df1.shape[0]
print(n_episodes)

# Define a grey colorscale for the map
greys = [
    [0, '#f7f7f7'], [0.1, '#eeeeee'], [0.2, '#e5e5e5'], [0.3, '#d9d9d9'],
    [0.4, '#cccccc'], [0.5, '#b0b0b0'], [0.6, '#999999'], [0.7, '#7d7d7d'],
    [0.8, '#666666'], [0.9, '#4d4d4d'], [1.0, '#333333']
]

# Create a choropleth map using Plotly
fig = go.Figure(data = go.Choropleth(
    locations = plotlymap['plotly_code'],
    z = plotlymap['cnt'],
    colorscale = greys,  # Use the grey colorscale
    autocolorscale = False,
    marker_line_color = 'black',
    marker_line_width = 0.5,
    colorbar_tickprefix = '',
    # The episode count is taken from the data (with a thousands separator)
    # instead of being typed in by hand
    colorbar_title = f'Episodes (n = {n_episodes:,})',
    colorbar = dict(
        tickformat = ',',  # Adds commas as thousand separators
    )
))

# Customize layout settings for the map
fig.update_layout(
    font_family = "Times New Roman",
    font_color = "black",
    title_font_family = "Times New Roman",
    title_font_color = "black",
    legend_title_font_color = "green",
    title = {
        'text': "News reports about foreign affairs and foreign events that mention Vladimir Putin",
        'x': 0.5,  # Center the title horizontally
        'xanchor': 'center'  # Anchor the title at its center
    }
)

# Display the choropleth map
fig.show()

In [None]:
# Appendix A1. Example of a news transcript

# Print the seventh transcript (position 6) among the stories dated 12 March 2019
example_day = date(2019, 3, 12)
transcripts = list(df[df['dates'] == example_day]['text'])
print(transcripts[6])

In [None]:
# Appendix A3. Labelled news reports: positive, neutral, negative

# One example per sentiment label: (heading, day, label, position within that day's stories)
labelled_examples = [
    ('Positive\n', date(2019, 3, 13), 'positive', 7),
    ('Neutral\n', date(2019, 3, 12), 'neutral', 3),
    ('Negative\n', date(2019, 1, 10), 'negative', 0),
]

for heading, day, label, position in labelled_examples:
    # Stories of that day carrying the requested label
    x = df[(df['dates'] == day) & (df['sentiment_labels'] == label)]
    print(heading, list(x['text'])[position])

In [None]:
# Appendix A4. Newsmap

# Text in the Appendix: 
# " Based on the classification results, the most popular countries of news event
#   origin in the dataset are Russia (55%), the United States (10%), Ukraine (7%),
#   France (2%), the United Kingdom (2%), Germany (2%), Syria (2%), Georgia (1%),
#   Japan (1%), and Turkey (1%) "

# Ten most frequent country labels with their story counts
country_counts = df['country'].value_counts()
top = country_counts.nlargest(10).reset_index()

# Name the two resulting columns
top.columns = ['country', 'count']

# Share of all stories, rounded to 2 decimals
top['share'] = (top['count'] / df.shape[0]).round(2)
print(top)

In [None]:
# A5. Human-based validation

# Krippendorff's alpha for the accuracy of Rubert Tiny 

def label_coder_data(df):
    """
    Extract the two coder columns as integers.

    Args:
    df (pd.DataFrame): Frame that has 'coder1' and 'coder2' columns whose
        values can be converted to int.
    Returns:
    pd.DataFrame: A new frame holding only 'coder1' and 'coder2', both cast
        to int, on the same index as the input.
    """
    coder_columns = ['coder1', 'coder2']
    return df[coder_columns].astype(int)

# Krippendorff's alpha (nominal level) between the two coders, for each of the
# three validation samples.
# NOTE(review): sa, pr and pn are defined outside this cell; per the captions
# below they presumably hold the accuracy sample, the Rubert Tiny precision
# sample and the Newsmap precision sample -- confirm.
validation_samples = [
    (sa, "Krippendorff's alpha for the accuracy: "),
    (pr, "Krippendorff's alpha for precision of Rubert Tiny : "),
    (pn, "Krippendorff's alpha for precision Newsmap: "),
]

for coded_sample, caption in validation_samples:
    # Keep the two coder columns as integers
    rd = label_coder_data(coded_sample)
    # One list of labels per coder, as expected by krippendorff.alpha
    reliability_data = [rd['coder1'].tolist(), rd['coder2'].tolist()]
    # The result is stored as alpha_value so that the imported `alpha`
    # function is not overwritten by a number
    alpha_value = round(krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal'), 2)
    print(f"{caption}{alpha_value}")

In [None]:
# Appendix A6. Most frequent words in the sentences that refer to Vladimir Putin

# Stories that involve Putin, with the columns needed here
df1 = df[df['putin'] == 1][['dates', 'text', 'putin']]

# For every story keep only the sentences (split on '.') that contain 'Путин'
sentences = [''.join([sentence + '.' for sentence in text.split('.') if 'Путин' in sentence]) for text in df1['text']]

# Split the kept sentences into words
allwords = list(itertools.chain.from_iterable([sentence.split() for sentence in sentences]))

# Lowercase and strip surrounding punctuation
cleaned_words = [word.lower().strip('.,:;«»"') for word in allwords]

# Stopwords as a set
# NOTE(review): `stopwords` must already hold an iterable of words at this
# point (not visible in this cell) -- confirm.
stopwords = set(stopwords)
# Additional context-specific stopwords and declensions of the name 'Vladimir Putin'
todrop = ['', '"мы', '-', '1', '10', '2', '20', 'в', 'владимир', 'владимира',
          'владимиром', 'владимиру', 'вместе', 'вообще', 'время', 'всe', 'все,',
          'всего,', 'всем', 'всё', 'второй', 'выпуске', 'год', 'года', 'году',
          'государства', 'двух', 'действительно', 'день', 'дня', 'другие',
          'других', 'ещe', 'ещё', 'затем', 'и', 'и,', 'из-за', 'именно',
          'какие', 'канал', 'касается', 'конечно,', 'которая', 'которое',
          'которой', 'которые', 'который', 'которых', 'кремле', 'кремля',
          'кроме', 'лет', 'лишь', 'медведев', 'многие', 'москве', 'назад', 'накануне',
          'нам', 'наша', 'нашей', 'наши', 'наших', 'несколько', 'но',
          'однако', 'однако,', 'одной', 'орт', 'очень', 'первый', 'пока',
          'поэтому', 'прежде', 'президент', 'президента', 'президентом',
          'президенту', 'премьер', 'премьер-министр', 'премьер-министра',
          'премьер-министром', 'премьера', 'просто', 'против', 'путин',
          'путин,', 'путина', 'путина,', 'путину', 'путиным', 'путиным.',
          'россией', 'россии', 'россии,', 'российские', 'российских',
          'российского', 'российской', 'россию', 'россия', 'рф', 'свое',
          'своего', 'своей', 'своем', 'свои', 'своим', 'своими', 'свой',
          'сегодня', 'смотрите', 'сразу', 'среди', 'так,', 'также',
          'такие', 'таким', 'тем', 'тех', 'то,', 'того,', 'том,', 'ходе',
          'хотя', 'целом', 'числе', 'что', 'эта', 'этим', 'этих', 'это',
          'этому', 'я', '–', '—']

# Drop stopwords and the words listed above. Both are merged into one set so
# that each word costs a single hash lookup instead of a scan of the list.
excluded_words = stopwords.union(todrop)
filtered_words = [word for word in cleaned_words if word not in excluded_words]

# Count occurrences and keep the 50 most frequent words
word_df = pd.DataFrame(filtered_words, columns=['word'])
word_counts = word_df['word'].value_counts().head(50).reset_index()
word_counts.columns = ['word', 'count']

# Display the top 50 words
topwords = list(word_counts['word'])
print(topwords)

In [None]:
# Appendix A7. Domestic stories that involve Vladimir Putin: most frequent words

# Domestic stories (country 'ru') that involve Putin, with the columns needed here
df1 = df[(df['putin'] == 1)&(df['country'] == 'ru')][['dates', 'text', 'putin']]

# For every story keep only the sentences (split on '.') that contain 'Путин'
sentences = [''.join([sentence + '.' for sentence in text.split('.') if 'Путин' in sentence]) for text in df1['text']]

# Split the kept sentences into words
allwords = list(itertools.chain.from_iterable([sentence.split() for sentence in sentences]))

# Lowercase and strip surrounding punctuation
cleaned_words = [word.lower().strip('.,:;«»"') for word in allwords]

# Drop stopwords, the extra words in `todrop` and the top words found in A6
# (`stopwords`, `todrop` and `topwords` come from the A6 cell). The three
# collections are merged into one set so that each word costs a single hash
# lookup instead of scanning two lists.
excluded_words = set(stopwords).union(todrop, topwords)
filtered_words = [word for word in cleaned_words if word not in excluded_words]

# Count occurrences and keep the 50 most frequent words
word_df = pd.DataFrame(filtered_words, columns=['word'])
word_counts = word_df['word'].value_counts().head(50).reset_index()
word_counts.columns = ['word', 'count']

# Display the top 50 words
topwrodsdomestic = list(word_counts['word'])[:50]
print(topwrodsdomestic)

In [None]:
# Appendix A8. Dictionaries

# (Execution time: approximately 9 minutes)

# Dictionary for 'Economics and business'
# Lowercase substrings; ' налог' starts with a space on purpose, so it only
# matches when a space precedes it.
_ECON_STEMS = (' налог', 'безработ', 'бизнес', 'бирж', 'валют',
               'газпром', 'занятост', 'импорт', 'инвест', 'инфляц',
               'капитал', 'миллиард', 'млрд', 'нефт', 'рефинансировани',
               'санкции', 'тариф', 'торги', 'финанс', 'фонд', 'экономи', 'экспорт')

def econ(text):
    """
    Return 1 if `text` contains any substring of the economics dictionary, else 0.

    The match is case-insensitive: the text is lowercased once and the search
    stops at the first substring found.
    """
    lowered = text.lower()
    return int(any(stem in lowered for stem in _ECON_STEMS))

# Dictionary for 'National security and Russian army'
# Lowercase substrings; entries with spaces (' танк ', 'вдв ') only match when
# those spaces are present in the text.
_ARMY_STEMS = (' танк ', 'армей', 'арми', 'артиллери', 'беспилотник',
               'бомбардировщ', 'ввс', 'вдв ', 'ветеран', 'воен', 'воин', 'впк',
               'генштаб', 'дивизи', 'зенитн', 'истребител', 'минобороны',
               'офицер', 'парад', 'пво', 'полигон', 'рвсн', 'солдат')

def army(text):
    """
    Return 1 if `text` contains any substring of the security/army dictionary, else 0.

    The match is case-insensitive: the text is lowercased once and the search
    stops at the first substring found.
    """
    lowered = text.lower()
    return int(any(stem in lowered for stem in _ARMY_STEMS))

# Dictionary for 'Social welfare, healthcare, and education'
# Lowercase substrings matched anywhere in the text
_SOCIAL_STEMS = ('больниц', 'вакцин', 'врач', 'егэ', 'жкх', 'заболеван', 'здравоохран',
                 'лекарств', 'медик', 'медиц', 'образовани', 'пациент', 'педагог','пенси',
                 'поликлин', 'препарат', 'социальн', 'стипенд', 'студент', 'учител')

def social(text):
    """
    Return 1 if `text` contains any substring of the social dictionary, else 0.

    The match is case-insensitive: the text is lowercased once and the search
    stops at the first substring found.
    """
    lowered = text.lower()
    return int(any(stem in lowered for stem in _SOCIAL_STEMS))

# Dictionary for 'Cultural, religious, and sporting events' 
# Lowercase substrings; entries with spaces (' кино', 'парад ') only match when
# those spaces are present in the text.
# NOTE(review): 'феставал' looks like a typo for 'фестивал'. It is kept as is
# because correcting it would change which stories are labelled and therefore
# the published counts -- confirm before changing.
_CEREM_STEMS = (' кино', 'богослужен', 'искусств', 'концерт',
                'кубок', 'культур', 'литератур', 'матч', 'музе', 'награжд',
                'олимпи', 'парад ', 'православ', 'праздн', 'режиссер', 'сборная',
                'соревнован', 'спортив', 'театр', 'турнир',
                'феставал', 'фильм', 'худож',
                'церемон', 'чемпион', 'шоу', 'юбиле')

def cerem(text):
    """
    Return 1 if `text` contains any substring of the culture/religion/sport dictionary, else 0.

    The match is case-insensitive: the text is lowercased once and the search
    stops at the first substring found.
    """
    lowered = text.lower()
    return int(any(stem in lowered for stem in _CEREM_STEMS))

# Add one 0/1 column per dictionary: 1 when the story text matches the dictionary

dictionary_functions = (('econ', econ), ('army', army), ('social', social), ('cerem', cerem))
for column_name, matcher in dictionary_functions:
    df[column_name] = [matcher(story_text) for story_text in df['text']]

# Collapse a count into a 0/1 flag
def change_value(x):
    """Return 1 when x is strictly greater than 0, otherwise 0."""
    if x > 0:
        return 1
    return 0

# 'covered' equals 1 when the text matches at least one of the four dictionaries

# Number of dictionaries matched by each story
dictionary_hits = df['econ'] + df['army'] + df['social'] + df['cerem']
df['covered'] = dictionary_hits
# Turn the number of matches into a 0/1 flag
df['covered'] = df['covered'].apply(change_value)

In [None]:
# Appendix A8. Dictionaries
# Table in A8

# Random sample of stories used to estimate the accuracy of the dictionaries

# Domestic stories (country 'ru') that mention Putin
is_domestic_putin = (df['putin'] == 1) & (df['country'] == 'ru')
dfp = df[is_domestic_putin]

# 50 stories drawn with a fixed seed so that the draw is reproducible,
# restricted to the columns needed for validation
accuracy_columns = ['dates', 'text', 'econ', 'army', 'social', 'cerem', 'covered']
sample = dfp.sample(n = 50, random_state = 42)
sample = sample[accuracy_columns]

# Display the first few rows
sample.head()

# The code below can be used to save the sample to .csv
# sample.to_csv("insert a path here")

# Compare the sampled texts with the texts in `ad`
# NOTE(review): `ad` is defined outside this cell; presumably the accuracy
# sheet loaded from the validation file -- confirm.
print("Same as validation file?", list(ad['text']) == list(sample['text']))

In [None]:
# Appendix A8. Dictionaries
# Table in A8

# Check that, for every category, both coders agree with each other and with
# the dictionary result (full accuracy),
# referred as '1. Accuracy (all dictionaries) that equals to 1'
# c1_* columns are results from coder 1, c2_* columns are from coder 2

categories = ['econ', 'army', 'social', 'cerem', 'covered']
conditions = [
    (ad[f'c1_{category}'] == ad[f'c2_{category}']) & (ad[f'c1_{category}'] == ad[category])
    for category in categories
]

# True only when every row satisfies every condition
all_conditions = all(condition.all() for condition in conditions)

# Print the result
print("All conditions are True:", all_conditions)

In [None]:
# Appendix A8. Dictionaries
# Table in A8

# Randomly selecting stories to estimate the precision of the dictionaries

# This sample will be used in A8 in the appendix -- precision

# Function to sample entries based on a column filter
def sample_entries(df, column, n = 20, random_state = 42):
    """
    Randomly sample rows whose value in `column` equals 1.

    Args:
    df (pd.DataFrame): Frame to sample from.
    column (str): Name of the column used as the filter.
    n (int): Number of rows to draw (default 20).
    random_state (int): Seed passed to DataFrame.sample so that the draw is
        reproducible (default 42).
    Returns:
    pd.DataFrame: The `n` sampled rows.
    Raises:
    ValueError: Raised by DataFrame.sample when fewer than `n` rows match.
    """
    filtered_df = df[df[column] == 1]  # Keep rows where the column value is 1
    return filtered_df.sample(n, random_state = random_state)

# One sample per dictionary, drawn from dfp (taken from the notebook state)
precision_samples = [sample_entries(dfp, column) for column in ('econ', 'army', 'social', 'cerem')]
econ_sample, army_sample, social_sample, cerem_sample = precision_samples

# Stack the four samples vertically with a fresh index
dct_prec = pd.concat(precision_samples, ignore_index=True)
# Keep only the columns needed to estimate precision
precision_columns = ['dates', 'text', 'econ', 'army', 'social', 'cerem']
sample = dct_prec[precision_columns]

# Display the first few rows
sample.head()

# Compare the sampled texts with the texts in `pdi`
# NOTE(review): `pdi` is defined outside this cell; presumably the precision
# sheet loaded from the validation file -- confirm.
print("Same as validation file?", list(pdi['text']) == list(sample['text']))

In [None]:
# Appendix A8. Dictionaries
# Table in A8

# Check that the two coders agree with each other on every row, for every dictionary
# c1_* columns are results from coder 1, c2_* columns are from coder 2

conditions = [
    pdi[f'c1_{category}'] == pdi[f'c2_{category}']
    for category in ('econ', 'army', 'social', 'cerem')
]

# True only when every row satisfies every condition
all_conditions = all(condition.all() for condition in conditions)

# Print the result
print("All conditions are True:", all_conditions)

In [None]:
# Appendix A8. Dictionaries
# Table in A8

# Dictionaries to evaluate
dictionaries = ['econ', 'army', 'social', 'cerem']

# Precision of a dictionary: among the rows sampled for that dictionary, the
# share where the dictionary result equals the label given by coder 1
for dictionary in dictionaries:
    subset = pdi[pdi['dictionary'] == dictionary]
    agrees_with_coder1 = subset[dictionary] == subset[f'c1_{dictionary}']
    precision = agrees_with_coder1.mean()
    print(f"Precision {dictionary}, {precision:.2f}")

In [None]:
# Text in the article (section 'Subjects of the Stories')

# " Although the themes frequently overlapped and certain stories did not cover any of the themes,
#   this approach helped me label 84% of domestic stories that mention the ruler "

# Domestic stories (country 'ru') that mention Putin
dfp = df[(df['putin'] == 1) & (df['country'] == 'ru')]

# Share of them matched by at least one dictionary, rounded to 2 decimals
n_covered = dfp['covered'].sum()
n_domestic_putin = dfp.shape[0]
result = round(n_covered / n_domestic_putin, 2)
print(result)

In [None]:
# Appendix A9. Counts of mentions of other politicians: co-opted parties and opposition
# (Execution time: approximately 5 minutes)

def count_name_occurrences(name, texts=None):
    """
    Count how many texts contain `name` as a literal substring.

    Each text is counted at most once, no matter how often the name occurs in it.

    Parameters
    ----------
    name : str
        Substring to look for. It is matched literally (not as a regular
        expression) and case-sensitively.
    texts : pandas.Series of str, optional
        Texts to search. Defaults to the 'text' column of the global DataFrame `df`,
        so existing calls of the form count_name_occurrences(name) keep working.

    Returns
    -------
    Integer number of texts that contain `name`. Missing texts (NaN/None)
    count as "not containing" instead of raising an error.
    """
    if texts is None:
        texts = df['text']
    # regex=False: plain substring test, so characters such as '.' or '-' stay literal.
    # na=False: a missing text yields False rather than NaN (or a TypeError).
    return texts.str.contains(name, regex=False, na=False).sum()

# Surnames (or surname stems, so that inflected forms also match) to count in the corpus.
# NOTE(review): the original list contained 'Бастыркин', a transposition of the surname
# 'Бастрыкин'; the spelling is corrected here. This changes the count printed for that
# entry -- confirm against the published Appendix A9 table.
names = ['Путин', 'Абрамович', 'Авен', 'Аксенов', 'Аксёнов', 'Алекперов',
         'Багапш', 'Бастрыкин', 'Батурин', 'Белавенцев',
         'Березовск', 'Бородае', 'Бородай', 'Бородаю', 'Бородая',
         'Бортников', 'Верзилов', 'Володин', 'Волошин', 'Герасимов',
         'Гиркин', 'Глазьев', 'Голодец', 'Греф', 'Грызлов', 'Гудков',
         'Дворкович', 'Делимханов', 'Дерипаск', 'Евтушенков', 'Жданов',
         'Жириновск', 'Золотов', 'Зубков', 'Зюганов', 'Илларионов',
         'Кадыров', 'Кара-Мурз', 'Каспаров', 'Касьянов',
         'Ковальчук', 'Кудрин', 'Лавров',
         'Лесин', 'Лимонов', 'Литвиненко', 'Лужков', 'Медведев',
         'Миллер', 'Навальн', 'Нарышкин', 'Немцов', 'Новодворск',
         'Патрушев', 'Песков', 'Полтавченко', 'Потанин', 'Примаков',
         'Прохоров', 'Ресин', 'Ройзман', 'Ролдугин', 'Ротенберг',
         'Сердюков', 'Сечин', 'Силуанов', 'Соболь', 'Собчак',
         'Собянин', 'Стрелков', 'Сурков', 'Тимченко',
         'Толоконников', 'Удальцов', 'Улюкаев', 'Устинов',
         'Фрадков', 'Фридман', 'Ходорковск', 'Чайка',
         'Чайке', 'Чайки', 'Чайкой', 'Чемезов', 'Черкесов',
         'Чубайс', 'Чуров', 'Шевкунов', 'Шлосберg'.replace('g', 'г'), 'Шойгу',
         'Шувалов', 'Эрнст', 'Якименко', 'Якунин', 'Ярмыш']

# Print one "<name> : <number of texts containing it>" line per entry
for name in names:
    print(name, ":", count_name_occurrences(name))

In [None]:
# Appendix A10. Themes (topics) in domestic news that refer to Vladimir Putin

# Keep rows flagged putin == 1 whose country code is 'ru'
a10 = df[(df['putin'] == 1) & (df['country'] == 'ru')]

# Apply the dictionaries to each term separately.
# Each printed row is: term, then the column sums in the header's order
# (econ, social, army, cerem), then the number of rows in that term.

print('Econ', 'Social', 'Security','Culture', 'Total')
theme_columns = ('econ', 'social', 'army', 'cerem')
for term in [1, 2, 'Medvedev', 3, 4]:
    dat = a10[a10['term'] == term]
    theme_counts = [dat[column].sum() for column in theme_columns]
    print(term, *theme_counts, dat.shape[0])

In [None]:
# The code below can be used to save the corpus into feather format
# this was required to make the file readable in R
# the newsmap_training.R uses corpus.feather as an input file

# Leave only dates and text as 'country' will be assigned by Newsmap.
# reset_index(drop=True) returns a new, independent frame (so editing 'text' below
# neither raises SettingWithCopyWarning nor touches df) and guarantees the default
# 0..n-1 index that to_feather requires.
df1 = df[['dates', 'text']].reset_index(drop=True)

# Drop 'Putin' from the text, as 'Putin' is not supposed to be a predictor of country-topic for the classifier.
# Vectorized literal replacement (regex=False) instead of a row-by-row .loc loop, which
# depended on the index labels being exactly 0..n-1.
df1['text'] = df1['text'].str.replace('Путин', '', regex=False)

# Get the current working directory
current_directory = os.getcwd()

# Construct the path to the corpus.feather file
corpus_feather_path = os.path.join(current_directory, "corpus.feather")
#print(corpus_feather_path)

# Save the file
df1.to_feather(corpus_feather_path)