In [None]:
import wikipediaapi as wp
import re
import os
import nltk
from nltk import stem
from nltk.corpus import stopwords
import string
import json
import time
import pandas as pd
import matplotlib.pyplot as plt

from src.etl.get_anames import retrieve_anames
from src.libcode import txt_to_list, list_to_txt

In [None]:
# Wikipedia API client; "en" is passed as the first positional argument.
# NOTE(review): recent wikipediaapi releases expect a user agent string as the
# first argument - confirm against the installed version.
enwp = wp.Wikipedia("en")
# Presumably the list of article names to analyse, as returned by the
# project's ETL helper - verify against src.etl.get_anames.
anames = retrieve_anames()

In [None]:
# Build {phrase: score} from every phrase file found in pp_dir.
# Each file is read as one header line followed by "phrase|score" rows.
pp_dir = "src/data/init/partisan_phrases/"
# Sorted so that, when a phrase occurs in more than one file, the score that
# wins does not depend on the arbitrary order os.listdir returns.  Anything
# that is not a regular file (e.g. a checkpoint directory) is skipped.
pp_txts = sorted(
    fname for fname in os.listdir(pp_dir)
    if os.path.isfile(os.path.join(pp_dir, fname))
)
score_dict = {}
for fname in pp_txts:
    with open(os.path.join(pp_dir, fname)) as phrase_file:
        # Skip the header line (no error if the file is empty).
        next(phrase_file, None)
        for line in phrase_file:
            # Blank lines (typically a trailing newline) carry no data.
            if not line.strip():
                continue
            splt = line.split("|")
            score_dict[splt[0]] = float(splt[1].strip())

In [None]:
# num_per_text = len(anames)//10
# wikitxts_dir = "src/data/temp/wiki_txts/"

# # If wiki texts folder does not exist make it
# if not os.path.exists(wikitxts_dir):
#     os.makedirs(wikitxts_dir)

# txtlst = []
# for ind, aname in enumerate(anames):
#     # Get the page text
# #     print(ind, aname)
#     curpg = enwp.page(aname)
#     curtit = curpg.title
#     curtxt = curpg.text
#     txtlst.append(curtit)
#     txtlst.append(curtxt)
#     # This ensures it saves into 10 txt files
#     if (ind+1) % num_per_text == 0:
#         print(ind+1)
#         curtxt_name = "art_pages" + str((ind+1)//num_per_text) + ".txt"
#         list_to_txt(wikitxts_dir+curtxt_name, txtlst)
#         txtlst = []
#         time.sleep(10)
        
# # Save last set of articles
# if len(txtlst) > 0:
#     curtxt_name = "art_pages10" + ".txt"
#     list_to_txt(wikitxts_dir+curtxt_name, txtlst)

In [None]:
# Phrase/score pairs ordered from the lowest score to the highest.
sorted(score_dict.items(), key=lambda pair: pair[1])

In [None]:
# One-time NLTK corpus downloads; uncomment on a fresh environment.
# nltk.download("wordnet")
# nltk.download("stopwords")

# English stopword list and Porter stemmer, both used by preproc_strn.
stpwrds = stopwords.words("english")
porter = stem.PorterStemmer()

def preproc_strn(strn):
    """Normalise text for phrase matching.

    Lowercases ``strn``, deletes ASCII punctuation and digits, drops English
    stopwords and Porter-stems every remaining word.

    Args:
        strn: Raw text to process (e.g. an article body or title).

    Returns:
        A ``(text, numwords)`` tuple: the surviving stems joined by single
        spaces, and the number of stems.
    """
    drop_chars = str.maketrans('', '', string.punctuation + string.digits)
    # A set gives constant-time membership tests instead of scanning the list
    # once per word.
    stop_set = set(stpwrds)

    plst = []
    # split() with no argument breaks on any run of whitespace, so newlines
    # and repeated spaces need no separate clean-up pass.
    for token in strn.lower().split():
        word = token.translate(drop_chars)
        # Tokens made only of punctuation/digits vanish entirely.
        if not word or word in stop_set:
            continue
        # Contractions such as "don't" appear in the stopword list with their
        # apostrophe.  Once punctuation is removed they read "dont" and would
        # slip through, so test the token with only its surrounding
        # punctuation trimmed as well.
        if token.strip(string.punctuation) in stop_set:
            continue
        plst.append(porter.stem(word))

    return (' '.join(plst), len(plst))

def _count_phrase(text, phrase):
    """Count non-overlapping whole-word occurrences of ``phrase`` in ``text``.

    ``text`` is expected to be words separated by single spaces. A hit only
    counts when it starts at the beginning of a word and ends at the end of
    one, so "war iraq" is not counted inside "postwar iraq".
    """
    if not phrase:
        # str.find("") matches everywhere; an empty phrase has no occurrences.
        return 0
    hits = 0
    pos = text.find(phrase)
    while pos != -1:
        end = pos + len(phrase)
        starts_word = pos == 0 or text[pos - 1] == ' '
        ends_word = end == len(text) or text[end] == ' '
        if starts_word and ends_word:
            hits += 1
            pos = text.find(phrase, end)
        else:
            pos = text.find(phrase, pos + 1)
    return hits


def string_score(strn, score_dict):
    """Score a text against a dictionary of scored phrases.

    The text is first normalised with ``preproc_strn``. Every phrase in
    ``score_dict`` is then counted as a whole-word match in the normalised
    text and weighted by its score.

    Args:
        strn: Raw text to score.
        score_dict: Mapping of phrase -> numeric score.

    Returns:
        A list ``[absscore, sumscore, numwords, counts_list, totphrs]``:
            absscore: sum of ``abs(count * score)`` over all phrases.
            sumscore: sum of ``count * score`` over all phrases.
            numwords: number of words left after pre-processing.
            counts_list: up to 10 ``(phrase, (count, score))`` pairs, sorted
                descending by ``(count, score)``.
            totphrs: total number of phrase occurrences found.
    """
    # Pre-process, return the processed string and the number of words
    curstr, numwords = preproc_strn(strn)

    # Absolute bias sum
    absscore = 0
    # Bias sum
    sumscore = 0
    # Total number of occurrences of phrases from G&S
    totphrs = 0

    # Per-phrase (count, score); trimmed to the top 10 before returning
    counts_dict = {}

    for key, value in score_dict.items():
        # Whole-word matching: a plain substring count would also find the
        # phrase inside longer words.  Surrounding whitespace on the key is
        # ignored for matching only; the key itself is reported unchanged.
        numoccurs = _count_phrase(curstr, key.strip())
        totphrs += numoccurs
        counts_dict[key] = (numoccurs, value)
        curscore = numoccurs * value
        absscore += abs(curscore)
        sumscore += curscore

    counts_list = sorted(counts_dict.items(), key=lambda item: item[1], reverse=True)[:10]
    return [absscore, sumscore, numwords, counts_list, totphrs]

In [None]:
# Score every stored article.  Entries are expected to alternate between an
# article title and its text, so the parity of a running counter tells which
# of the two the current entry is.
wikitxts_dir = "src/data/temp/wiki_txts/"
wiki_txts = ["art_pages{}.txt".format(num) for num in range(1, 11)]
namestat_dict = {}

cnt = 0
for txt in wiki_txts:
    # Progress: number of entries consumed before this file.
    print(cnt)
    txtlst = txt_to_list(wikitxts_dir + txt)
    for item in txtlst:
        if cnt % 2:
            # Odd position: text belonging to the title seen just before.
            curres = string_score(item, score_dict)
            namestat_dict[aname] = curres
        else:
            # Even position: remember the title for the next entry.
            aname = item
        cnt += 1

In [None]:
# Display the results: title -> [absscore, sumscore, numwords, counts_list, totphrs].
namestat_dict

In [None]:
# Print each article's top phrases and record whether any of them occurs in
# the pre-processed article title.
for name, stat in namestat_dict.items():
    print(name + ":")
    procname = preproc_strn(name)[0]
    is_intitle = False
    # stat[3] is the counts_list from string_score: (phrase, (count, score)) pairs.
    for dispcnt, (phr, freq) in enumerate(stat[3], start=1):
        if phr in procname:
            is_intitle = True
        print(str(dispcnt) + ".    " + phr + " - " + str(freq))

    # Write the flag into slot 5 instead of appending: re-running this cell
    # then overwrites the previous flag rather than growing the list, which
    # would break the six-column DataFrame built from these lists.
    stat[5:] = [is_intitle]

    print("__________________________")
    print()

# TODO: work out how best to use the phrase frequencies. So far the only
# derived feature is a boolean for "one of the top 10 phrases appears in the title".

In [None]:
# One row per article; reset_index moves the article title out of the index
# into a regular "index" column.
namestat_df = pd.DataFrame.from_dict(
    namestat_dict,
    orient="index",
    columns=[
        "absscore",
        "sumscore",
        "numwords",
        "counts_list",
        "totphrs",
        "is_intitle",
    ],
).reset_index()

def make_columns(df):
    """Add length-normalised scores and per-row ranks to ``df``.

    Columns added (``df`` is modified in place and also returned):
        abs_by_num, sum_by_num: ``absscore`` / ``sumscore`` divided by
            ``numwords``.
        abs_rank, sum_rank, abn_rank, sbn_rank: 0-based rank of each row when
            ordered descending by ``absscore``, ``sumscore``, ``abs_by_num``
            and ``sum_by_num`` respectively (0 = highest value).
        abs_rank_diff, sum_rank_diff: raw rank minus length-normalised rank
            for the same row.

    Args:
        df: DataFrame with numeric ``absscore``, ``sumscore`` and
            ``numwords`` columns.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    df["abs_by_num"] = df["absscore"] / df["numwords"]
    df["sum_by_num"] = df["sumscore"] / df["numwords"]

    def rank_desc(col):
        # rank() returns a Series aligned with df's own index, so each row
        # receives its own rank.  Sorting and then resetting the index would
        # instead list which row sits at each sorted position.
        # method="first" breaks ties by row order; NaN (0 / 0 words) goes last.
        ranks = df[col].rank(method="first", ascending=False, na_option="bottom")
        return ranks.astype(int) - 1

    abs_rank = rank_desc("absscore")
    sum_rank = rank_desc("sumscore")
    abn_rank = rank_desc("abs_by_num")
    sbn_rank = rank_desc("sum_by_num")

    df["abs_rank_diff"] = abs_rank - abn_rank
    df["sum_rank_diff"] = sum_rank - sbn_rank
    df["abs_rank"] = abs_rank
    df["sum_rank"] = sum_rank
    df["abn_rank"] = abn_rank
    df["sbn_rank"] = sbn_rank
    return df
    
    
    
    
    
# make_columns adds its columns to namestat_df in place and returns it, so
# nsdf refers to the same DataFrame object as namestat_df.
nsdf = make_columns(namestat_df)

In [None]:
# Correlation between |sumscore| and article length in words.
nsdf["sumscore"].abs().corr(nsdf["numwords"])

In [None]:
# Correlation between article length in words and absscore.
nsdf.loc[:, "numwords"].corr(nsdf.loc[:, "absscore"])

In [None]:
# Share of articles whose is_intitle flag is True.
nsdf.loc[:, "is_intitle"].mean()

In [None]:
# Histogram of the phrase scores with log-scaled counts, written to bias_hist.png.
score_series = pd.DataFrame.from_dict(score_dict, orient="index", columns = ["score"])["score"]
hist_ax = score_series.hist(bins=1000)
hist_ax.set_xlim([-200, 200])
hist_ax.set_yscale("log")
hist_ax.set_xlabel("Bias scores")
hist_ax.get_figure().savefig('bias_hist.png')

In [None]:
# Scatter of absscore against article length, written to absnum_scat.png.
absnum_ax = nsdf[["absscore","numwords"]].plot.scatter(x="numwords", y="absscore")
absnum_ax.set_xlabel("Number of words")
absnum_ax.set_ylabel("Absolute score sum")
absnum_ax.get_figure().savefig('absnum_scat.png')

In [None]:
# Scatter of |sumscore| against article length, written to sum_scat.png.
# The absolute value is stored on nsdf as a new "sumscoreabs" column.
nsdf["sumscoreabs"] = nsdf["sumscore"].abs()
sum_ax = nsdf[["sumscoreabs","numwords"]].plot.scatter(x="numwords", y="sumscoreabs")
sum_ax.set_xlabel("Number of words")
sum_ax.set_ylabel("Score sum")
sum_ax.get_figure().savefig('sum_scat.png')