In [6]:
import re
import csv
import sys
import pandas as pd
import datetime
import pytz
from tqdm import tqdm
from langdetect import detect
from decouple import Config, RepositoryEnv

from src.file_utils import get_valid_filepaths, get_df, add_to_log, get_encoding, clean_table_cols

config = Config(RepositoryEnv('./.env'))
BASE_PATH = config.get('BASE_PATH')

csv.field_size_limit(sys.maxsize)

9223372036854775807

In [7]:
# altnews counts
def fill_altnews():  
    regex_lang = r"\/AlternativeMedia\/([^\/]*)"
    regex_subplat = r"fi_(enc_)?([^_]+)_"
    
    alt_news_path= "".join([BASE_PATH, "0_Full_Data_Classified/AlternativeMedia/"])
    all_files = get_valid_filepaths(alt_news_path)

    for filepath in tqdm(all_files, total=len(all_files)):
        subplatform = re.search(regex_subplat, filepath).group(2)
        df = get_df(filepath)
        clean_df = clean_table_cols(df)
        clean_df['lang'] = re.search(regex_lang, filepath).group(1)
        clean_df['subplatform'] = subplatform

        if "compact" in filepath:
            clean_df['formatted_title'] = clean_df['title'].str.lower().str.replace(' ', '-')
            clean_df['url'] = clean_df['date'] + '/' + df['formatted_title']
        
        red_df = clean_df[['date','url','lang','label_GR','v2_NWO']]
        red_df = red_df.dropna(subset=['date'])
        df_deduplicated = red_df.drop_duplicates(subset=['url'])
        grouped = df_deduplicated.groupby(['date', 'lang', 'label_GR','v2_NWO'])
        
        count_df = grouped.size().reset_index(name='counts')
        if count_df.shape[0] != 0:
            count_df["platform"] = "altnews"
            count_df.to_csv(f'./group_counts/altnews/{subplatform}.csv', index=False)  

In [8]:
fill_altnews()

100%|█████████████████████████| 14/14 [00:02<00:00,  4.77it/s]


In [9]:
# legnews counts

def fill_legnews():  
    def get_subplatform(row):
        if ('ArticleID') not in row or (pd.isna(row['ArticleID'])):
            return row['media']
        else: # if faz
            return 'faz'
        
    regex_lang = r"leg_media_([^_]*)"
    
    leg_news_path = "".join([BASE_PATH, "0_Full_Data_Classified/LegacyMedia/"])
    all_files = get_valid_filepaths(leg_news_path)


    for filepath in tqdm(all_files, total=len(all_files)):

        ids_seen = []
        
        language = "ger" if re.search(regex_lang, filepath).group(1) == "de" else "eng"
        first_row = pd.read_csv(filepath, nrows=2)
            
        chunksize = 10 ** 4
        enc = get_encoding(filepath)
        chunk_num = 1
        with pd.read_csv(filepath, chunksize=chunksize, encoding=enc, low_memory=False) as reader:
            for chunk in reader:

                # apply preprocessing to chunk
                chunk['id'] = chunk.id.replace(r'Dokument', '', regex=True)
                clean_chunk = clean_table_cols(chunk)
                clean_chunk['lang'] = language
                clean_chunk['subplatform'] = clean_chunk.apply(get_subplatform, axis=1)
                red_df = clean_chunk[['time','id','lang','label_GR','v2_NWO','subplatform']]
                red_df = red_df.dropna(subset=['time'])
                df_deduplicated = red_df.drop_duplicates(subset=['id'])
                df_deduplicated = df_deduplicated[~df_deduplicated['id'].isin(ids_seen)]
                
                grouped = df_deduplicated.groupby(['time', 'lang', 'label_GR','v2_NWO', 'subplatform'])
                count_df = grouped.size().reset_index(name='counts')
                count_df.rename(columns={'time': 'date'}, inplace=True)

                if count_df.shape[0] != 0:
                    count_df["platform"] = "legnews"
                    count_df.to_csv(f'./group_counts/legnews/{language}_chunk{chunk_num}.csv', index=False) 
                    chunk_num = chunk_num + 1

                ids_seen.append(df_deduplicated.id.tolist())

In [10]:
fill_legnews()

100%|███████████████████████████| 2/2 [00:08<00:00,  4.42s/it]


In [28]:
# 4chan counts
def fill_4chan():
    def transform_num_timestamp(row):
        berlin_tz = pytz.timezone('Europe/Berlin')
        berlin_time = datetime.datetime.fromtimestamp(row['timestamp'], berlin_tz)

        return berlin_time.strftime('%Y-%m-%d')
        
    filepath = "".join([BASE_PATH, "0_Full_Data_Classified/4chan/classified_fi_4chan_all_data_prepro.csv"])

    ids_seen = []
    
    chunksize = 10 ** 4
    enc = get_encoding(filepath)

    chunk_num = 1
    with pd.read_csv(filepath, chunksize=chunksize, encoding=enc, low_memory=False) as reader:
        for chunk in reader:

            # apply preprocessing to chunk
            clean_chunk = clean_table_cols(chunk)
            clean_chunk['lang'] = "eng"
            clean_chunk['id'] = clean_chunk['ï»¿thread_id'] + clean_chunk['doc_id'] + clean_chunk['num']
            red_df = clean_chunk[['timestamp','id','lang','label_GR','v2_NWO']]
            red_df['date'] = red_df.apply(transform_num_timestamp, axis=1)            
            red_df = red_df.dropna(subset=['date'])
            df_deduplicated = red_df.drop_duplicates(subset=['id'])
            df_deduplicated = df_deduplicated[~df_deduplicated['id'].isin(ids_seen)]
            
            grouped = df_deduplicated.groupby(['date', 'lang', 'label_GR','v2_NWO'])
            count_df = grouped.size().reset_index(name='counts')

            if count_df.shape[0] != 0:
                count_df["platform"] = "4chan"
                count_df.to_csv(f'./group_counts/4chan/pol_chunk{chunk_num}.csv', index=False) 
                chunk_num = chunk_num + 1

            ids_seen.append(df_deduplicated.id.tolist())

In [29]:
fill_4chan()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  red_df['date'] = red_df.apply(transform_num_timestamp, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  red_df['date'] = red_df.apply(transform_num_timestamp, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  red_df['date'] = red_df.apply(transform_num_timestamp, axis=1)
A value is tryin

ValueError: Invalid value NaN (not a number)

In [13]:
# reddit counts
def fill_reddit():         
    def create_reddit_url(row):
        return "".join(["reddit.com/r/", str(row['subreddit']), "/comments/", str(row['parent_id']), "/comment/", str(row['id'])])
    
    regex_lang = r"_reddit_([^_]*)"
    
    leg_news_path = "".join([BASE_PATH, "0_Full_Data_Classified/Reddit/"])
    all_files = get_valid_filepaths(leg_news_path)

    ids_seen = []

    chunk_num_ger = 0 
    chunk_num_eng = 0 
    
    for filepath in tqdm(all_files, total=len(all_files)):
        language = "ger" if re.search(regex_lang, filepath).group(1) == "de" else "eng"

        chunksize = 10 ** 4
        enc = get_encoding(filepath)

        try:
            with pd.read_csv(filepath, chunksize=chunksize, encoding=enc, low_memory=False, on_bad_lines="warn") as reader:
                for chunk in reader:
    
                    # apply preprocessing to chunk
                    clean_chunk = clean_table_cols(chunk)
                    clean_chunk['lang'] = language
                    clean_chunk['url'] = clean_chunk.apply(create_reddit_url, axis =1)
                    clean_chunk['date'] = clean_chunk['time_utc'].str.extract(r'(\d{4}-\d{2}-\d{2})')
                    red_df = clean_chunk[['date','url','lang','label_GR','v2_NWO']]
                    red_df = red_df.dropna(subset=['date'])
                    df_deduplicated = red_df.drop_duplicates(subset=['url'])
                    df_deduplicated = df_deduplicated[~df_deduplicated['url'].isin(ids_seen)]
                    
                    grouped = df_deduplicated.groupby(['date', 'lang', 'label_GR','v2_NWO'])
                    count_df = grouped.size().reset_index(name='counts')
                    
                    if count_df.shape[0] != 0:
                        count_df["platform"] = "reddit"
                        if language == "ger":
                            count_df.to_csv(f'./group_counts/reddit/{language}_{chunk_num_ger}.csv', index=False) 
                            chunk_num_ger = chunk_num_ger + 1
                        else:
                            count_df.to_csv(f'./group_counts/reddit/{language}_{chunk_num_eng}.csv', index=False) 
                            chunk_num_eng = chunk_num_eng + 1
            
                        ids_seen.append(df_deduplicated.url.tolist())
        except Exception as e:
            print(f"Error processing file {filepath}: {e}")

In [14]:
fill_reddit()

 29%|███████▋                   | 2/7 [00:34<01:40, 20.03s/it]

Error processing file /media/lks/My Book/0_Full_Data_Classified/Reddit/classified_fi_reddit_en_fi_condensed.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.



 71%|███████████████████▎       | 5/7 [01:09<00:25, 12.96s/it]

Error processing file /media/lks/My Book/0_Full_Data_Classified/Reddit/classified_fi_2_reddit_en_fi_condensed.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.



 86%|███████████████████████▏   | 6/7 [01:20<00:12, 12.02s/it]

Error processing file /media/lks/My Book/0_Full_Data_Classified/Reddit/classified_fi_3_reddit_en_fi_condensed.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.



100%|███████████████████████████| 7/7 [01:34<00:00, 13.49s/it]


In [15]:
# tweet counts
def fill_twitter():  
    def detect_lang(text):
        language_mapping = {"en": "eng","de": "ger"}
        lang_code = detect(text)
        try:
            return language_mapping[lang_code]
        except:
            return lang_code

    regex_lang = r"TwitterTweets\/([^\/]*)"
    
    tweets_path= "".join([BASE_PATH, "0_Full_Data_Classified/TwitterTweets/"])
    all_files = get_valid_filepaths(tweets_path)

    ids_seen = []

    chunk_num_ger = 0 
    chunk_num_eng = 0 
    
    for filepath in tqdm(all_files, total=len(all_files)):
        language = "ger" if re.search(regex_lang, filepath).group(1) == "ger" else "eng"

        chunksize = 10 ** 4
        enc = get_encoding(filepath)

        try:
            with pd.read_csv(filepath, chunksize=chunksize, encoding=enc, engine='python') as reader:
                for chunk in reader:
    
                    # apply preprocessing to chunk
                    clean_chunk = clean_table_cols(chunk)
                    clean_chunk['lang'] = language
                    clean_chunk['date'] = clean_chunk['time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
                    red_df = clean_chunk[['date','id','lang','label_GR','v2_NWO']]
                    red_df = red_df.dropna(subset=['date'])
                    df_deduplicated = red_df.drop_duplicates(subset=['id'])
                    df_deduplicated = df_deduplicated[~df_deduplicated['id'].isin(ids_seen)]
                    
                    grouped = df_deduplicated.groupby(['date', 'lang', 'label_GR','v2_NWO'])
                    count_df = grouped.size().reset_index(name='counts')

                    if count_df.shape[0] != 0:
                        count_df["platform"] = "twitter"
                        if language == "ger":
                            count_df.to_csv(f'./group_counts/twitter/{language}_{chunk_num_ger}.csv', index=False) 
                            chunk_num_ger = chunk_num_ger + 1
                        else:
                            count_df.to_csv(f'./group_counts/twitter/{language}_{chunk_num_eng}.csv', index=False) 
                            chunk_num_eng = chunk_num_eng + 1
        
                        ids_seen.append(df_deduplicated.id.tolist())
        except Exception as e:
            print(f"Error processing file {filepath}: {e}")


In [16]:
fill_twitter()

100%|█████████████████████████| 72/72 [07:13<00:00,  6.02s/it]


100%|████████████████████| 4112/4112 [00:26<00:00, 154.81it/s]


In [35]:
def summarise_counts():
    tables_path = ("./group_counts/")
    all_files = get_valid_filepaths(tables_path)
    summary = pd.DataFrame()
    for filepath in tqdm(all_files, total = len(all_files)):
        temp_df = pd.read_csv(filepath)
        summary = pd.concat([summary, temp_df])
    result = summary.groupby(['date', 'lang', 'label_GR', 'v2_NWO', 'platform'], as_index=False).agg({'counts': 'sum'})
    result.to_csv("./group_counts/abs_counts.csv", index=False)

Unnamed: 0,date,lang,label_GR,v2_NWO,counts,platform,subplatform
0,2011-02-24,eng,0.0,0.0,242,twitter,
1,2011-02-24,eng,0.0,1.0,21,twitter,
2,2011-02-24,eng,1.0,0.0,12,twitter,
3,2011-02-25,eng,0.0,0.0,396,twitter,
4,2011-02-25,eng,0.0,1.0,38,twitter,
...,...,...,...,...,...,...,...
5109,2021-12-30,eng,1.0,0.0,20,4chan,
5110,2021-12-30,eng,1.0,1.0,10,4chan,
5111,2021-12-31,eng,0.0,0.0,1,4chan,
5112,2021-12-31,eng,1.0,0.0,10,4chan,


In [None]:
summarise_counts()

In [None]:
    print("Processing LIWC...")
    path_liwc = "".join([BASE_PATH, "/3_EN_culturepaper_LIWC/kilian_testweeks_r1full_r2call_en_fi_prep_new_liwc.csv"])
    liwc_chunk = 1
    
    try:
        for chunk in pd.read_csv(path_liwc, chunksize=10 ** 4):
            clean_chunk = clean_table_cols(chunk)
            clean_chunk['lang'] = clean_chunk['text'].apply(detect_lang)
            clean_chunk['date'] = clean_chunk['time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
            clean_chunk['label_GR'] = None
            clean_chunk['v2_NWO'] = None
            red_df = clean_chunk[['date','id','lang','label_GR','v2_NWO']]
            red_df = red_df.dropna(subset=['date'])
            df_deduplicated = red_df.drop_duplicates(subset=['id'])
            df_deduplicated = df_deduplicated[~df_deduplicated['id'].isin(ids_seen)]
            
            grouped = df_deduplicated.groupby(['date', 'lang', 'label_GR','v2_NWO'])
            count_df = grouped.size().reset_index(name='counts')
            
            if count_df.shape[0] != 0:
                count_df["platform"] = "twitter"
                count_df.to_csv(f'./group_counts/twitter/liwc_{liwc_chunk}.csv', index=False) 
                liwc_chunk = liwc_chunk + 1

                ids_seen.append(df_deduplicated.id.tolist())
    except Exception as e:
        print(f"Error processing file {path_liwc}: {e}")