# Functions for primary data set manipulation

In [9]:
import pandas as pd
import numpy as np
import os

## Data loading and cleaning

In [10]:
def load_scraped_data(folder):
    """
    Loads all the '*_cleaned.csv' files from the folder and
    concatenates them into one dataframe.

    Every file is read with '|' as a separator and gets a column
    'file_key' (the part of the file name before '_output', like '0-15'),
    so we can identify which file each row comes from.
    The result is sorted by the column 'date' descending, gets a fresh
    0..n-1 index, and missing values in the column 'body' are replaced
    with empty strings.

    Args: folder - string, a path of a folder with files
    Returns: df_scraped - dataframe
    Raises: FileNotFoundError - if the folder contains no '*_cleaned.csv' files
    """
    suffix = '_cleaned.csv'

    # Get a list of files for loading.
    # NOTE: the order is whatever os.listdir returns; it is not sorted here.
    files = [file for file in os.listdir(folder) if file.endswith(suffix)]
    if not files:
        raise FileNotFoundError(
            "No '*{}' files found in folder: {}".format(suffix, folder))

    # Load every file and tag its rows with the key of the source file
    frames = []
    for file in files:
        df = pd.read_csv(os.path.join(folder, file), sep='|')
        marker = file.rfind('_output')
        # rfind returns -1 when '_output' is absent; slicing with -1 would
        # silently cut the last character, so fall back to the name
        # without the suffix
        key = file[:marker] if marker != -1 else file[:-len(suffix)]
        df['file_key'] = key
        frames.append(df)

    # One concatenation instead of re-copying the accumulated data per file
    df_scraped = pd.concat(frames)

    # Give a meaningful name to the first column
    df_scraped.rename(columns={'Unnamed: 0': 'id_in_source_file'}, inplace=True)

    # Sort values by time descending
    df_scraped.sort_values(by='date', ascending=False, inplace=True)

    # Add a unique index
    df_scraped.reset_index(inplace=True, drop=True)

    # Replace np.Nan in the column "body" with empty string
    # otherwise tokenization can't be applied
    df_scraped.loc[df_scraped['body'].isnull(), 'body'] = ''

    return df_scraped

In [24]:
def add_columns(df_scraped):
    """
    Performs the following manipulations with dataframe:
    - Adds columns useful for data grouping
      ('datetime', 'dat', 'year', 'year_month', 'hour', 'weekday')
    - Adds a level (whether a row represents an individual
      news story or a whole newscast) as a bool column 'whole newscast'

    A row is marked as a whole newscast only if its title contains one of
    the newscast phrases AND a time can be extracted from the title.
    Rows with a missing title are marked False.

    The dataframe is modified in place and also returned.

    Args: df_scraped - a dataframe with scraped data
          (the columns 'date' and 'title' are used)
    Returns: df_scraped - the initial dataframe with added columns
    """

    # Columns useful for data grouping

    # Create a column with a date-time in date-time format
    df_scraped['datetime'] = pd.to_datetime(df_scraped['date'])
    moment = df_scraped['datetime'].dt
    # Create a column with date
    df_scraped['dat'] = moment.date
    # Create a column with year
    df_scraped['year'] = moment.year
    # Create a column with year_month
    df_scraped['year_month'] = moment.to_period('M')
    # Create a column with hour
    df_scraped['hour'] = moment.hour
    # Create a column with a day of week
    df_scraped['weekday'] = moment.dayofweek

    # Level (whether a row represents an individual
    # news story or a whole newscast) - new column 'whole newscast' (bool)
    titles = df_scraped['title'].str

    # na=False: a missing title yields False instead of NaN, so the
    # column is always a real bool column
    looks_like_newscast = titles.contains(
        "Выпуск программы «Время»|Выпуск программы «Воскресное Время»|Выпуск программы «Воскресное время»|Выпуск новостей",
        na=False)

    # Time of a newscast extracted from its title: either the hour of an
    # "HH:MM" time or the number before " час от ". The second form wins
    # when both are present. expand=False returns a Series, not a DataFrame.
    hour_from_clock = titles.extract('^[а-яА-Я«» ]*([0-9]*):[0-9][0-9].*', expand=False)
    hour_from_phrase = titles.extract('^[а-яА-Я«» ]*([0-9]*) час от .*', expand=False)
    title_time = pd.to_numeric(hour_from_phrase.fillna(hour_from_clock),
                               errors='coerce')

    # If there are no time in a title, it is an individual news story
    df_scraped['whole newscast'] = looks_like_newscast & title_time.notna()

    return df_scraped

In [14]:
def delete_duplicates(df_scraped):
    """
    Removes rows that should not take part in further analysis:
    - all rows of the earliest and the latest day (column 'dat'),
      because the loading for these days is not full
    - duplicate rows (the first occurrence is kept)

    Rows are dropped by index label, in place; the same dataframe
    object is returned.

    Args: df_scraped - a dataframe with scraped data
          This function uses added columns, so run
          the function "add_columns" before it
    Returns: df_scraped - the initial dataframe with deleted rows
    """
    # Step 1: drop the boundary days, whose data is incomplete
    days = df_scraped['dat']
    edge_days = [days.max(), days.min()]
    on_edge_day = days.isin(edge_days)
    df_scraped.drop(index=df_scraped.index[on_edge_day.to_numpy()], inplace=True)

    # Step 2: drop repeated rows. Rows are compared on the columns at
    # positions 1..6 (all columns before file_key except 'id_in_source_file')
    compared_columns = df_scraped.columns[1:7].tolist()
    repeated = df_scraped.duplicated(subset=compared_columns, keep='first')
    df_scraped.drop(index=df_scraped.index[repeated.to_numpy()], inplace=True)

    return df_scraped

In [15]:
def load_clean_scraped_data(folder):
    """
    Loads all the files from the folder and
    concatenates them in one dataframe.

    Performs the following manipulations:
    - Adds columns useful for data grouping
    - Adds a level (whether a row represents an individual
      news story or a whole newscast)
    - Deletes duplicates rows
    - Deletes the first and the last days, because the loading
      for them is not full
    - Resets the index to 0..n-1
    - Overwrites a few known inaccurate video duration values
      (only when the dataframe is long enough to contain all of them)

    Args: folder - string, a path of a folder with files
    Returns: df_scraped - dataframe
    """
    df_scraped = load_scraped_data(folder)
    df_scraped = add_columns(df_scraped)
    df_scraped = delete_duplicates(df_scraped)
    df_scraped.reset_index(inplace=True, drop=True)

    # Update inaccurate video duration values.
    # The rows and the column are addressed by POSITION, so the values below
    # are only meaningful for the data set they were determined on.
    duration_column = 7
    corrections = {
        # https://www.1tv.ru/news/2007-06-20/209380-militsionery_ranenye_v_rezultate_napadeniya_na_bazu_omona_nahodyatsya_v_gospitale
        348394: 8,
        # https://www.1tv.ru/news/2007-07-06/204193-v_krasnoyarskom_krae_ischut_propavshih_turistov_iz_moskvy
        347534: 24,
        # https://www.1tv.ru/news/2007-10-16/203095-tsik_zaregistriroval_federalnye_spiski_esche_3_politicheskih_partiy
        342323: 51,
        # https://www.1tv.ru/news/2008-02-03/196382-brachnye_muzy_nikolya_sarkozi_nakonets_zhenilsya_na_karle_bruni
        337016: 181,
    }

    # Every patched position must exist; the previous threshold (337000) was
    # lower than the largest position and led to an IndexError on smaller data
    if len(df_scraped) > max(corrections):
        for row_position, value in corrections.items():
            df_scraped.iloc[row_position, duration_column] = value

    return df_scraped