# Load data and python libraries

In [None]:
# make plots appear after the code cell
%matplotlib inline 

# data processing libraries
import pandas as pd

# display wider columns in pandas data frames where necessary
# (full option name is used: abbreviated names rely on pattern matching and can
#  become ambiguous if pandas adds options with similar names)
pd.set_option('display.max_colwidth', 150)

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#detect language
from langdetect import detect

# supporting libraries
import re
import pickle

In [None]:
# name of the data file
file_name = 'all-the-news-2-1.csv'

# folder paths used for input and output files
input_folder = './data/'
output_folder = './transition_files/'

In [None]:
# read the csv file into a pandas data frame
df_data = pd.read_csv(
    input_folder + file_name,  # file location
    encoding="ISO-8859-1",     # decode the file as ISO-8859-1 (Latin-1)
)

# show the size of the data frame and its first row (transposed for readability)
print(df_data.shape)
df_data.head(1).T

# Data Exploration and Cleaning

In [None]:
def check_data (df):
    """
    check data types of values in the pandas data frame and number of missing values

    input:
        df - as pandas data frame to analyze

    output:
        pandas data frame indexed by column name with
                                column data type,
                                the data type of an actual (non missing) value in the column
                                    ('' when the column has no non missing values),
                                number of missing values,
                                % of missing values (0.0 for a data frame without rows),
                                value example (None when the column has no non missing values)
    """
    n_rows = len(df)
    # use the columns of the data frame that was passed in, not of a global data frame
    columns = list(df.columns)

    df_data_types = []
    actual_data_types = []
    num_missing = []
    values = []

    for column in columns:
        #selecting only non missing values in the column
        non_missing = df[column].dropna()

        #count number of missing values
        num_missing.append(n_rows - len(non_missing))

        #getting column data type
        df_data_types.append(str(df[column].dtype))

        #getting data type of an actual value
        if len(non_missing) > 0:
            actual_value = non_missing.iloc[0]
            # str(type(x)) looks like "<class 'str'>"; keep the quoted type name
            m = re.search("'.+'", str(type(actual_value)))
            actual_dtype = m.group(0) if m else ''
        else:
            # column holds only missing values: there is no example value to inspect
            actual_value = None
            actual_dtype = ''
        actual_data_types.append(actual_dtype)
        values.append(actual_value)

    # guard against division by zero for a data frame without rows
    pct_missing = [round(n / n_rows * 100, 2) if n_rows else 0.0 for n in num_missing]

    #create data frame with data types comparison
    df_result = pd.DataFrame({
                              'data type': df_data_types,
                              'actual data type': actual_data_types,
                              'number of missing values': num_missing,
                              '% of missing values': pct_missing,
                              'value example': values
                             }, index=columns)
    return df_result

In [None]:
# data quality overview: column types, missing values and an example value per column
print('datatype = "object" means the column has string and/or missing values in it.')
check_data(df=df_data)

**NOTE:<br>More than 60% of the articles are assigned to a section of their publication, so we can use some of them for model validation.**

***
## Section data

In [None]:
print("NOTE: section data is noisy!")
print("Number of sections per publication:")
# number of distinct section labels used by each publication
s = df_data.groupby('publication')['section'].nunique().to_frame()
s.describe(percentiles=[0.6, 0.7, 0.8, 0.9, 0.95]).T

In [None]:
print("Number of articles per section:")
# number of non missing articles filed under each section label
s = df_data.groupby('section')['article'].count().to_frame()
s.describe(percentiles=[0.6, 0.7, 0.8, 0.9, 0.95]).T

In [None]:
# attach to every article the number of distinct sections of its publication
df_data['number_of_sections'] = (
    df_data
    .groupby('publication')['section']
    .transform("nunique")
)
df_data[['publication', 'section', 'number_of_sections']].head()

In [None]:
# bounds (inclusive) on the number of sections a publication may have to be kept
lower_thr = 10
upper_thr = 50
print("Take only publications with reasonable number of sections (%2d-%2d)"%(lower_thr, upper_thr))

# keep articles of publications whose section count lies within the bounds
df_test = df_data[df_data['number_of_sections'].between(lower_thr, upper_thr)]

print("Number of articles:", len(df_test), " out of", len(df_data))
print("\nPublications:", set(df_test['publication']))
print("\nSections:\n", set(df_test['section']))

# distribution of the number of sections over the selected publications
s = df_test.groupby('publication')['section'].nunique().to_frame()
s.describe(percentiles=[0.6, 0.7, 0.8, 0.9, 0.95]).T

**Sections to consider:**
- music
- culture 
- schools-brief
- business
- awards
- travel | outdoor
- sports
- real-estate
- politics
- tech
- economic-indicators | finance-and-economics | economic-and-financial-indicators
- health


etc. (needs to be discussed)

In [None]:
print("Number of articles per section IN SELECTED PUBLICATIONS:")
# number of non missing articles per section, restricted to the selected publications
s = df_test.groupby('section')['article'].count().to_frame()
s.describe(percentiles=[0.6, 0.7, 0.8, 0.9, 0.95]).T

## Text quality

In [None]:
# length of each article in characters; a missing article counts as length 0
df_data["text_length"] = df_data['article'].fillna("").map(len)

# descriptive statistics of the article lengths, shown as a single row
df_data["text_length"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_frame().T

In [None]:
# example: the first article that is shorter than 150 characters
df_data.loc[df_data["text_length"] < 150, 'article'].iloc[0]

In [None]:
# delete articles with length less than 10th percentile and more than 95th percentile
# of the sample, since they are suspiciously short or long
# NOTE: this cell overwrites df_data, so re-running it trims the data again
#calculate length percentiles
pct10 = df_data["text_length"].quantile(0.10)
pct95 = df_data["text_length"].quantile(0.95)

print('minimum length: ',df_data["text_length"].min())
print('maximum  length: ',df_data["text_length"].max())
print('\n10th percentile: ', pct10, '\n95th percentile: ', pct95)

#delete suspicious values
print('\n\nData size before deletion: ', len(df_data))
# .copy() makes the filtered frame independent of the original one, so that adding
# columns to df_data in later cells does not raise SettingWithCopyWarning
df_data = df_data[(df_data["text_length"] >= pct10) & (df_data["text_length"] <= pct95)].copy()
print('Data size after deletion:  ', len(df_data))

In [None]:
# descriptive statistics of the article lengths that remain after the filtering
print("Total number of observations: ", len(df_data))
df_data["text_length"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_frame().T

**NOTE:** Articles with missing text have a length of 0, so they were deleted from the data by the minimum-length filter above.

In [None]:
#test languages of every 500th article
def define_language(df, column):
    """
    detect the language of each text in a data frame column

    input:
        df - pandas data frame that holds the texts
        column - name of the column with the texts

    output:
        list with one entry per row of df (in row order):
        the value returned by detect() for the text,
        or "error" if detect() raised an exception for it
    """
    lang_list = []
    for text in df[column]:
        try:
            language = detect(text)
        # catch Exception instead of using a bare except, so that KeyboardInterrupt
        # and SystemExit can still stop a long-running detection loop
        except Exception:
            language = "error"
        lang_list.append(language)

    return lang_list

#############################################################
# take every 500th article; .copy() makes the sample an independent data frame, so
# adding a column to it does not raise SettingWithCopyWarning
# NOTE(review): langdetect results can vary between runs unless DetectorFactory.seed
# is set - confirm whether reproducible counts are needed here
df_test = df_data.iloc[::500].copy()
df_test['article_language'] = define_language(df_test, "article")
df_test['article_language'].value_counts()

## Date

In [None]:
# parse the article posting date into pandas datetime values where possible
# NOTE: a string that does not match the format is replaced with the missing value NaT
df_data['py_date'] = pd.to_datetime(
    df_data['date'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)
df_data[['date', 'py_date']].head()

In [None]:
# summary statistics of the parsed posting dates
df_data.loc[:, 'py_date'].describe()

In [None]:
# year of the posting date (missing where the date could not be parsed)
df_data['py_year'] = df_data['py_date'].dt.year

print("Number of articles per year:")
df_data['py_year'].value_counts()

In [None]:
# month of the posting date (missing where the date could not be parsed)
df_data['py_month'] = df_data['py_date'].dt.month

print("Number of articles per month:")
df_data['py_month'].value_counts().sort_index()

In [None]:
# number of distinct months with articles in each year
print("Number of months per year:")
print(df_data.groupby("py_year")['py_month'].nunique().sort_index())

# distinct months that have articles in 2020
print("\nCovered months in 2020:", set(df_data.loc[df_data['py_year'] == 2020, 'py_month']))

In [None]:
# distinct months that have articles in 2020
set(df_data.loc[df_data['py_year'] == 2020, 'py_month'])

***
# Conclusions:
- We have enough data with predefined labels to test a topic modeling algorithm.
- Only the first 4 months of 2020 are covered (relevant if we need to capture Covid-19 news).
- There are non-English articles! (They need to be cleaned out.)
