## 1. Load essential libraries

In [None]:
import os, json
import pandas as pd
import numpy as np
import regex as re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag_sents
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Upload Data

In [None]:
# Mount Google Drive when running on Colab. This stays best-effort (the cell
# never raises), but failures are reported instead of being silently swallowed,
# and KeyboardInterrupt / SystemExit are no longer caught.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # google.colab could not be imported, so there is nothing to mount.
    print("google.colab is not available; skipping Google Drive mount.")
except Exception as exc:
    print(f"Could not mount Google Drive: {exc}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the Banque de France data, located under the current working directory.
path = f"{os.getcwd()}/drive/MyDrive/Fiscal_Policy_Project/Data/Banque_de_France"

In [None]:
# Entries of the data folder that are not regular files (i.e. the sub-folders).
contents = [
    entry
    for entry in os.listdir(path)
    if not os.path.isfile(os.path.join(path, entry))
]

# Flat list of (folder, filename) tuples, one per '.json' file found in those sub-folders.
json_files = [
    (folder, filename)
    for folder in contents
    for filename in os.listdir(os.path.join(path, folder))
    if filename.endswith('.json')
]

## 3. Create DataFrame

In [None]:
# Create a dataframe listing every JSON file as (Filetype = folder name, Filename).
df = pd.DataFrame(data=json_files, columns=["Filetype", "Filename"])

# Preview the first and last five rows. DataFrame.append is deprecated (and
# removed in pandas 2.0), so the two slices are combined with pd.concat.
pd.concat([df.head(), df.tail()])

  df.head().append(df.tail())


Unnamed: 0,Filetype,Filename
0,Interview,2022-02-22_305.json
1,Interview,2021-06-11_459.json
2,Interview,2013-10-14_448.json
3,Interview,2018-01-26_461.json
4,Interview,2000-11-21_293.json
841,Autres-interventions,2020-05-05_52.json
842,Autres-interventions,2009-01-23_221.json
843,Autres-interventions,2007-10-10_130.json
844,Autres-interventions,2012-03-27_109.json
845,Autres-interventions,2019-11-27_198.json


In [None]:
# Build a DataFrame holding, for every JSON file, its published date, author
# and text together with the folder and file name it was read from.
#
# Rows are collected in a plain list and the frame is created once at the end:
# assigning through `.loc[i]` on a growing DataFrame re-allocates on every row.
records = []
for folder, filename in json_files:
    with open(os.path.join(path, folder, filename), 'r', encoding='utf-8') as fd:
        json_text = json.load(fd)

    # dict.get returns None for a missing key, so absent fields become missing values.
    records.append([
        json_text.get('published_date'),
        json_text.get('author'),
        json_text.get('text'),
        folder,
        filename,
    ])

jsons_data = pd.DataFrame(records, columns=['DATE', 'AUTHOR', 'TEXT', 'FOLDER', 'FILE NAME'])

jsons_data

Unnamed: 0,DATE,AUTHOR,TEXT,FOLDER,FILE NAME
0,2022-02-22,François Villeroy De Galhau,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...,Interview,2022-02-22_305.json
1,2021-06-11,François Villeroy De Galhau,« On ne change pas une politique monétaire qui...,Interview,2021-06-11_459.json
2,2013-10-14,Christian Noyer,"\n \nInterview de Christian Noyer, Gouverneur...",Interview,2013-10-14_448.json
3,2018-01-26,francois villeroy de galhau,(This is not a legal transcript. Bloomberg LP ...,Interview,2018-01-26_461.json
4,2000-11-21,,BE/FC 2lsl November 2000 INTERVIEW DE M. LE GO...,Interview,2000-11-21_293.json
...,...,...,...,...,...
841,2020-05-05,Sylvie GOULARD,1 \n \nApprenant de la coopération Européenne ...,Autres-interventions,2020-05-05_52.json
842,2009-01-23,,\n \n \n \n \n \n \n \n \n \nCrise financière...,Autres-interventions,2009-01-23_221.json
843,2007-10-10,,\n 1\n \n \n \n \nConférence annuelle de la B...,Autres-interventions,2007-10-10_130.json
844,2012-03-27,,1 \n \n \n \nMISSION COMMUNE D’INFORMATION SUR...,Autres-interventions,2012-03-27_109.json


## 4. Clean Data

In [None]:
# Count the rows whose AUTHOR value is missing.
jsons_data['AUTHOR'].isnull().sum()

42

In [None]:
def preprocess(txt):
    """Normalise a raw document into lowercase, space-separated word tokens.

    The cleaning steps, in order:

    1. remove URLs;
    2. lowercase the text;
    3. replace every non-word character and every digit with a space;
    4. drop tokens that consist of a single ASCII letter (``a``-``z``);
    5. collapse whitespace runs to one space and trim both ends.

    Parameters
    ----------
    txt : str or None
        Raw text. Any non-string value (e.g. ``None`` for a missing field)
        is treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing remains.
    """
    # A missing or non-string text cannot be cleaned; return the empty string.
    if not isinstance(txt, str):
        return ""

    txt = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", txt)
    txt = txt.lower()
    txt = re.sub(r"\W", " ", txt)
    txt = re.sub(r"\d", " ", txt)
    # Lookarounds (instead of consuming the surrounding whitespace) make sure
    # every isolated single letter is removed, including consecutive ones such
    # as "x a b y" and those at the very start or end of the text.
    txt = re.sub(r"(?<!\S)[a-z](?!\S)", " ", txt)
    txt = re.sub(r"\s+", " ", txt)
    # Trim so that a document with no remaining tokens is exactly ''.
    return txt.strip()


In [None]:
# Clean every raw document and keep the result in a new 'Processed TEXT' column.
jsons_data['Processed TEXT'] = jsons_data['TEXT'].apply(preprocess)

In [None]:
# Drop the documents whose processed text is empty. A boolean mask replaces the
# row-by-row loop that dropped rows in place while iterating, and text made up
# only of whitespace is treated as empty too. The original index labels are kept.
has_no_text = jsons_data['Processed TEXT'].str.strip().eq('')
jsons_data = jsons_data.loc[~has_no_text].copy()

In [None]:
# Display the processed text column.
jsons_data.loc[:, 'Processed TEXT']

0       libération le février inflation est un peu la...
1       on ne change pas une politique monétaire qui ...
2       interview de christian noyer gouverneur de la...
3       this is not legal transcript bloomberg lp can...
4      be fc lsl november interview de le gouverneur ...
                             ...                        
841     apprenant de la coopération européenne dans l...
842     crise financière mondiale stratégies publique...
843     conférence annuelle de la banque de lettonie ...
844     mission commune information sur le fonctionne...
845     université de rennes novembre quelles sont le...
Name: Processed TEXT, Length: 846, dtype: object

## 5. Remove Stopwords

In [None]:
# Display the NLTK stopword list for French.
french_stopwords = stopwords.words('french')
french_stopwords

In [None]:
# Number of documents handled so far in the current pass (drives the progress message).
counter = 0

# language -> set of stopwords, filled the first time a language is requested.
_STOPWORD_SETS = {}


def remove_stopwords(text, language, processed, verbose):
    """Remove the stopwords of `language` from a whitespace-tokenised text.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace. A non-string value is
        treated as having no words.
    language : str
        Language name passed to ``stopwords.words``.
    processed : sized collection
        Collection being processed; only ``len(processed)`` is used, for the
        progress message and to reset the counter at the end of a pass.
    verbose : int
        When equal to 1, print a progress line every 10 calls.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    global counter

    # Fetch the stopword list once per language and keep it as a set. The list
    # was previously re-requested for every single word and searched linearly.
    stop_set = _STOPWORD_SETS.get(language)
    if stop_set is None:
        stop_set = set(stopwords.words(language))
        _STOPWORD_SETS[language] = stop_set

    word_list = text.split() if isinstance(text, str) else []
    text = ' '.join(word for word in word_list if word not in stop_set)

    counter += 1
    if verbose == 1 and counter % 10 == 0:
        print(f"processed {counter}/{len(processed)}")
    # Reset once the whole collection has been seen so the next pass starts at 0.
    if counter >= len(processed):
        counter = 0
    return text

In [None]:
# Strip French stopwords from every processed document. The column itself is
# passed as `processed` so the progress message can report the total, and the
# trailing 1 is the `verbose` flag.
processed_texts = jsons_data['Processed TEXT']
jsons_data['Processed TEXT'] = processed_texts.apply(
    remove_stopwords,
    args=('french', processed_texts, 1),
)


processed 10/6997
processed 20/6997
processed 30/6997
processed 40/6997
processed 50/6997
processed 60/6997
processed 70/6997
processed 80/6997
processed 90/6997
processed 100/6997
processed 110/6997
processed 120/6997
processed 130/6997
processed 140/6997
processed 150/6997
processed 160/6997
processed 170/6997
processed 180/6997
processed 190/6997
processed 200/6997
processed 210/6997
processed 220/6997
processed 230/6997
processed 240/6997
processed 250/6997
processed 260/6997
processed 270/6997
processed 280/6997
processed 290/6997
processed 300/6997
processed 310/6997
processed 320/6997
processed 330/6997
processed 340/6997
processed 350/6997
processed 360/6997
processed 370/6997
processed 380/6997
processed 390/6997
processed 400/6997
processed 410/6997
processed 420/6997
processed 430/6997
processed 440/6997
processed 450/6997
processed 460/6997
processed 470/6997
processed 480/6997
processed 490/6997
processed 500/6997
processed 510/6997
processed 520/6997
processed 530/6997
pr

In [None]:
# Serialise the processed DataFrame to a pickle file inside the data folder.
pickle_path = path + '/jsons_data_BdF.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(jsons_data, pickle_file)