In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import os

# Directory holding the merged MPR data, resolved relative to the current
# working directory: <cwd>/Data/mpr_data_merged
file_path = Path.cwd().joinpath("Data", "mpr_data_merged")

## NLTK ##
from nltk.corpus import stopwords

# Import and Clean

In [2]:
## Read in the MPR data file ##
df = pd.read_csv(file_path.joinpath('mpr_full_R2D.csv'))

# Parse the report date so it can be filtered and converted to a period below
df['mpr'] = pd.to_datetime(df['mpr'])
# Select only data with dates starting in 2000. drop=True discards the old
# index directly instead of inserting it as a column and dropping it again.
df = df[df['mpr'] >= "2000-01"].reset_index(drop=True)

# Re-classify by shifting every label down by one, so that:
#   0 = negative
#   1 = neutral
#   2 = positive
df['class'] = df['class'] - 1

# FORMATTING DATE
# Convert the timestamps straight to a quarterly period (e.g. 2000Q1); no
# round trip through a 'YYYY-MM' string is needed.
df['mpr'] = df['mpr'].dt.to_period('Q')
# Quarter number of each period, stored compactly as int8
df['quarter'] = df['mpr'].dt.quarter.astype('int8')
# Calendar year of each period, stored as int16 (read from the period itself
# rather than by regex on its string form)
df['year'] = df['mpr'].dt.year.astype('int16')

df['raw_text'] = df['text']  # this will be the unedited text

In [3]:
# Preview the first rows to check the derived quarter / year / raw_text columns
df.head()

Unnamed: 0,text,class,mpr,wordcount,NPositiveWords,NNegativeWords,NNeutralWords,NUncertainWords,NStrongWords,NWeakWords,...,Poswords,Negwords,Neuwords,Unwords,Strongwords,Weakwords,Conwords,quarter,year,raw_text
0,Information received since the last Monetary P...,2,2000Q1,41,1,0,41,0,0,0,...,stronger,,information received since the last monetary p...,,,,,1,2000,Information received since the last Monetary P...
1,With the further strengthening of global deman...,2,2000Q1,25,1,0,25,0,0,0,...,strengthening,,with the further strengthening of global deman...,,,,,1,2000,With the further strengthening of global deman...
2,"This has been particularly true for oil, lumbe...",2,2000Q1,18,0,0,17,0,0,0,...,,,this has been particularly true for oil lumber...,,,,constraints,1,2000,"This has been particularly true for oil, lumbe..."
3,Higher crude oil prices have led to higher ene...,0,2000Q1,25,0,0,25,0,0,0,...,,,higher crude oil prices have led to higher ene...,,,,,1,2000,Higher crude oil prices have led to higher ene...
4,"As yet, however, these countries have not seen...",1,2000Q1,26,0,0,26,0,0,0,...,,,as yet however these countries have not seen a...,,,,,1,2000,"As yet, however, these countries have not seen..."


In [5]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Only the digit characters are removed; surrounding characters, including
    any whitespace or punctuation next to a number, are left in place.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Strip digits from every row. One run is enough: the substitution is
# idempotent, so running this cell again changes nothing. It does NOT touch
# whitespace, so a removed number leaves its neighbouring spaces behind
# (e.g. "grew 3 percent" -> "grew  percent").
df['text'] = df['text'].apply(remove_numbers)

### Export data for deep models 

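Stop words are **kept** in this export: when the notebook is run top to bottom, only digits have been stripped from `text` at this point (case and punctuation are untouched).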

In [11]:
### EXPORT DATA ###
# df.to_csv(file_path.joinpath("data_deep_model_ready_V3.csv"), index=False)

# Preprocessing Data

In [7]:
# Words that must NOT be treated as stop words. Contractions are listed both
# with and without the apostrophe.
negate = [
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
    "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
    "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
    "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing",
    "nowhere", "oughtnt", "shant", "shouldnt", "wasnt", "werent", "oughtn't",
    "shan't", "shouldn't", "wasn't", "weren't", "without", "wont", "wouldnt",
    "won't", "wouldn't", "rarely", "seldom", "despite", "no", "nobody",
]


# NLTK's English stop-word list with every entry of `negate` filtered out
selected_stopwords = list(
    filter(lambda term: term not in negate, stopwords.words('english'))
)

In [8]:
# Make all words LOWER CASE (vectorised string accessor instead of a per-row lambda)
df['text'] = df['text'].str.lower()

# Fuse "n't" contractions BEFORE punctuation is stripped ("don't" -> "dont").
# Otherwise the apostrophe is turned into a space, "don't" becomes "don t",
# and neither spelling kept in the `negate` list ever matches, so the negation
# is lost. Both the straight (') and the typographic (\u2019) apostrophe are
# handled.
df['text'] = df['text'].str.replace(r"n['\u2019]t\b", 'nt', regex=True)

# Remove PUNCTUATION: every run of characters that are not ASCII letters or
# digits (whitespace included) collapses to a single space
df['text'] = df['text'].str.replace('[^A-Za-z0-9]+', ' ', regex=True)

## Remove STOPWORDS ##
def remove_stopwords(sentence, stopwords):
    """Return ``sentence`` with every word found in ``stopwords`` dropped.

    Parameters
    ----------
    sentence : str
        Text to filter. It is split on whitespace, so leading, trailing and
        repeated whitespace is normalised away in the result.
    stopwords : collection of str
        Words to discard. Matching is exact and case-sensitive.

    Returns
    -------
    str
        The surviving words, in their original order, joined by single spaces.
    """
    # A list/tuple makes every `in` test a linear scan; build a set once per
    # call so each lookup is constant time. Other containers are used as-is.
    if isinstance(stopwords, (list, tuple)):
        stopwords = set(stopwords)
    return ' '.join(word for word in sentence.split() if word not in stopwords)

# Remove stopwords from every row; `selected_stopwords` is handed to
# remove_stopwords as its second positional argument
df['text'] = df['text'].apply(remove_stopwords, args=(selected_stopwords,))

### Export data ready for ML modelling

Stop words are **removed** in this export; the words listed in `negate` are deliberately kept.

In [10]:
### EXPORT DATA ###
# df.to_csv(file_path.joinpath("data_model_ready_V3.csv"), index=False)