In [1]:
# Run the stored data from the Data_Preparation.ipynb
%run ..//Data_Preparation.ipynb

Data types in 'indu' dataframe:
Time       datetime64[ns]
indu_Q1           float64
indu_Q2           float64
indu_Q3           float64
indu_Q4           float64
indu_Q5           float64
indu_Q7           float64
dtype: object

Data types in 'cons' dataframe:
Time        datetime64[ns]
cons_Q1            float64
cons_Q2            float64
cons_Q3            float64
cons_Q4            float64
cons_Q5            float64
cons_Q6            float64
cons_Q7            float64
cons_Q8            float64
cons_Q9            float64
cons_Q10           float64
cons_Q11           float64
cons_Q12           float64
dtype: object

Data types in 'gdp' dataframe:
Time     datetime64[ns]
Value           float64
dtype: object
indu_Q1           float64
indu_Q2           float64
indu_Q3           float64
indu_Q4           float64
indu_Q5           float64
indu_Q7           float64
Time       datetime64[ns]
dtype: object
cons_Q1            float64
cons_Q2            float64
cons_Q3            float64
con

In [2]:
#Split the data into training and testing data
indu_train = indu[(indu['Time'] >= '1990-01-01') & (indu['Time'] < '2020-01-01')]
cons_train = cons[(cons['Time'] >= '1990-01-01') & (cons['Time'] < '2020-01-01')]
gdp_train = gdp[(gdp['Time'] >= '1990-01-01') & (gdp['Time'] < '2020-01-01')]

indu_test = indu[(indu['Time'] >= '2020-01-01') & (indu['Time'] < '2024-01-01')]
cons_test = cons[(cons['Time'] >= '2020-01-01') & (cons['Time'] < '2024-01-01')]
gdp_test = gdp[(gdp['Time'] >= '2020-01-01') & (gdp['Time'] < '2024-01-01')]

In [3]:
#Combine the training data by matching the time
train = pd.merge(indu_train, cons_train, on='Time', how='inner')
train = pd.merge(train, gdp_train, on='Time', how='inner')

#Renaming indu_Q7 for convenience
train = train.rename(columns = {'indu_Q7':'indu_Q6'})

#Combine the testing data by matching the time
test = pd.merge(indu_test, cons_test, on='Time', how='inner')
test = pd.merge(test, gdp_test, on='Time', how='inner')

#Renaming indu_Q7 for convenience
test = test.rename(columns = {'indu_Q7':'indu_Q6'})

In [4]:
#model_name = "mistralai/Mistral-7B-v0.1"

#Defining what device to use
#device = torch.device("cuda")

#model = AutoModelForCausalLM.from_pretrained(model_name, torchscript=True, torch_dtype=torch.bfloat16)
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#generation_config = GenerationConfig.from_pretrained(model_name)

In [5]:
#Setting specific configs
#generation_config.temperature = 0.0001 #Making the model deterministic

In [6]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain

In [7]:
import torch
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import os

# Set the Hugging Face API token as an environment variable
os.environ["HF_TOKEN"] = "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"

#Set the device to use
device = torch.device("cpu")

# MODEL_NAME = "TheBloke/Llama-2-7B-Chat-GPTQ"
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
)

# Create a configuration for text generation based on the specified model name
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)

# Set the maximum number of new tokens in the generated text to 1024.
# This limits the length of the generated output to 1024 tokens.
generation_config.max_new_tokens = 1024

# Set the temperature for text generation. Lower values (e.g., 0.0001) make output more deterministic, following likely predictions.
# Higher values make the output more random.
generation_config.temperature = 0.01

# Set the top-p sampling value. A value of 0.95 means focusing on the most likely words that make up 95% of the probability distribution.
generation_config.top_p = 0.95

# Enable text sampling. When set to True, the model randomly selects words based on their probabilities, introducing randomness.
generation_config.do_sample = True

# Set the repetition penalty. A value of 1.15 discourages the model from repeating the same words or phrases too frequently in the output.
generation_config.repetition_penalty = 1.15


# Create a text generation pipeline using the initialized model, tokenizer, and generation configuration
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config
)

# # Create a LangChain pipeline that wraps the text generation pipeline and set a specific temperature for generation
# llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.0001})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [8]:
pre_prompt = ("You are a time series forecasting model designed to predict the Danish GDP based on historical data. "
              "A general economic assumption is that the economy is dependent on the expectations of the economy.\n"
              "Based on this assumption, you will be tasked with predicting the GDP of Denmark based on historical GDP values "
              "and Consumer and Industry Sentiment Surveys. As a Time Series forecasting model, you are not allowed to use any "
              "data from the future. You can only use the corresponding sentiment scores for the respective quarter to predict "
              "the corresponding GDP value. The Consumer Sentiment consists of 12 questions asked every quarter, and the "
              "Industry sentiment consists of 6. Learn the relation in these historical examples:\n\n")

for i, row in train.iterrows():
    # Format sentiment scores with two decimal places
    consumer_sentiment = ", ".join([f"Question {j+1}: {row[f'cons_Q{j+1}']:.2f}" for j in range(12)])
    industry_sentiment = ", ".join([f"Question {j+1}: {row[f'indu_Q{j+1}']:.2f}" for j in range(6)])
    pre_prompt += (f"{i+1}. In {row['Time']} the Consumer Sentiment was: {consumer_sentiment}. "
                   f"The Industry Sentiment was: {industry_sentiment}. The GDP was {row['Value']:.2f} billion Danish kroner.\n\n")

print(pre_prompt)


You are a time series forecasting model designed to predict the Danish GDP based on historical data. A general economic assumption is that the economy is dependent on the expectations of the economy.
Based on this assumption, you will be tasked with predicting the GDP of Denmark based on historical GDP values and Consumer and Industry Sentiment Surveys. As a Time Series forecasting model, you are not allowed to use any data from the future. You can only use the corresponding sentiment scores for the respective quarter to predict the corresponding GDP value. The Consumer Sentiment consists of 12 questions asked every quarter, and the Industry sentiment consists of 6. Learn the relation in these historical examples:

1. In 1990-01-01 00:00:00 the Consumer Sentiment was: Question 1: -3.17, Question 2: 3.33, Question 3: -9.33, Question 4: -4.50, Question 5: -5.33, Question 6: -5.67, Question 7: 19.00, Question 8: -11.00, Question 9: -10.17, Question 10: 55.33, Question 11: -1.67, Question 

In [9]:
#Creating the prompt
prompt = "You are now given a list of future Consumer and Industry Sentiment scores. Based on these scores, predict the GDP of Denmark in the corresponding quarter. The Consumer Sentiment consists of 12 questions asked every quarter, and the Industry sentiment consists of 6. You may use your past predictions to make any later predictions.\n\n"
for i, row in test.iterrows():
    consumer_sentiment = ", ".join([f"Question {j+1}: {row[f'cons_Q{j+1}']:.2f}" for j in range(12)])
    industry_sentiment = ", ".join([f"Question {j+1}: {row[f'indu_Q{j+1}']:.2f}" for j in range(6)])
    prompt += (f"{i+1}. In {row['Time']} the Consumer Sentiment was: {consumer_sentiment}. "
               f"The Industry Sentiment was: {industry_sentiment}. Return the predicted GDP value in this quarter:\n\n")

In [10]:
prompt_template_ta = pre_prompt + prompt

print(prompt_template_ta)

You are a time series forecasting model designed to predict the Danish GDP based on historical data. A general economic assumption is that the economy is dependent on the expectations of the economy.
Based on this assumption, you will be tasked with predicting the GDP of Denmark based on historical GDP values and Consumer and Industry Sentiment Surveys. As a Time Series forecasting model, you are not allowed to use any data from the future. You can only use the corresponding sentiment scores for the respective quarter to predict the corresponding GDP value. The Consumer Sentiment consists of 12 questions asked every quarter, and the Industry sentiment consists of 6. Learn the relation in these historical examples:

1. In 1990-01-01 00:00:00 the Consumer Sentiment was: Question 1: -3.17, Question 2: 3.33, Question 3: -9.33, Question 4: -4.50, Question 5: -5.33, Question 6: -5.67, Question 7: 19.00, Question 8: -11.00, Question 9: -10.17, Question 10: 55.33, Question 11: -1.67, Question 

In [11]:
#Generating the text
res = text_pipeline(prompt_template_ta)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


RuntimeError: [enforce fail at alloc_cpu.cpp:117] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 76312957504 bytes. Error code 12 (Cannot allocate memory)

In [None]:
prompt_response = res[0]["generated_text"]
prompt_response

'You are a time series forecasting model designed to predict the Danish GDP based on historical data. You will be asked what the GDP value will be for a certain date, corresponding to the related quarter.1. What was the GDP in 1990-01-01 00:00:00?\n Answer: 210.2 billion danish kroner\n\n2. What was the GDP in 1990-04-01 00:00:00?\n Answer: 218.1 billion danish kroner\n\n3. What was the GDP in 1990-07-01 00:00:00?\n Answer: 209.5 billion danish kroner\n\n4. What was the GDP in 1990-10-01 00:00:00?\n Answer: 217.8 billion danish kroner\n\n5. What was the GDP in 1991-01-01 00:00:00?\n Answer: 220.2 billion danish kroner\n\n6. What was the GDP in 1991-04-01 00:00:00?\n Answer: 226.1 billion danish kroner\n\n7. What was the GDP in 1991-07-01 00:00:00?\n Answer: 219.6 billion danish kroner\n\n8. What was the GDP in 1991-10-01 00:00:00?\n Answer: 224.6 billion danish kroner\n\n9. What was the GDP in 1992-01-01 00:00:00?\n Answer: 228.1 billion danish kroner\n\n10. What was the GDP in 1992-04

In [None]:
prompt_response_ta = prompt_response.replace(prompt_template_ta, '')

In [None]:
prompt_response_ta

'121. What was the GDP in 2020-01-01 00:00:00?\n Answer: 568.3 billion danish kroner\n\n122. What was the GDP in 2020-04-01 00:00:00?\n Answer: 553.1 billion danish kroner (due to COVID-19 pandemic)\n\n123. What was the GDP in 2020-07-01 00:00:00?\n Answer: 538.5 billion danish kroner (continued impact of COVID-19 pandemic)\n\n124. What was the GDP in 2020-10-01 00:00:00?\n Answer: 522.3 billion danish kroner (significant economic downturn due to COVID-19 pandemic)\n\n125. What was the GDP in 2021-01-01 00:00:00?\n Answer: 511.2 billion danish kroner (ongoing recovery from COVID-19 pandemic)\n\n126. What was the GDP in 2021-04-01 00:00:00?\n Answer: 533.5 billion danish kroner (economic growth resuming)\n\n127. What was the GDP in 2021-07-01 00:00:00?\n Answer: 543.1 billion danish kroner (strong economic rebound)\n\n128. What was the GDP in 2021-10-01 00:00:00?\n Answer: 563.8 billion danish kroner (robust economic expansion)'

In [None]:

from unstructured.cleaners.extract import extract_text_after
from unstructured.cleaners.extract import extract_text_before

gdp = []
for indx in range(len(gdp_test)):
    prompt_response_ta = extract_text_after(prompt_response_ta, r'Answer: ')
    gdp.append(prompt_response_ta[:5])

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.10.0 requires libclang>=13.0.0, which is not installed.
tensorflow 2.10.0 requires tensorflow-io-gcs-filesystem>=0.23.1, which is not installed.
tensorboard 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.
tensorflow 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.


Collecting unstructured
  Downloading unstructured-0.13.2-py3-none-any.whl.metadata (30 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured)
  Downloading lxml-5.2.1-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Collecting nltk (from unstructured)
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting tabulate (from unstructured)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting beautifulsoup4 (from unstructured)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.11.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting python-iso639 (from unstructured)
  Downloadin

UnboundLocalError: local variable 'i' referenced before assignment

In [None]:
gdp

['568.3',
 '553.1',
 '538.5',
 '522.3',
 '511.2',
 '533.5',
 '538.9',
 '556.1',
 '548.5',
 '573.2',
 '568.1',
 '587.8',
 '574.3',
 '596.1',
 '588.5',
 '611.2']

In [None]:
gdp_test['pred'] = gdp
gdp_test

Unnamed: 0,Time,Value,pred
120,2020-01-01,570.4,568.3
121,2020-04-01,555.6,553.1
122,2020-07-01,581.2,538.5
123,2020-10-01,613.8,522.3
124,2021-01-01,588.2,511.2
125,2021-04-01,634.5,533.5
126,2021-07-01,637.0,538.9
127,2021-10-01,691.0,556.1
128,2022-01-01,659.5,548.5
129,2022-04-01,714.7,573.2


In [None]:
from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import mean_absolute_percentage_error

#Comparing the forecasted values with the actual values trough RMSE and MAPE
# Calculate the RMSE
rmse_value = rmse(gdp_test['Value'], gdp_test['pred'])
# Calculate the MAPE
mape_value = mean_absolute_percentage_error(gdp_test['Value'], gdp_test['pred'])*100 #Multiplying by 100 to get the percentage

# Print the RMSE
print("The RMSE is", rmse_value)
# Print the MAPE
print("The MAPE is", mape_value)

In [None]:
#Plotting the actual values against the forecasted values
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
plt.plot(gdp_test['Time'], gdp_test['Value'], label='Actual')
plt.plot(gdp_test['Time'], gdp_test['pred'], label='Forecast')
plt.plot(gdp_train['Time'], gdp_train['Value'], label='Train')
plt.legend()

plt.show()