In [1]:
import pandas as pd

In [2]:
# Load the raw questions table from a CSV path relative to the notebook's working directory.
data = pd.read_csv("../Data/questions_table.csv")
# Display the column labels to confirm what the file provides.
data.columns

Index(['question', 'answer', 'answer_type', 'derivation'], dtype='object')

In [3]:
# Inspect the distinct values found in the answer_type column.
data["answer_type"].unique()

array(['multi-span', 'span', 'arithmetic', 'count'], dtype=object)

In [4]:
# Keep only the rows labelled "span" or "arithmetic"; all other answer types are dropped.
dataset = data.loc[data["answer_type"].isin(["span", "arithmetic"])]

In [5]:
# Number of rows remaining after the answer_type filter.
len(dataset)

11265

In [6]:
# Preview the first rows of the filtered table.
dataset.head()

Unnamed: 0,question,answer,answer_type,derivation
1,How much is the 2019 rate of inflation?,['2.9'],span,
2,How much is the 2018 rate of inflation?,['2.9'],span,
3,What is the 2019 average rate of inflation?,2.9,arithmetic,(2.9+2.9)/2
4,What is the 2019 average rate of increase in s...,2.7,arithmetic,(2.7+2.7)/2
5,What is the difference between 2019 average ra...,0.2,arithmetic,[(2.9+2.9)/2] - [(2.7+2.7)/2]


In [7]:
# Work on an independent copy holding just the model input (question) and the label (answer_type).
dataset_n = dataset.loc[:, ["question", "answer_type"]].copy()

In [8]:
# Preview the two-column frame that feeds the train/test split.
dataset_n.head()

Unnamed: 0,question,answer_type
1,How much is the 2019 rate of inflation?,span
2,How much is the 2018 rate of inflation?,span
3,What is the 2019 average rate of inflation?,arithmetic
4,What is the 2019 average rate of increase in s...,arithmetic
5,What is the difference between 2019 average ra...,arithmetic


In [9]:
from sklearn.model_selection import train_test_split
# Features are the question strings; the target is the answer_type label.
X, y = dataset_n["question"], dataset_n["answer_type"]

# Shuffled split with a fixed seed so the partition is repeatable.
# test_size is not passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
)

In [10]:
# Show the first five training questions as a plain array.
X_train[:5].values

array(['What was the average settlements for 2017-2019?',
       'What was the estimated useful life of Towers in years?',
       'What is the average quarterly high sale price for 2019?',
       'What does the table show?',
       'What was the working capital in 2019?'], dtype=object)

In [11]:
# Show the first five training labels; the index carries the row labels of the source frame.
y_train[:5]

12630    arithmetic
7307           span
11472    arithmetic
1573           span
4538           span
Name: answer_type, dtype: object

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
# Learn the TF-IDF vocabulary on the training questions only (English stop words removed),
# then reuse the fitted vectorizer to project the held-out questions into the same space.
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with library-default settings, trained on the TF-IDF features.
model = LogisticRegression().fit(X_tfidf, y_train)

In [14]:
# Predict a label for every held-out question.
predictions = model.predict(X_test_tfidf)

# Spot-check the first few predictions against the true labels.
# Slicing each input bounds the loop directly, so no manual counter/break
# (and no leftover global counter variable) is needed.
n_preview = 5
for text, pred, true_label in zip(
    X_test.iloc[:n_preview],
    predictions[:n_preview],
    y_test.iloc[:n_preview],
):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")


Input: 'What was the change in the Total non-current trade and other payables in 2019 from 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the average hardware revenue from 2016 to 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage change in revenue generated from Partner C from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'As of March 29, 2019, What is Intangible assets expressed as a percentage of  Gross deferred tax assets?' --> Prediction: arithmetic --> True Label: arithmetic


In [15]:
import os

In [16]:
import pickle

In [17]:
def save_model(data, file_name, model_dir="../artifacts/model"):
    """Pickle ``data`` to ``<model_dir>/<file_name>.pkl`` and print the path.

    Parameters
    ----------
    data : object
        Any picklable object (for example a fitted vectorizer or estimator).
    file_name : object
        Base name of the output file, without extension; converted with ``str``.
    model_dir : str or path-like, optional
        Directory that receives the pickle. It is created, including parents,
        when missing. Defaults to the location this notebook has always used.

    Returns
    -------
    None

    Notes
    -----
    Errors raised while creating the directory, opening the file or pickling
    propagate to the caller unchanged.
    """
    os.makedirs(model_dir, exist_ok=True)
    # A literal "/" join keeps the printed path identical to earlier runs.
    file_path = f"{model_dir}/{file_name!s}.pkl"
    with open(file_path, "wb") as file:
        pickle.dump(data, file)
    print(f"Data saved succesfully at: {file_path}")

In [18]:
# Persist both fitted objects: the classifier was trained on this vectorizer's features,
# so the two pickles are written side by side.
save_model(file_name="vectorizer", data=vectorizer)
save_model(file_name="model", data=model)

Data saved succesfully at: ../artifacts/model/vectorizer.pkl
Data saved succesfully at: ../artifacts/model/model.pkl
