## Downloading Dataset

In [12]:
from mit_d3m import load_dataset # Importing Dataset Loader

# Load the '32_wikiqa' dataset; later cells read its .X, .y and .context attributes.
thirty_two = load_dataset('32_wikiqa') # Loading Dataset

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Importing Libraries

In [13]:
import numpy as np 
import pandas as pd
import warnings

### Data Preprocessing

In [14]:
from sdv import Metadata
from sklearn.model_selection import train_test_split, KFold, cross_val_score # Importing Library for Splitting the Dataset into Train & Test Set

### Models

In [15]:
from sklearn.linear_model import LogisticRegression # Importing 'LogisticRegression' Machine Learning Model
from sdv.relational import HMA1 # Importing 'Hierarchical Modeling Algorithm' Model

### Evaluation

In [48]:
# NOTE(review): KFold and cross_val_score are already imported in the Data Preprocessing
# imports cell; the two lines below repeat those imports.
from sklearn.model_selection import KFold # Importing 'KFold' for Evaluating Model
from sklearn.model_selection import cross_val_score # Importing 'cross_val_score' for Calculating Model Score
from sklearn.metrics import accuracy_score # Importing 'accuracy_score' Function for Model Evaluation
from numpy import mean, absolute # Importing 'mean' & 'absolute' to Calculate Accuracy Score
from sdv.metrics.relational import KSTest, CSTest # Importing 'KSTest' & 'CSTest' Classes for Synthetic Data Evaluation

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Removing Warning Messages

In [17]:
# Silence every warning from here on (no category or module filter is given).
# NOTE(review): this also hides any warnings raised by the modelling cells below;
# consider narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings(action = 'ignore') # Removing Warning Messages

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Predictor & Response Variables

In [18]:
# Pull the feature table and the response out of the loaded dataset in one step.
X, y = thirty_two.X, thirty_two.y # Feature Variables, Response Variable

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Relationship Between Datasets

In [19]:
# Display the dataset context: the target entity and the entity tables it is made of.
thirty_two.context

{'target_entity': 'learningData',
 'entities': {'learningData': (          d3mIndex  qIndex  sIndex
   d3mIndex                          
   0                0       0       0
   1                1       0       1
   2                2       0       2
   4                4       0       4
   5                5       1       5
   ...            ...     ...     ...
   29253        29253    3046   26191
   29254        29254    3046   26192
   29255        29255    3046   26193
   29256        29256    3046   26194
   29257        29257    3046   26195
   
   [23406 rows x 3 columns],
   'd3mIndex',
   None),
  'questions': (        qIndex                                           question
   qIndex                                                           
   0            0                     how are glacier caves formed ?
   1            1  How are the directions of the velocity and for...
   2            2                           how did apollo creed die
   3            3           

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Importing Datasets

In [20]:
# Learning dataset: the feature columns with the response column appended on the right.
learning_data = pd.concat((X, y), axis = 'columns')

# Question and answer-sentence tables, read from the local 'Data' directory.
questions = pd.read_csv('Data/questions.csv')
answers = pd.read_csv('Data/sentences.csv')

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Data Preprocessing

### Initializing Parent Table

In [21]:
# Start from an empty Metadata object; tables and relationships are registered in the next cells.
metadata = Metadata() # Creating an Instance of the Class

In [22]:
# Register the learning table twice, once per key column, so that each copy can
# act as the parent of one text table (questions via qIndex, answers via sIndex).
# NOTE(review): the displayed learning data repeats qIndex across rows (qIndex 0
# appears for d3mIndex 0, 1, 2 and 4), so qIndex is not unique in this table and may
# not be a valid primary key. The relationship probably needs inverting (questions
# as parent, learning data as child) — confirm against the SDV Metadata docs.
metadata.add_table(
    name = 'learningQ',
    data = learning_data,
    primary_key = 'qIndex',
) # Questions Dataset
metadata.add_table(
    name = 'learningS',
    data = learning_data,
    primary_key = 'sIndex',
) # Answers Dataset

In [23]:
# Show the field types and primary key recorded for the 'learningQ' table.
metadata.get_table_meta('learningQ') # Printing Parent Questions Metadata

{'fields': {'d3mIndex': {'type': 'numerical', 'subtype': 'integer'},
  'qIndex': {'type': 'id', 'subtype': 'integer'},
  'sIndex': {'type': 'numerical', 'subtype': 'integer'},
  'isAnswer': {'type': 'numerical', 'subtype': 'integer'}},
 'primary_key': 'qIndex'}

In [24]:
# Show the field types and primary key recorded for the 'learningS' table.
metadata.get_table_meta('learningS') # Printing Parent Answers Metadata

{'fields': {'d3mIndex': {'type': 'numerical', 'subtype': 'integer'},
  'qIndex': {'type': 'numerical', 'subtype': 'integer'},
  'sIndex': {'type': 'id', 'subtype': 'integer'},
  'isAnswer': {'type': 'numerical', 'subtype': 'integer'}},
 'primary_key': 'sIndex'}

### Initializing Child Table

In [25]:
# Attach each text table to its copy of the learning table through the shared key column.
metadata.add_table(
    name = 'questions',
    data = questions,
    primary_key = 'qIndex',
    parent = 'learningQ',
    foreign_key = 'qIndex',
) # Questions
metadata.add_table(
    name = 'answers',
    data = answers,
    primary_key = 'sIndex',
    parent = 'learningS',
    foreign_key = 'sIndex',
) # Answers

In [26]:
# Display the registered tables and the relationships between them.
metadata # Printing Dataset Relationship

Metadata
  root_path: .
  tables: ['learningQ', 'learningS', 'questions', 'answers']
  relationships:
    questions.qIndex -> learningQ.qIndex
    answers.sIndex -> learningS.sIndex

### Creating Dataset Dictionary 

In [27]:
# Map every table name registered in the metadata to its DataFrame.
# Both learning entries point at the same learning_data object.
tables = dict(
    learningQ = learning_data,
    learningS = learning_data,
    questions = questions,
    answers = answers,
)

In [28]:
# Display the name -> DataFrame mapping built above.
tables # Printing the Dictionary

{'learningQ':           d3mIndex  qIndex  sIndex  isAnswer
 d3mIndex                                    
 0                0       0       0         0
 1                1       0       1         0
 2                2       0       2         0
 4                4       0       4         0
 5                5       1       5         0
 ...            ...     ...     ...       ...
 29253        29253    3046   26191         0
 29254        29254    3046   26192         0
 29255        29255    3046   26193         0
 29256        29256    3046   26194         0
 29257        29257    3046   26195         0
 
 [23406 rows x 4 columns],
 'learningS':           d3mIndex  qIndex  sIndex  isAnswer
 d3mIndex                                    
 0                0       0       0         0
 1                1       0       1         0
 2                2       0       2         0
 4                4       0       4         0
 5                5       1       5         0
 ...            ...     .

### Extracting Predictor & Response Variable 

In [29]:
# Split the learning table by position: every column except the last is a
# predictor, the last column is the response.
# NOTE(review): the predictors shown for this table are d3mIndex, qIndex and sIndex,
# which look like identifiers rather than informative features — confirm this is intended.
X = learning_data.iloc[:, :-1] # Extracting Predictor Variables
# Keep the response as a 1-D Series. Wrapping it in a single-column DataFrame hands
# scikit-learn a column-vector target, which it only accepts with a
# DataConversionWarning (hidden here by the global warnings filter).
y = learning_data.iloc[:, -1] # Extracting Response Variable

### Training & Test Set Split

In [30]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
)

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Logistic Regression Classification Model

In [31]:
# Fit a logistic-regression classifier (default hyperparameters) on the training split.
model = LogisticRegression() # Initializing the Classification Model
model.fit(X_train, y_train) # Training the Model on Original Datapoints

LogisticRegression()

### Evaluating Model

In [32]:
# 10-fold cross-validation with shuffling; the fixed random_state keeps the folds repeatable.
cv = KFold(n_splits = 10, shuffle = True, random_state = 1) # Defining Cross-Validation Method

# Score the classifier on the full dataset. The target is flattened to 1-D so the
# estimator never receives a column-vector y, whether `y` is a Series or a
# single-column DataFrame.
scores = cross_val_score(model, X, np.ravel(y), cv = cv, n_jobs = -1) # Evaluating the Model

# The scores are accuracies (never negative), so the mean is taken directly; an
# absolute value is only needed for negated-error scorers.
print('Average Accuracy of the Model -> ', round(mean(scores) * 100), '%') # Printing Results

Average Accuracy of the Model ->  95 %


**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Generating Synthetic Data

In [1]:
# Build the HMA1 synthesizer from the metadata and fit it on the real tables.
# NOTE(review): the saved output of this cell is "NameError: name 'HMA1' is not defined"
# and its execution count is 1, which suggests it was last run on a fresh kernel before
# the imports cell. Re-run the notebook top to bottom to refresh this output.
syn_model = HMA1(metadata) # Creating an Instance of the Class
syn_model.fit(tables) # Applying the Model on the Data 

NameError: name 'HMA1' is not defined

#### Saving The Synthetic Data Generator Model

In [25]:
# Persist the fitted synthesizer to 'Model.pkl' (path is relative to the working directory).
syn_model.save('Model.pkl')

#### Loading The Saved Synthetic Data Generator Model

In [34]:
# Restore the synthesizer saved in the previous cell.
# NOTE(review): presumably this unpickles the file — only load model files you created yourself.
loaded = HMA1.load('Model.pkl')

In [35]:
# Sample from the restored synthesizer; the displayed result is a dict of synthetic tables keyed by table name.
syn_data = loaded.sample() # Generating Synthetic Data
syn_data # Printing the Generated Synthetic Tables

{'learningQ':        d3mIndex  qIndex  sIndex  isAnswer
 0          6743   23406    5898         0
 1         27886   23407   24831         0
 2         10778   23408    8147         0
 3         11251   23409    8681         0
 4         11512   23410   13184         0
 ...         ...     ...     ...       ...
 23401     18395   46807   15882         0
 23402     16988   46808   18394         0
 23403     15809   46809   10663         0
 23404     21842   46810   18109         0
 23405      7356   46811    8030         0
 
 [23406 rows x 4 columns],
 'questions':        qIndex                                           question
 0       23406                  what cords are used for in wiccan
 1       23407  How is constitutionalism used in the American ...
 2       23408                                how far is a league
 3       23409           what is the average wear time for braces
 4       23410      what made the civil war different from others
 ...       ...                   

#### Data Pre-Processing

In [36]:
# Take the synthetic 'learningQ' table; it is used below as the evaluation set for the trained model.
data = syn_data['learningQ'] # Extracting Synthetic Learning Table
data # Printing Results

Unnamed: 0,d3mIndex,qIndex,sIndex,isAnswer
0,6743,23406,5898,0
1,27886,23407,24831,0
2,10778,23408,8147,0
3,11251,23409,8681,0
4,11512,23410,13184,0
...,...,...,...,...
23401,18395,46807,15882,0
23402,16988,46808,18394,0
23403,15809,46809,10663,0
23404,21842,46810,18109,0


In [41]:
# Same positional split as for the original data: all columns but the last are
# predictors, the last column is the response.
syn_X, syn_y = data.iloc[:, :-1], data.iloc[:, -1]

#### Evaluating Synthetic Instances

**Testing Generated Synthetic Data On The Model Trained with Original Datapoints**

In [42]:
# Labels predicted for the synthetic rows by the model fitted on the original data.
# The name `syn_acc` is kept unchanged; note that it holds predictions, not an accuracy.
syn_acc = model.predict(syn_X) # Prediction
# accuracy_score takes (y_true, y_pred): the synthetic labels are the reference and
# the model output is the prediction. Accuracy is symmetric, so the number is the
# same as before, but the documented order keeps the call correct if the metric is swapped.
print("Average Accuracy of the Model-> ", round(accuracy_score(syn_y, syn_acc)*100), "%") # Average Accuracy Score of the Model

Average Accuracy of the Model->  2 %


**Statistical Metrics**

In [49]:
# Compare the real tables with the synthetic tables using SDV's relational metrics.
KS_test = KSTest.compute(tables, syn_data) # Inverted Kolmogorov-Smirnov Test
CS_test = CSTest.compute(tables, syn_data) # Chi-Squared Test

In [50]:
# Collect both metric scores in a one-column frame.
evaluation = pd.DataFrame({'Result': [KS_test, CS_test]})

# Display a copy with readable row labels; `evaluation` itself keeps its 0/1 index.
evaluation.rename(index = {0:'Inverted Kolmogorov-Smirnov Test', 1:'Chi-Squared Test'})

Unnamed: 0,Result
Inverted Kolmogorov-Smirnov Test,0.388112
Chi-Squared Test,1.0


In [51]:
# Complement (1 - score) of each metric.
# NOTE(review): these are labelled as p-values, but per the SDV metric docs CSTest
# appears to report a p-value already and KSTest reports 1 minus the KS statistic,
# so these complements are probably not p-values — confirm before interpreting them.
p_CS_test = 1-CS_test # Complement of the Chi-Squared Test Score
p_KS_test = 1-KS_test # Complement of the Inverted Kolmogorov-Smirnov Test Score

In [52]:
# Collect both complemented scores in a one-column frame.
p_value = pd.DataFrame({'P Values': [p_CS_test, p_KS_test]})

# Display a copy with readable row labels; `p_value` itself keeps its 0/1 index.
p_value.rename(index = {0:'Chi-Squared P Value', 1:'Inverted Kolmogorov-Smirnov P Value'})

Unnamed: 0,P Values
Chi-Squared P Value,0.0
Inverted Kolmogorov-Smirnov P Value,0.611888


**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**