In [64]:
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim
import sklearn
import tqdm
import copy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

In [3]:
# Read embeddings for entity and relations for animals dataset
ent = pd.read_csv('ConEx_entity_embeddings.csv')
rel = pd.read_csv('ConEx_relation_embeddings.csv')

In [4]:
ent.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,0,0.280981,-0.049617,-0.003215,-0.483546,-0.123656,0.564292,0.432948,0.215064,-0.155113,...,0.127948,-0.174766,-0.222745,-0.103216,-0.015835,-0.207434,-0.335994,-0.205721,0.166335,-0.027663
1,2,0.259497,-0.124253,0.516882,-0.507972,0.049873,-0.423409,0.301144,0.041155,-0.204702,...,-0.203218,0.163124,-0.267435,-0.214824,0.091249,-0.208819,-0.376532,-0.185556,0.006606,0.095279
2,22-rdf-syntax-ns#nil,0.030401,-0.38087,0.387651,-0.021696,0.417615,0.25437,0.214374,0.199189,-0.328048,...,-0.338145,-0.275059,0.547209,-0.196628,-0.119073,-0.189195,-0.154224,-0.347735,-0.346463,-0.072465
3,4,0.162839,0.023307,-0.004272,0.258616,0.18264,0.296118,0.34232,0.231499,-0.152109,...,-0.249842,0.077031,-0.241548,-0.195538,-0.170217,-0.172525,-0.368295,-0.326116,-0.198746,0.095087
4,Air,0.280026,0.329111,0.35139,-0.206114,0.151328,0.318364,0.217821,-0.110135,0.014867,...,-0.103858,-0.237071,-0.178284,-0.067466,-0.203515,-0.179597,-0.09736,-0.358156,0.012783,0.263368


In [5]:
ent.shape

(141, 41)

In [6]:
rel.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,22-rdf-syntax-ns#first,0.249388,-0.042922,-0.122261,-0.847049,0.265521,-0.274429,0.199557,0.32065,-0.461976,...,-0.174452,-0.250959,0.013102,-0.72028,-0.839384,0.428789,-0.575384,0.271931,0.039657,0.689879
1,22-rdf-syntax-ns#rest,0.99615,-0.03518,0.14827,0.10984,0.319888,-0.135079,0.572711,-0.007515,-0.169387,...,-0.342145,-0.175748,-0.329169,-0.38017,-0.275613,-0.293082,-0.703566,0.427364,-0.377466,0.998981
2,22-rdf-syntax-ns#type,-0.046696,-0.156884,0.145566,0.240592,0.047457,0.111323,-0.675762,-0.219875,0.263582,...,0.626799,-0.090546,0.241027,0.074701,0.174568,0.020676,0.337071,-0.140033,-0.096334,-0.118992
3,owl#hasValue,-0.29027,-0.367704,-0.098322,-0.260665,-0.219107,0.580961,0.344901,-0.437576,0.536663,...,-0.095163,-0.410139,0.019402,-0.42746,-0.244651,-0.108574,-0.184841,0.215328,0.509272,-0.233955
4,owl#members,-0.974812,-0.194855,-0.282236,-0.13011,-0.262638,0.086012,0.493479,0.023571,0.298599,...,-0.117912,0.397806,-0.207,0.214214,0.226637,0.072036,-0.015147,-0.174736,-0.434278,-0.35991


In [7]:
rel.shape

(11, 41)

In [8]:
rel.max()
rel.min()
feature_cols = rel.columns[1:]

In [9]:
rel_Y = rel['Unnamed: 0']
rel_X = rel[feature_cols]
rel_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.249388,-0.042922,-0.122261,-0.847049,0.265521,-0.274429,0.199557,0.32065,-0.461976,-0.031371,...,-0.174452,-0.250959,0.013102,-0.72028,-0.839384,0.428789,-0.575384,0.271931,0.039657,0.689879
1,0.99615,-0.03518,0.14827,0.10984,0.319888,-0.135079,0.572711,-0.007515,-0.169387,-0.065578,...,-0.342145,-0.175748,-0.329169,-0.38017,-0.275613,-0.293082,-0.703566,0.427364,-0.377466,0.998981
2,-0.046696,-0.156884,0.145566,0.240592,0.047457,0.111323,-0.675762,-0.219875,0.263582,-0.324753,...,0.626799,-0.090546,0.241027,0.074701,0.174568,0.020676,0.337071,-0.140033,-0.096334,-0.118992
3,-0.29027,-0.367704,-0.098322,-0.260665,-0.219107,0.580961,0.344901,-0.437576,0.536663,0.268927,...,-0.095163,-0.410139,0.019402,-0.42746,-0.244651,-0.108574,-0.184841,0.215328,0.509272,-0.233955
4,-0.974812,-0.194855,-0.282236,-0.13011,-0.262638,0.086012,0.493479,0.023571,0.298599,-0.553502,...,-0.117912,0.397806,-0.207,0.214214,0.226637,0.072036,-0.015147,-0.174736,-0.434278,-0.35991
5,0.113237,0.55573,0.548589,0.174545,-0.260274,0.224632,0.141135,-0.089211,0.19357,0.367066,...,0.089429,0.441272,-0.011516,0.068045,0.249512,-0.116852,-0.194419,-0.100981,0.220634,-0.286557
6,-0.594578,-0.183821,-0.215167,-0.020312,-0.002152,-0.437948,-0.354487,-0.670162,-0.186722,-0.515711,...,0.368161,0.062455,0.629697,0.035124,0.315056,0.411734,0.225164,-0.225102,-0.269512,0.091287
7,-0.337723,0.164741,0.558329,-0.395525,-0.115711,-0.600306,0.533303,0.61699,0.239532,0.080527,...,-0.027793,0.456749,-0.534784,-0.270104,-0.726306,-0.281948,-0.162949,-0.378135,-0.54559,0.16418
8,0.018777,0.297037,-0.257076,-0.103755,0.113057,-0.832684,0.211413,0.300517,-0.287113,-0.027294,...,0.044454,0.019713,0.008614,-0.485322,-0.236123,-0.273027,0.319627,0.505591,-0.3441,0.02436
9,-0.41724,-0.382403,0.03717,-0.246039,0.205087,-0.405075,0.334612,0.040824,-0.01217,-0.080034,...,0.037547,-0.284791,-0.365411,-0.268734,0.321538,0.267451,0.438465,-0.056473,-0.139085,0.124318


In [27]:
ent_Y = ent['Unnamed: 0']
ent_X = ent[feature_cols]
ent_X
# ent_Y.unique()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.280981,-0.049617,-0.003215,-0.483546,-0.123656,0.564292,0.432948,0.215064,-0.155113,-0.140990,...,0.127948,-0.174766,-0.222745,-0.103216,-0.015835,-0.207434,-0.335994,-0.205721,0.166335,-0.027663
1,0.259497,-0.124253,0.516882,-0.507972,0.049873,-0.423409,0.301144,0.041155,-0.204702,0.148390,...,-0.203218,0.163124,-0.267435,-0.214824,0.091249,-0.208819,-0.376532,-0.185556,0.006606,0.095279
2,0.030401,-0.380870,0.387651,-0.021696,0.417615,0.254370,0.214374,0.199189,-0.328048,-0.067366,...,-0.338145,-0.275059,0.547209,-0.196628,-0.119073,-0.189195,-0.154224,-0.347735,-0.346463,-0.072465
3,0.162839,0.023307,-0.004272,0.258616,0.182640,0.296118,0.342320,0.231499,-0.152109,0.106818,...,-0.249842,0.077031,-0.241548,-0.195538,-0.170217,-0.172525,-0.368295,-0.326116,-0.198746,0.095087
4,0.280026,0.329111,0.351390,-0.206114,0.151328,0.318364,0.217821,-0.110135,0.014867,0.063398,...,-0.103858,-0.237071,-0.178284,-0.067466,-0.203515,-0.179597,-0.097360,-0.358156,0.012783,0.263368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,0.224218,-0.254946,0.285251,-0.331004,0.101589,0.181198,0.022742,0.090943,-0.090416,0.217867,...,-0.327105,-0.339009,-0.265576,-0.364906,-0.266474,-0.089858,-0.124078,-0.275570,-0.084012,0.213622
137,-0.008934,-0.033354,-0.210881,-0.478378,0.643270,0.283869,0.146306,0.223727,-0.309897,0.144773,...,0.358221,0.032059,0.303564,-0.284214,0.098552,-0.455931,-0.310691,-0.460172,-0.052459,0.268468
138,0.331385,0.029724,0.063662,-0.267404,0.271960,0.220780,-0.054474,-0.063249,-0.174909,0.776858,...,-0.367935,-0.264612,-0.365145,-0.430507,-0.126362,0.058341,-0.009460,-0.311422,-0.059871,0.167609
139,0.308189,0.348414,0.259391,-0.259898,0.351049,0.225959,0.089048,-0.035626,-0.113165,-0.272907,...,-0.378605,-0.381572,-0.243346,-0.208718,0.033530,-0.512788,-0.173183,-0.147759,-0.228199,0.342012


In [28]:
# Create Dataframe of Expressions, individuals and positive and negative examples
columns = ['Expression', 'Individual', 'Example Type']
df = pd.DataFrame(columns=columns)

In [29]:
# Read, a portion of the dataset

# with open('Data.json', 'r') as f:
#     data = json.load(f)
# data
# # filepath = '/home/student117/Documents/aims/project/NCESData/animals/training_data/Data.json'
# # data = json.loads(filepath)

# # df = pd.read_json(filepath)
# # df.head()
# # for expr, val in data.items():
# #     print(expr, val)
#     # for exam in expr:
#     #     print(exam)
# for expr in data:
#     # print(data[expr])
#     i = 0
#     for exam_type in data[expr]:
#         # print(exam_type)
#         i+=1
#         if i == 5:
#             break
#         j = 0
#         for example in data[expr][exam_type]:
#             j+=1
#             if j == 3:
#                 break
#             # print(example)
#             # if exam_type == "positive examples":
#                 # print(data[expr][exam_type])
#             k = 0
#             for ind in data[expr][exam_type]:
#                 k+=1
#                 if k == 5:
#                     break
#                     # 'Expression', 'individual', 'Example Type'
#                 row = {'Expression': expr, 'Example Type': exam_type, 'Individual': ind}
#                 # print(row)
#                 df = df._append(row, ignore_index=True)
#                 # print(row)



In [30]:
# Read whole dataset, takes longer than reading portion of dataset

# for expr in data:
#     # print(data[expr])
#     for exam_type in data[expr]:

#         for example in data[expr][exam_type]:

#             for ind in data[expr][exam_type]:
#                 row = {'Expression': expr, 'Example Type': exam_type, 'Individual': ind}
#                 # print(row)
#                 df = df._append(row, ignore_index=True)
#                 # print(row)


In [10]:
# Save data for future use

# df_all.to_csv('data_all.csv', index=False)
# df_all.shape
# df.shape
df = pd.read_csv('data_all.csv') # uncomment if data already saved
df.head()

Unnamed: 0,Expression,Individual,Example Type
0,Cat,animals#cat01,positive examples
1,Cat,animals#boy01,negative examples
2,Cat,animals#lizard01,negative examples
3,Cat,animals#snake01,negative examples
4,Cat,animals#trout01,negative examples


In [11]:
print(df.head(20))
print(df['Example Type'].unique())
print(df.shape)
df['Example Type'].unique()

   Expression          Individual       Example Type
0         Cat       animals#cat01  positive examples
1         Cat       animals#boy01  negative examples
2         Cat    animals#lizard01  negative examples
3         Cat     animals#snake01  negative examples
4         Cat     animals#trout01  negative examples
5         Cat       animals#bat01  negative examples
6         Cat     animals#eagle01  negative examples
7         Cat      animals#trex01  negative examples
8         Cat   animals#ostrich01  negative examples
9         Cat       animals#eel01  negative examples
10        Cat       animals#dog01  negative examples
11        Cat    animals#dragon01  negative examples
12        Cat      animals#girl01  negative examples
13        Cat    animals#turtle01  negative examples
14        Cat     animals#shark01  negative examples
15        Cat   animals#dolphin01  negative examples
16        Cat   animals#herring01  negative examples
17        Cat     animals#croco01  negative ex

array(['positive examples', 'negative examples'], dtype=object)

In [12]:
# Encode positive and negative examples

df['Example Encoding'] = df['Example Type'].map({'positive examples': 1, 'negative examples': 0})
df.sample(10)
df[df['Expression'] == 'Dog ⊔ Platypus']

Unnamed: 0,Expression,Individual,Example Type,Example Encoding
73050,Dog ⊔ Platypus,animals#platypus01,positive examples,1
73051,Dog ⊔ Platypus,animals#dog01,positive examples,1
73052,Dog ⊔ Platypus,animals#platypus01,positive examples,1
73053,Dog ⊔ Platypus,animals#dog01,positive examples,1
73054,Dog ⊔ Platypus,animals#eagle01,negative examples,0
...,...,...,...,...
73373,Dog ⊔ Platypus,animals#bat01,negative examples,0
73374,Dog ⊔ Platypus,animals#girl01,negative examples,0
73375,Dog ⊔ Platypus,animals#dragon01,negative examples,0
73376,Dog ⊔ Platypus,animals#boy01,negative examples,0


In [13]:
# Extract atomic classes names from dataset

constructors = ['¬', '⊔', '⊓']
# i = 0
# atomic_classes = []
# for expr in data:
#   atomic_classes.append(expr)
#   # print(expr)
#   i+=1
#   if i == 24:
#     break
atomic_classes = ['Homeothermic', 'HasEggs', 'HasMilk', 'HasGills', 'Eel', 'Ostrich', 'Trout', 'Crocodile', 'Girl', 'Lizard', 'Shark', 'Bat', 'Cat', 'Dolphin', 'Eagle', 'Dog', 'Snake', 'Herring', 'Boy', 'Turtle', 'Penguin', 'T-Rex', 'Platypus', 'Dragon']


In [14]:
# Extract atomic classes for training

atomic = df[df['Expression'].isin(atomic_classes)]
atomic

Unnamed: 0,Expression,Individual,Example Type,Example Encoding
0,Cat,animals#cat01,positive examples,1
1,Cat,animals#boy01,negative examples,0
2,Cat,animals#lizard01,negative examples,0
3,Cat,animals#snake01,negative examples,0
4,Cat,animals#trout01,negative examples,0
...,...,...,...,...
8183,Dog,animals#girl01,negative examples,0
8184,Dog,animals#cat01,negative examples,0
8185,Dog,animals#shark01,negative examples,0
8186,Dog,animals#croco01,negative examples,0


In [19]:
# merge classes with their embdenings

atomic_ent = pd.merge(atomic, ent, left_on='Individual', right_on='Unnamed: 0', how='inner')
# atomic_ent.columns
atomic_ent = pd.merge(atomic_ent, ent, left_on="Expression", right_on='Unnamed: 0', how='inner', suffixes=['_ind', '_cla'])
# a.head()

In [20]:
# atomic_data = atomic_ent[['Expression', 'Individual', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', 'Example Encoding']]
atomic_data = atomic_ent[['Expression', 'Individual', '0_ind', '1_ind', '2_ind', '3_ind', '4_ind', '5_ind', '6_ind', '7_ind', '8_ind', '9_ind', '10_ind', '11_ind', '12_ind', '13_ind', '14_ind', '15_ind', '16_ind', '17_ind', '18_ind', '19_ind', '20_ind', '21_ind', '22_ind', '23_ind', '24_ind', '25_ind', \
                          '26_ind', '27_ind', '28_ind', '29_ind', '30_ind', '31_ind', '32_ind', '33_ind', '34_ind', '35_ind', '36_ind', '37_ind', '38_ind', '39_ind', '0_cla', '1_cla', '2_cla', '3_cla', '4_cla', '5_cla', '6_cla', '7_cla', '8_cla', '9_cla', '10_cla', '11_cla', '12_cla', '13_cla', '14_cla', '15_cla', '16_cla', '17_cla', '18_cla', '19_cla', '20_cla', '21_cla', '22_cla', '23_cla', '24_cla', '25_cla', '26_cla', '27_cla', '28_cla', '29_cla', '30_cla', '31_cla', '32_cla', '33_cla', '34_cla', '35_cla', '36_cla', '37_cla', '38_cla', '39_cla', 'Example Encoding']]

atomic_data

Unnamed: 0,Expression,Individual,0_ind,1_ind,2_ind,3_ind,4_ind,5_ind,6_ind,7_ind,...,31_cla,32_cla,33_cla,34_cla,35_cla,36_cla,37_cla,38_cla,39_cla,Example Encoding
0,Cat,animals#cat01,0.183570,0.064570,0.119314,-0.220391,0.185930,0.059797,0.320332,0.277950,...,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329,1
1,Cat,animals#boy01,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,...,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329,0
2,Cat,animals#boy01,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,...,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329,0
3,Cat,animals#boy01,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,...,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329,0
4,Cat,animals#boy01,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,...,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8183,Dog,animals#penguin01,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,...,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478,0
8184,Dog,animals#penguin01,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,...,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478,0
8185,Dog,animals#penguin01,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,...,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478,0
8186,Dog,animals#penguin01,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,...,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478,0


In [21]:
# # Create Dictionary to encode classes
# atomic_data['Expression'].value_counts()
# category_codes = {
#     'Homeothermic': 0,
#     'HasEggs': 1,
#     'HasMilk': 2,
#     'HasGills': 3,
#     'Eel': 4,
#     'Ostrich': 5,
#     'Trout': 6,
#     'Crocodile': 7,
#     'Girl': 8,
#     'Lizard': 9,
#     'Shark': 10,
#     'Bat': 11,
#     'Cat': 12,
#     'Dolphin': 13,
#     'Eagle': 14,
#     'Dog': 15,
#     'Snake': 16,
#     'Herring': 17,
#     'Boy': 18,
#     'Turtle': 19,
#     'Penguin': 20,
#     'T-Rex': 21,
#     'Platypus': 22,
#     'Dragon':23
# }

In [22]:
# # Encode Expressions with hierachy
# atomic_data['Expression_Encoded'] = atomic_data['Expression']

# atomic_data = atomic_data.replace({'Expression_Encoded': category_codes})
# atomic_data.head()

In [23]:
# labels_codes = atomic_data[['Expression', 'Expression_Encoded']].drop_duplicates().sort_values('Expression')
# print(labels_codes)

In [25]:
# X = adc[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', 'Homeothermic', 'HasEggs', 'HasMilk', 'HasGills', 'Eel', 'Ostrich', 'Trout', 'Crocodile', 'Girl', 'Lizard', 'Shark', 'Bat', 'Cat', 'Dolphin', 'Eagle', 'Dog', 'Snake', 'Herring', 'Boy', 'Turtle', 'Penguin', 'T-Rex', 'Platypus', 'Dragon']]
# X = adc[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39']]
X = atomic_data[['0_ind', '1_ind', '2_ind', '3_ind', '4_ind', '5_ind', '6_ind', '7_ind', '8_ind', '9_ind', '10_ind', '11_ind', '12_ind', '13_ind', '14_ind', '15_ind', '16_ind', '17_ind', '18_ind', '19_ind', '20_ind', '21_ind', '22_ind', '23_ind', '24_ind', '25_ind', '26_ind', '27_ind', '28_ind', '29_ind', '30_ind', '31_ind', '32_ind', '33_ind', '34_ind', '35_ind', '36_ind', '37_ind', '38_ind', '39_ind', '0_cla', '1_cla', '2_cla', '3_cla', '4_cla', '5_cla', '6_cla', '7_cla', '8_cla', '9_cla', '10_cla', '11_cla', '12_cla', '13_cla', '14_cla', '15_cla', '16_cla', '17_cla', '18_cla', '19_cla', '20_cla', '21_cla', '22_cla', '23_cla', '24_cla', '25_cla', '26_cla', '27_cla', '28_cla', '29_cla', '30_cla', '31_cla', '32_cla', '33_cla', '34_cla', '35_cla', '36_cla', '37_cla', '38_cla', '39_cla']]
y = atomic_data['Example Encoding']
X#.shape
# y

Unnamed: 0,0_ind,1_ind,2_ind,3_ind,4_ind,5_ind,6_ind,7_ind,8_ind,9_ind,...,30_cla,31_cla,32_cla,33_cla,34_cla,35_cla,36_cla,37_cla,38_cla,39_cla
0,0.183570,0.064570,0.119314,-0.220391,0.185930,0.059797,0.320332,0.277950,-0.172449,-0.078276,...,0.190652,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329
1,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,-0.082587,0.207627,...,0.190652,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329
2,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,-0.082587,0.207627,...,0.190652,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329
3,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,-0.082587,0.207627,...,0.190652,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329
4,0.326895,-0.017734,0.072276,-0.148582,0.154523,0.143247,0.258985,0.121672,-0.082587,0.207627,...,0.190652,-0.317788,-0.006029,-0.285176,0.176120,-0.070169,-0.269090,-0.445585,-0.012556,0.332329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8183,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,-0.009941,-0.133394,...,-0.297430,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478
8184,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,-0.009941,-0.133394,...,-0.297430,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478
8185,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,-0.009941,-0.133394,...,-0.297430,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478
8186,0.251046,-0.165633,0.311441,-0.241266,0.215875,0.208570,0.316402,0.122711,-0.009941,-0.133394,...,-0.297430,0.169880,0.206478,-0.285225,-0.196411,-0.136789,-0.248808,-0.360837,-0.111585,0.350478


In [26]:
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

In [51]:
# Hyperparameters
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

X_train_tensor = torch.reshape(X_train_tensor, (X_train_tensor.shape[0], 1, X_train_tensor.shape[1]))
X_test_tensor = torch.reshape(X_test_tensor, (X_test_tensor.shape[0], 1, X_test_tensor.shape[1]))
y_train_tensor = torch.reshape(y_train_tensor, (y_train_tensor.shape[0], 1, y_train_tensor.shape[1]))
y_test_tensor = torch.reshape(y_test_tensor, (y_test_tensor.shape[0], 1, y_test_tensor.shape[1]))


# Define hyperparameters
input_size = X_train.shape[1]
hidden_size = 60
output_size = 2 # num_classes = 2
num_layers = 2
learning_rate = 0.1
num_epochs = 100
batch_size = 50


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
  y_train_tensor = torch.tensor(y_train, dtype=torch.long)
  y_test_tensor = torch.tensor(y_test, dtype=torch.long)


In [52]:
# X_train_tensor.shape
print("Training Shape", X_train_tensor.shape, y_train_tensor.shape)
print("Testing Shape", X_test_tensor.shape, y_test_tensor.shape) 

Training Shape torch.Size([6550, 1, 80]) torch.Size([6550, 1, 1])
Testing Shape torch.Size([1638, 1, 80]) torch.Size([1638, 1, 1])


In [57]:
# Define LSTM classifier model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out



In [58]:
# Instantiate the model
model = LSTMClassifier(input_size, hidden_size, num_layers, output_size, batch_size)

In [59]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [63]:
# Train the model
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, torch.flatten(y_train_tensor))
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


Epoch [1/100], Loss: 0.6987351775169373
Epoch [2/100], Loss: 0.1674080491065979
Epoch [3/100], Loss: 0.33024758100509644
Epoch [4/100], Loss: 0.18941493332386017
Epoch [5/100], Loss: 0.2637271285057068
Epoch [6/100], Loss: 0.27955102920532227
Epoch [7/100], Loss: 0.2606293857097626
Epoch [8/100], Loss: 0.22268043458461761
Epoch [9/100], Loss: 0.17884542047977448
Epoch [10/100], Loss: 0.14665432274341583
Epoch [11/100], Loss: 0.15706202387809753
Epoch [12/100], Loss: 0.18386386334896088
Epoch [13/100], Loss: 0.15664735436439514
Epoch [14/100], Loss: 0.14039823412895203
Epoch [15/100], Loss: 0.14072385430335999
Epoch [16/100], Loss: 0.14512871205806732
Epoch [17/100], Loss: 0.14768390357494354
Epoch [18/100], Loss: 0.1463327258825302
Epoch [19/100], Loss: 0.14074283838272095
Epoch [20/100], Loss: 0.131633922457695
Epoch [21/100], Loss: 0.12086201459169388
Epoch [22/100], Loss: 0.11166840046644211
Epoch [23/100], Loss: 0.10785820335149765
Epoch [24/100], Loss: 0.10647306591272354
Epoch [2