In [243]:
# Libraries for handling the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# For preparing the data to use in LSTM  
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential,load_model
from keras.layers import LSTM, Dense, Dropout
from scikeras.wrappers import KerasRegressor

import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly import offline

import joblib

### Loading datasets

In [206]:
# Loading datasets 
# Note: csv files can be found on pCloud
Science1900_2023 = pd.read_csv('./data/Science1900_2023.csv', index_col=0)  
topic_proba_data_Science1900_2023 = pd.read_csv('./data/topic_proba_data_Science1900_2023.csv',index_col=0)  

topified_vectorized_Science1900_2023 = pd.read_csv('./data/topified_vectorized_Science1900_2023.csv', index_col=0)  
topified_vectorized_Science1900_2023['fieldsOfStudy']= Science1900_2023['fieldsOfStudy']
topified_vectorized_Science1900_2023['s2FieldsOfStudy']= Science1900_2023['s2FieldsOfStudy']
topified_vectorized_Science1900_2023 = topified_vectorized_Science1900_2023.drop(['title','abstract','paperId','externalIds', 'publicationVenue', 'publicationTypes'], axis=1)
topified_vectorized_Science1900_2023['publicationDate'] = pd.to_datetime(topified_vectorized_Science1900_2023['publicationDate']) 
topified_vectorized_Science1900_2023['year'] = pd.to_datetime(topified_vectorized_Science1900_2023['year'], format ='%Y') 

topic_labels_df = pd.read_csv('./data/topic_labels.csv')

### Extracting dataframes

In [229]:
# Extracting/Creating topics into a list to use it in the model
train_percentage = 0.8  # 80% for training, 20% for testing

number_of_topics_to_analyse = len(topic_labels_df) # For debug purposes set to three

list_of_dataframes_for_all_topics = []
list_of_scalers_for_citation_count = []
list_of_scalers_for_cumulative_citation_count = []
list_of_train_data_for_topics = []
list_of_test_data_for_topics = []

grouped_by_all_topics = topified_vectorized_Science1900_2023.groupby('topic_code')

for i in range(number_of_topics_to_analyse):

    list_of_scalers_for_citation_count.append(MinMaxScaler(feature_range=(0, 1)))
    list_of_scalers_for_cumulative_citation_count.append(MinMaxScaler(feature_range=(0, 1)))

    # Getting the first topic from the group
    temp_df = grouped_by_all_topics.get_group(i)
    temp_df['publicationDate'].fillna(temp_df['year'].apply(lambda x:x), inplace=True)
    temp_df.drop('year', axis=1,inplace=True)
    temp_df= temp_df.groupby('publicationDate').sum('citationCount')
    temp_df['citationCount_scaled'] = list_of_scalers_for_citation_count[i].fit_transform(temp_df[['citationCount']])
    temp_df['cumulative_citationCount'] = temp_df['citationCount'].cumsum()
    temp_df['cumulative_citationCount_scaled'] = list_of_scalers_for_cumulative_citation_count[i].fit_transform(temp_df[['cumulative_citationCount']])
    list_of_dataframes_for_all_topics.append(temp_df)

    # train / test split
    # list_of_train_data_for_topics.append(temp_df.query('publicationDate < "2015-01-01"').reset_index(drop = False))
    # list_of_test_data_for_topics.append(temp_df.query('publicationDate >= "2015-01-01"').reset_index(drop = False))
    # Sum clusters doesnt have enough test data, instead of date, precentage is used
    
    # # Calculate the split index based on the percentage
    temp_df = temp_df.reset_index(drop = False)
    split_index = int(len(temp_df) * train_percentage)
    
    # # # Split the data into training and testing based on the split index
    train_data = temp_df.iloc[:split_index].reset_index(drop=True)
    test_data = temp_df.iloc[split_index:].reset_index(drop=True)
    
    list_of_train_data_for_topics.append(train_data)
    list_of_test_data_for_topics.append(test_data)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

### Citation Count Prediction

In [132]:
# Citation Count Prediction

# Variables:
look_back = 20
sampling_rate = 1
stride = 1
batch_size = 10
neurons = 4
epochs = 20

list_of_citation_count_predictions_scaled = []
list_of_citation_count_predictions = []
list_of_cumulative_citation_count_predictions_scaled = []
list_of_cumulative_citation_count_predictions = []

# for i in range(len(topic_labels_df)):
for i in range(number_of_topics_to_analyse):
    print('Training topic number: ',i)
    # Training the LSTM - time series for a single topic
    n_features = 1

    X_train = list_of_train_data_for_topics[i].publicationDate
    y_train = list_of_train_data_for_topics[i]['citationCount_scaled']

    X_test = list_of_test_data_for_topics[i].publicationDate
    y_test = list_of_test_data_for_topics[i]['citationCount_scaled']

    train_series = y_train.values.reshape((len(y_train), n_features))
    test_series  = y_test.values.reshape((len(y_test), n_features))

    train_generator = TimeseriesGenerator(train_series, train_series,
                                        length        = look_back, 
                                        sampling_rate = sampling_rate,
                                        stride        = stride,
                                        batch_size    = batch_size)

    test_generator = TimeseriesGenerator(test_series, test_series,
                                        length        = look_back, 
                                        sampling_rate = sampling_rate,
                                        stride        = stride,
                                        batch_size    = batch_size)

    model = Sequential()
    model.add(LSTM(neurons, return_sequences=True, input_shape=(look_back, n_features)))
    model.add(Dropout(0.2))
    model.add(LSTM(neurons,return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(neurons))
    model.add(Dropout(0.2))
    model.add(Dense(1))  # Modify the output layer as needed
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(train_generator, epochs=epochs,verbose=0)
    list_of_citation_count_predictions_scaled.append(pd.DataFrame(model.predict(test_generator), columns=['citationCount']))
    list_of_citation_count_predictions.append(pd.DataFrame(list_of_scalers_for_citation_count[i].inverse_transform(list_of_citation_count_predictions_scaled[i]), columns=['citationCount']))

Training topic number:  0
Training topic number:  1
Training topic number:  2
Training topic number:  3


In [133]:
# Plots for Citation Count prediction for topics
# plotly colors: https://plotly.com/python/discrete-color/
# Calculate the number of rows and columns
num_rows = (number_of_topics_to_analyse + 1) // 2  # Ensure at least 1 row
num_cols = 2

# Create a subplot
fig = make_subplots(rows=num_rows, 
                    cols=num_cols, 
                    subplot_titles=[topic_labels_df['topic_labels'][i].replace('_', ' ').replace(' ', ': ', 1)[:31] for i in range(1, number_of_topics_to_analyse + 1)],
                    )

# Create two separate legends for two specific traces
legend_trace_1 = go.Scatter(x=[None], y=[None], mode='lines', name='test data', line=dict(color='#00CC96'))
legend_trace_2 = go.Scatter(x=[None], y=[None], mode='lines', name='prediction', line=dict(color='#EF553C'))


for i in range(number_of_topics_to_analyse):

    trace_1 = go.Scatter(x=list_of_test_data_for_topics[i].publicationDate, 
                    y=list_of_test_data_for_topics[i]['citationCount'], mode='lines', 
                    name='test data',
                    line=dict(color='#00CC96'),
                    showlegend=False,  # Set showlegend to True
                    )


    trace_2 = go.Scatter(x=list_of_test_data_for_topics[i].publicationDate[20:], 
                        y=list_of_citation_count_predictions[i]['citationCount'], mode='lines', 
                        name='prediction',
                        line=dict(color='#EF553C'),
                        showlegend=False,  # Set showlegend to True
                        )
    
    row = i // num_cols + 1
    col = i % num_cols + 1
    
    # Add both line plots to the subplot
    fig.add_trace(trace_1, row=row, col=col)
    fig.add_trace(trace_2, row=row, col=col)  

# Update layout
fig.update_layout(   
                  title_text='Citation count predictions',
                  paper_bgcolor='black',  # Set the background color to black
                  plot_bgcolor='black',  # Set the plot area background color to black
                  font=dict(color="#f2f2f2"),  # Set font color to white)
                  legend=dict(orientation='v', y=1.3, x=0.85),  # Position legend at the top right corner
)
# Add the two separate legend entries at the top right corner
fig.add_trace(legend_trace_1, row=1, col=2)
fig.add_trace(legend_trace_2, row=1, col=2)

# Set the font size for subplot titles
fig.update_annotations(font_size=14)
# Show the plot
fig.show()

### Cumulative Citation Count Prediction

In [230]:
# Cumulative Citation Count Prediction

# Variables:
epochs = 20
look_back = 20
neurons = 4
batch_size = 10
sampling_rate = 1
stride = 1
dropout =0

list_of_cumulative_citation_count_predictions_scaled = []
list_of_cumulative_citation_count_predictions = []

list_of_training_losses = []  # To store training losses
list_of_training_accuracies = []  # To store training accuracies

# for i in range(len(topic_labels_df)):
for i in range(number_of_topics_to_analyse):
    print('Training topic number: ',i)
    # Training the LSTM - time series for a single topic
    n_features = 1

    X_train = list_of_train_data_for_topics[i].publicationDate
    y_train = list_of_train_data_for_topics[i]['cumulative_citationCount_scaled']

    X_test = list_of_test_data_for_topics[i].publicationDate
    y_test = list_of_test_data_for_topics[i]['cumulative_citationCount_scaled']

    train_series = y_train.values.reshape((len(y_train), n_features))
    test_series  = y_test.values.reshape((len(y_test), n_features))


    # required if splitting results in a data set which is smaller than look back
    if len(list_of_test_data_for_topics[i]['cumulative_citationCount_scaled']) < look_back:
        look_back=int(len(list_of_test_data_for_topics[i]['cumulative_citationCount_scaled'])/2)

    train_generator = TimeseriesGenerator(train_series, train_series,
                                        length        = look_back, 
                                        sampling_rate = sampling_rate,
                                        stride        = stride,
                                        batch_size    = batch_size)

    test_generator = TimeseriesGenerator(test_series, test_series,
                                        length        = look_back, 
                                        sampling_rate = sampling_rate,
                                        stride        = stride,
                                        batch_size    = batch_size)

    model = Sequential()
    model.add(LSTM(neurons, return_sequences = True, input_shape=(look_back, n_features)))
    model.add(Dropout(dropout))
    model.add(LSTM(neurons, return_sequences = True))
    model.add(Dropout(dropout))
    model.add(LSTM(neurons))
    model.add(Dropout(dropout))
    model.add(Dense(1))  # Modify the output layer as needed
    model.compile(optimizer='adam', loss='mean_squared_error')
    history = model.fit(train_generator, epochs=epochs,verbose=0)
    # Get the training loss and accuracy
    training_loss = history.history['loss']
    training_accuracy = 1 - np.array(training_loss)
    
    # list_of_training_losses.append(training_loss)
    # list_of_training_accuracies.append(training_accuracy)
    # list_of_cumulative_citation_count_predictions_scaled.append(pd.DataFrame(model.predict(test_generator), columns=['cumulative_citationCount']))
    # list_of_cumulative_citation_count_predictions.append(pd.DataFrame(list_of_scalers_for_cumulative_citation_count[i].inverse_transform(list_of_cumulative_citation_count_predictions_scaled[i]), columns=['cumulative_citationCount']))
    cumulative_citation_count_predictions_scaled = pd.DataFrame(model.predict(test_generator), columns=['cumulative_citationCount'])
    cumulative_citation_count_predictions = pd.DataFrame(list_of_scalers_for_cumulative_citation_count[i].inverse_transform(cumulative_citation_count_predictions_scaled), columns=['cumulative_citationCount'])
    joblib.dump(cumulative_citation_count_predictions_scaled, f"./data/lstm/predictions/Cumulative_Citation_Count_Prediction_Scaled_Topic_{i}.csv")
    joblib.dump(cumulative_citation_count_predictions, f"./data/lstm/predictions/Cumulative_Citation_Count_Prediction_Topic_{i}.csv")
    joblib.dump(training_loss, f"./data/lstm/training_evaluation/training_loss_{i}.csv")
    joblib.dump(training_accuracy, f"./data/lstm/training_evaluation/training_accuracy_{i}.csv")
    model.save(f"./data/lstm/models/Cumulative_Citation_Count_Model_Topic_{i}.model")


Training topic number:  0
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_0.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_0.model/assets


Training topic number:  1
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_1.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_1.model/assets


Training topic number:  2
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_2.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_2.model/assets


Training topic number:  3
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_3.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_3.model/assets


Training topic number:  4
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_4.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_4.model/assets


Training topic number:  5
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_5.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_5.model/assets


Training topic number:  6
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_6.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_6.model/assets


Training topic number:  7
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_7.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_7.model/assets


Training topic number:  8
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_8.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_8.model/assets


Training topic number:  9
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_9.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_9.model/assets


Training topic number:  10
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_10.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_10.model/assets


Training topic number:  11
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_11.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_11.model/assets


Training topic number:  12
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_12.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_12.model/assets


Training topic number:  13
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_13.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_13.model/assets


Training topic number:  14
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_14.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_14.model/assets


Training topic number:  15
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_15.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_15.model/assets


Training topic number:  16
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_16.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_16.model/assets


Training topic number:  17
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_17.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_17.model/assets


Training topic number:  18
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_18.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_18.model/assets


Training topic number:  19
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_19.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_19.model/assets


Training topic number:  20
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_20.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_20.model/assets


Training topic number:  21
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_21.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_21.model/assets


Training topic number:  22
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_22.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_22.model/assets


Training topic number:  23
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_23.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_23.model/assets


Training topic number:  24
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_24.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_24.model/assets


Training topic number:  25
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_25.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_25.model/assets


Training topic number:  26
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_26.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_26.model/assets


Training topic number:  27
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_27.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_27.model/assets


Training topic number:  28
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_28.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_28.model/assets


Training topic number:  29
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_29.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_29.model/assets


Training topic number:  30
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_30.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_30.model/assets


Training topic number:  31
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_31.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_31.model/assets


Training topic number:  32
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_32.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_32.model/assets


Training topic number:  33
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_33.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_33.model/assets


Training topic number:  34
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_34.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_34.model/assets


Training topic number:  35
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_35.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_35.model/assets


Training topic number:  36
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_36.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_36.model/assets


Training topic number:  37
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_37.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_37.model/assets


Training topic number:  38
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_38.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_38.model/assets


Training topic number:  39
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_39.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_39.model/assets


Training topic number:  40
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_40.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_40.model/assets


Training topic number:  41
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_41.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_41.model/assets


Training topic number:  42
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_42.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_42.model/assets


Training topic number:  43
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_43.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_43.model/assets


Training topic number:  44
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_44.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_44.model/assets


Training topic number:  45
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_45.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_45.model/assets


Training topic number:  46
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_46.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_46.model/assets


Training topic number:  47
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_47.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_47.model/assets


Training topic number:  48
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_48.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_48.model/assets


Training topic number:  49
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_49.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_49.model/assets


Training topic number:  50
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_50.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_50.model/assets


Training topic number:  51
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_51.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_51.model/assets


Training topic number:  52
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_52.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_52.model/assets


Training topic number:  53
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_53.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_53.model/assets


Training topic number:  54
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_54.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_54.model/assets


Training topic number:  55
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_55.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_55.model/assets


Training topic number:  56
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_56.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_56.model/assets


Training topic number:  57
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_57.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_57.model/assets


Training topic number:  58
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_58.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_58.model/assets


Training topic number:  59
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_59.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_59.model/assets


Training topic number:  60
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_60.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_60.model/assets


Training topic number:  61
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_61.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_61.model/assets


Training topic number:  62
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_62.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_62.model/assets


Training topic number:  63
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_63.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_63.model/assets


Training topic number:  64
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_64.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_64.model/assets


Training topic number:  65
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_65.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_65.model/assets


Training topic number:  66
INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_66.model/assets


INFO:tensorflow:Assets written to: ./data/lstm/models/Cumulative_Citation_Count_Model_Topic_66.model/assets


Training topic number:  67


ValueError: `start_index+length=20 > end_index=17` is disallowed, as no part of the sequence would be left to be used as current step.

In [231]:
# Loading saved data and models
number_of_topics_to_plot = 66 #number_of_topics_to_analyse
list_of_cumulative_citation_count_predictions_scaled = []
list_of_cumulative_citation_count_predictions = []
list_of_models = []
list_of_training_losses= []
list_of_training_accuracies = []

for i in range(number_of_topics_to_plot):
    # Load the scaled predictions
    predictions_scaled = joblib.load(f"./data/lstm/predictions/Cumulative_Citation_Count_Prediction_Scaled_Topic_{i}.csv")
    list_of_cumulative_citation_count_predictions_scaled.append(predictions_scaled)

    # Load the test data
    predictions = joblib.load(f"./data/lstm/predictions/Cumulative_Citation_Count_Prediction_Topic_{i}.csv")
    list_of_cumulative_citation_count_predictions.append(predictions)

    # Load training losses data
    training_loss = joblib.load(f"./data/lstm/training_evaluation/training_loss_{i}.csv")
    list_of_training_losses.append(training_loss)
    
    # Load training losses data
    training_accuracy = joblib.load(f"./data/lstm/training_evaluation/training_accuracy_{i}.csv")
    list_of_training_accuracies.append(training_accuracy)
    
    # Load the trained model
    model = load_model(f"./data/lstm/models/Cumulative_Citation_Count_Model_Topic_{i}.model")
    list_of_models.append(model)

In [260]:
# Plots for Cumulative Citation Count prediction for topics
# Plots for Citation Count prediction for topics
# Calculate the number of rows and columns
figure_height = 2000
number_of_topics_to_plot=20
num_rows = (number_of_topics_to_plot + 1) // 2  # Ensure at least 1 row
num_cols = 2

# Create a subplot
fig = make_subplots(rows=num_rows, 
                    cols=num_cols, 
                    row_heights = [1] * num_rows,
                    column_widths= [1] * num_cols,
                    subplot_titles=[topic_labels_df['topic_labels'][i].replace('_', ' ').replace(' ', ': ', 1)[:31] for i in range(1, number_of_topics_to_plot + 1)],
                    )

# Create two separate legends for two specific traces
legend_trace_1 = go.Scatter(x=[None], y=[None], mode='lines', name='train data', line=dict(color='#636EFA'))
legend_trace_2 = go.Scatter(x=[None], y=[None], mode='lines', name='test data', line=dict(color='#00CC96'))
legend_trace_3 = go.Scatter(x=[None], y=[None], mode='lines', name='prediction', line=dict(color='#EF553C'))


for i in range(number_of_topics_to_plot):

    # train data
    trace_1 = go.Scatter(x=list_of_train_data_for_topics[i].publicationDate, 
                    y=list_of_train_data_for_topics[i]['cumulative_citationCount'], mode='lines', 
                    name='train data',
                    line=dict(color='#636EFA'),
                    showlegend=False,  # Set showlegend to True
                    )

    # test data
    trace_2 = go.Scatter(x=list_of_test_data_for_topics[i].publicationDate, 
                    y=list_of_test_data_for_topics[i]['cumulative_citationCount'], mode='lines', 
                    name='test data',
                    line=dict(color='#00CC96'),
                    showlegend=False,  # Set showlegend to True
                    )
    # prediction
    trace_3 = go.Scatter(x=list_of_test_data_for_topics[i].publicationDate[look_back:], 
                        y=list_of_cumulative_citation_count_predictions[i]['cumulative_citationCount'], mode='lines', 
                        name='prediction',
                        line=dict(color='#EF553C'),
                        showlegend=False,  # Set showlegend to True
                        )
    
    row = i // num_cols + 1
    col = i % num_cols + 1
    
    # Add both line plots to the subplot
    fig.add_trace(trace_1, row=row, col=col)
    fig.add_trace(trace_2, row=row, col=col)  
    fig.add_trace(trace_3, row=row, col=col)  

# Update layout
fig.update_layout(   
                  title_text='Cumulative citation count predictions',
                  paper_bgcolor='black',  # Set the background color to black
                  plot_bgcolor='black',  # Set the plot area background color to black
                  font=dict(color="#f2f2f2"),  # Set font color to white)
                  legend=dict(orientation='v', y=1.3, x=0.9),  # Position legend at the top right corner
                  height=figure_height
                  
)
# Add the two separate legend entries at the top right corner
fig.add_trace(legend_trace_1, row=1, col=2)
fig.add_trace(legend_trace_2, row=1, col=2)
fig.add_trace(legend_trace_3, row=1, col=2)

# Set the font size for subplot titles
fig.update_annotations(font_size=14)
# Show the plot
fig.show()

offline.plot(fig, filename='./data/lstm/plots/Cumulative_citation_count_predictions.html', auto_open=False)


'./data/lstm/plots/Cumulative_citation_count_predictions.html'

In [263]:
# Plots for Cumulative Citation Count prediction for topics
# Plots for Citation Count prediction for topics
# Calculate the number of rows and columns
num_rows = (number_of_topics_to_plot + 1) // 2  # Ensure at least 1 row
num_cols = 2

# Create a subplot
fig = make_subplots(rows=num_rows, 
                    cols=num_cols, 
                    subplot_titles=[topic_labels_df['topic_labels'][i].replace('_', ' ').replace(' ', ': ', 1)[:31] for i in range(1, number_of_topics_to_plot + 1)],
                    )

# Create two separate legends for two specific traces
legend_trace_1 = go.Scatter(x=[None], y=[None], mode='lines', name='training loss', line=dict(color='#EF553C'))
legend_trace_2 = go.Scatter(x=[None], y=[None], mode='lines', name='training accuracy', line=dict(color='#00CC96'))

for i in range(number_of_topics_to_plot):

    # Training loss (new trace)
    trace_1 = go.Scatter(
        x=list(range(epochs)),  # X-axis for training loss (e.g., epochs)
        y=list_of_training_losses[i],  # Y-axis for training loss
        mode='lines',
        name='training loss',
        line=dict(color='#EF553C'),
        showlegend=False,  # Set showlegend to True
        
    )
    # Training Accuracy (new trace)
    trace_2 = go.Scatter(
        x=list(range(epochs)),  # X-axis for training loss (e.g., epochs)
        y=list_of_training_accuracies[i],  # Y-axis for training loss
        mode='lines',
        name='training accuracy',
        line=dict(color='#00CC96'),
        showlegend=False,  # Set showlegend to True
    )
    
    row = i // num_cols + 1
    col = i % num_cols + 1
    
    # Add both line plots to the subplot
    # fig.add_trace(trace_1, row=row, col=col)
    fig.add_trace(trace_2, row=row, col=col)

# Update layout
fig.update_layout(   
                  title_text='Training Evaluation',
                  paper_bgcolor='black',  # Set the background color to black
                  plot_bgcolor='black',  # Set the plot area background color to black
                  font=dict(color="#f2f2f2"),  # Set font color to white)
                  legend=dict(orientation='v', y=1.3, x=0.9),  # Position legend at the top right corner
                  height = figure_height
)
# Add the two separate legend entries at the top right corner
fig.add_trace(legend_trace_1, row=1, col=2)

# Set the font size for subplot titles
fig.update_annotations(font_size=14)
# Show the plot
fig.show()

In [184]:
list_of_training_accuracies[0][0]

0.07896538078784943

In [185]:
list_of_training_losses[0][0]

0.07896538078784943