In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

# Load the dataset (assuming you've already loaded it)
df = pd.read_csv('clean_topified_vectorized_Science1900_2023.csv')  

# Step 1: Data Preparation
# Clean the data, handle missing values, convert categorical variables if needed.
print("\nStep 2: Data Cleaning")
# Check for missing values
print("Missing values:\n", df.isnull().sum())
# Remove duplicates
df.drop_duplicates(inplace=True)

df.drop(['Unnamed: 0','paperId','externalIds'], axis=1)



Step 2: Data Cleaning
Missing values:
 Unnamed: 0                  0
paperId                     0
externalIds                 0
publicationVenue            0
title                       0
abstract                    0
year                        0
referenceCount              0
citationCount               0
influentialCitationCount    0
fieldsOfStudy               0
s2FieldsOfStudy             0
publicationTypes            0
publicationDate             0
title_abstract              0
topic_code                  0
topic_list                  0
x_vector                    0
y_vector                    0
z_vector                    0
title_word_count            0
title_length                0
abstract_word_count         0
abstract_length             0
title_tokens                0
abstract_tokens             0
dtype: int64


Unnamed: 0,publicationVenue,title,abstract,year,referenceCount,citationCount,influentialCitationCount,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,...,topic_list,x_vector,y_vector,z_vector,title_word_count,title_length,abstract_word_count,abstract_length,title_tokens,abstract_tokens
0,"{'id': 'a3a6c306-5d4a-48fd-9054-2cd9277bf956',...",Acoustical characteristics of tinnitus. An ana...,The recognition of tinnitus aurium as a proble...,1962,5,69,1,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,20_hearing_ear_auditory_hearing loss,2.364143,5.038495,3.597319,8,6,159,150,"['acoustical', 'characteristics', 'tinnitus', ...","['recognition', 'tinnitus', 'aurium', 'problem..."
1,"{'id': 'bade12e5-f20e-4b51-83a6-534e3705569a',...",Reproductive steroids in the bovine. VI. Chang...,Summary Urine and blood samples were collected...,1971,23,51,0,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,26_hormone_estrogen_lh_pituitary,2.281632,3.871610,6.215812,19,16,632,478,"['reproductive', 'steroids', 'bovine', 'vi', '...","['summary', 'urine', 'blood', 'samples', 'coll..."
2,"{'id': 'fd4c7628-c16e-4b50-8555-3ac3ad6da2d7',...","Cortical Surface-Based Analysis II: Inflation,...",The surface of the human cerebral cortex is a ...,1999,65,5734,559,"['Mathematics', 'Medicine']","[{'category': 'Mathematics', 'source': 'extern...",['JournalArticle'],...,6_cortex_motor_memory_visual,2.925745,6.533843,4.116532,14,11,107,95,"['cortical', 'analysis', 'ii', 'inflation', 'f...","['surface', 'human', 'cerebral', 'cortex', 'hi..."
3,"{'id': 'dc31f077-7737-4e33-baa3-bceeff44ec27',...",Nonoperative dilatation of coronary-artery ste...,In percutaneous transluminal coronary angiopla...,1979,11,2422,38,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,31_md_ventricular_heart_facc,0.987944,6.179577,6.120787,11,9,203,183,"['nonoperative', 'dilatation', 'stenosis', 'pe...","['percutaneous', 'transluminal', 'coronary', '..."
4,"{'id': '5e31c5d5-eaf7-4832-aa9a-4565203cb02c',...",Reconstruction of the floor of the orbit by bo...,THE ORBITAL contour may be distorted by commin...,1950,9,75,0,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,84_eyelid_orbital_lid_levator,1.011518,4.734367,3.758004,11,10,165,151,"['reconstruction', 'floor', 'orbit', 'bone', '...","['orbital', 'contour', 'may', 'distorted', 'co..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19321,"{'id': 'e0a7f739-4012-4339-ad3e-65a6f144e31b',...",Fracture and Dislocation Classification Compen...,"From the *Department of Orthopaedic Surgery, M...",2007,42,1397,44,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,43_fractures_spinal_cervical_cord,1.720458,4.986771,4.262669,5,5,270,212,"['fracture', 'dislocation', 'classification']","['department', 'orthopaedic', 'surgery', 'mcgo..."
19322,"{'id': 'dc31f077-7737-4e33-baa3-bceeff44ec27',...",A comparison of rate control and rhythm contro...,BACKGROUND\nMaintenance of sinus rhythm is the...,2002,16,1803,18,['Medicine'],"[{'category': 'Medicine', 'source': 'external'...","['Study', 'JournalArticle', 'ClinicalTrial']",...,31_md_ventricular_heart_facc,1.222447,6.261564,6.164528,16,15,303,273,"['comparison', 'rate', 'control', 'rhythm', 'c...","['background', 'maintenance', 'sinus', 'rhythm..."
19323,"{'id': '3456d2d7-a9b4-495f-a7a0-a238096ee07a',...",Type I IFNs enhance the terminal differentiati...,This study identifies type I IFNs as activatin...,1998,45,666,22,"['Medicine', 'Biology']","[{'category': 'Medicine', 'source': 'external'...",['JournalArticle'],...,21_cells_class_mhc_cell,-0.536561,3.366763,6.968517,11,10,286,243,"['type', 'ifns', 'enhance', 'terminal', 'diffe...","['identifies', 'type', 'ifns', 'activating', '..."
19324,"{'id': '2132f348-fc4d-44c0-8717-e40a728868af',...",PROTOPLASTS AND L-TYPE GROWTH OF ESCHERICHIA COLI,"A preceding article (Lederberg, 1956a) was dev...",1958,74,216,3,"['Biology', 'Medicine']","[{'category': 'Biology', 'source': 'external'}...",['JournalArticle'],...,1_coli_escherichia_escherichia coli_strains,1.577343,3.447440,9.050930,7,7,294,258,"['protoplasts', 'growth', 'escherichia', 'coli']","['preceding', 'article', 'lederberg', '1956a',..."


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (assuming you've already loaded it)
df = pd.read_csv('clean_topified_vectorized_Science1900_2023.csv')

# Step 1: Data Preparation
# Clean the data, handle missing values, convert categorical variables if needed.
#print("\nStep 2: Data Cleaning")
# Check for missing values
#print("Missing values:\n", df.isnull().sum())
# Remove duplicates
df.drop_duplicates(inplace=True)

df.drop(['Unnamed: 0','paperId','externalIds'], axis=1)
df = df[df['topic_code'] != -1]

# Split the data into training and testing sets.
X = df[['title', 'abstract', 'year']]
y = df['topic_code']

# Split the data into training and testing sets (adjust test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Feature Engineering
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# Tokenize the text and remove stopwords
stop_words = set(stopwords.words('english'))
custom_stopwords = {"two", "one", "results", "e", "study", "also", "found", "used", "p", "results", "group", "using", "n", "b", "c", "h", "r", ""}
stop_words.update(custom_stopwords)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'] + ' ' + X_train['abstract'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['title'] + ' ' + X_test['abstract'])

# Step 3: Select a Machine Learning Algorithm
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 4: Model Training
clf.fit(X_train_tfidf, y_train)

# Step 5: Model Evaluation
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Model Accuracy: 0.67
Precision: 0.65
Recall: 0.67
F1 Score: 0.63


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:

# Create an empty dictionary for mapping topic codes to topic names
topic_mapping = {}

# Iterate through the rows of the 'topics_df' DataFrame to create the mapping
for index, row in df.iterrows():
    topic_code = row['topic_code']
    topic_list = row['topic_list']
    topic_mapping[topic_code] = topic_list
# Now 'topic_mapping' contains the mapping of topic codes to topic names


In [4]:
topic_mapping

{20: '20_hearing_ear_auditory_hearing loss',
 26: '26_hormone_estrogen_lh_pituitary',
 6: '6_cortex_motor_memory_visual',
 31: '31_md_ventricular_heart_facc',
 84: '84_eyelid_orbital_lid_levator',
 1: '1_coli_escherichia_escherichia coli_strains',
 28: '28_ca2_ca2i_norepinephrine_membrane',
 14: '14_graphene_adsorption_electrode_surface',
 96: '96_cnv_choroidal_neovascularization_choroidal neovascularization',
 47: '47_cyclin_kinase_cyclin ecdk2_ecdk2',
 3: '3_political_politics_language_social',
 115: '115_acuity_visual_visual acuity_myopia',
 38: '38_melanoma_tumor_choroidal_retinoblastoma',
 68: '68_egovernment_government_services_citizens',
 35: '35_graph_algorithm_graphs_problem',
 29: '29_tumors_tumor_carcinoma_cases',
 24: '24_sarscov2_covid19_coronavirus_vaccine',
 10: '10_vitamin_supplementation_antioxidant_intake',
 0: '0_species_soil_plant_plants',
 16: '16_bone_collagen_cells_tissue',
 44: '44_corneal_endothelial_corneas_keratoplasty',
 52: '52_eselectin_endothelial_adhesio

In [5]:
# Map topic codes to topic names for predictions
predicted_topic_names = [topic_mapping[code] for code in y_pred]

# # Display the names of the predicted topics
# print("Predicted Topic Names:")
# for name in predicted_topic_names:
#     print(name)

# Display the names of the top 10 predicted topics
print("Top 10 Predicted Topic Names:")
for name in predicted_topic_names[:10]:
    print(name)

Top 10 Predicted Topic Names:
92_bfue_marrow_cfue_erythroid
29_tumors_tumor_carcinoma_cases
16_bone_collagen_cells_tissue
1_coli_escherichia_escherichia coli_strains
31_md_ventricular_heart_facc
8_microfluidic_device_cell_flow
20_hearing_ear_auditory_hearing loss
84_eyelid_orbital_lid_levator
14_graphene_adsorption_electrode_surface
57_detachment_vitreous_retinal_eyes


In [6]:
future_years = range(2023, 2034)  # Next 10 years

future_data = pd.DataFrame({'year': future_years, 'title': [''] * len(future_years), 'abstract': [''] * len(future_years)})

#Apply TF-IDF vectorization to future data
future_data_tfidf = tfidf_vectorizer.transform(future_data['title'] + ' ' + future_data['abstract'])
#Make predictions for future years
future_predictions = clf.predict(future_data_tfidf)
future_topic_names = [topic_mapping[code] for code in future_predictions]


In [7]:
future_topic_names

['7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para',
 '7_da_que_em_para']

In [8]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# from nltk.corpus import stopwords
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Load the dataset (assuming you've already loaded it)
# df = pd.read_csv('clean_topified_vectorized_Science1900_2023.csv')

# # Create an empty dictionary for mapping topic codes to topic names
# topic_mapping = {}

# # Iterate through the rows of the 'topics_df' DataFrame to create the mapping
# for index, row in df.iterrows():
#     topic_code = row['topic_code']
#     topic_list = row['topic_list']
#     topic_mapping[topic_code] = topic_list

# # Step 1: Data Preparation
# # Clean the data, handle missing values, convert categorical variables if needed.

# #print("\nStep 2: Data Cleaning")
# # Check for missing values
# #print("Missing values:\n", df.isnull().sum())

# # Remove duplicates
# df.drop_duplicates(inplace=True)

# df.drop(['Unnamed: 0','paperId','externalIds'], axis=1)
# # Drop rows where the 'topic' column contains the word 'covid'
# df = df[~df['topic_list'].str.contains('covid', case=False)]

# # Reset the index of the DataFrame after dropping rows
# df.reset_index(drop=True, inplace=True)

# # Split the data into training and testing sets.
# X = df[['title', 'abstract', 'year']]
# y = df['topic_code']

# # Split the data into training and testing sets (adjust test_size as needed)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Step 2: Feature Engineering
# tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# # Tokenize the text and remove stopwords
# stop_words = set(stopwords.words('english'))
# custom_stopwords = {"two", "one", "results", "e", "study", "also", "found", "used", "p", "results", "group", "using", "n", "b", "c", "h", "r", ""}
# stop_words.update(custom_stopwords)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'] + ' ' + X_train['abstract'])
# X_test_tfidf = tfidf_vectorizer.transform(X_test['title'] + ' ' + X_test['abstract'])

# # Step 3: Select a Machine Learning Algorithm
# clf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Step 4: Model Training
# clf.fit(X_train_tfidf, y_train)

# # Step 5: Model Evaluation
# y_pred = clf.predict(X_test_tfidf)
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')
# confusion = confusion_matrix(y_test, y_pred)

# print(f'Model Accuracy: {accuracy:.2f}')
# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')

# # Map topic codes to topic names for predictions
# predicted_topic_names = [topic_mapping[code] for code in y_pred]

# # # Display the names of the predicted topics
# # print("Predicted Topic Names:")
# # for name in predicted_topic_names:
# #     print(name)

# # Display the names of the top 10 predicted topics
# print("Top 10 Predicted Topic Names:")
# for name in predicted_topic_names[:10]:
#     print(name)


In [9]:
# # Split the data into training and testing sets.
# X = df[['title', 'abstract', 'year']]
# y = df['topic_code']  # Assuming you have a 'new_topic_label' column indicating the topic.

# # Split the data into training and testing sets (adjust test_size as needed)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Step 2: Feature Engineering
# # For text analysis, use TF-IDF vectorization to convert text data into numerical features.

# tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# # Tokenize the text and remove stopwords
# stop_words = set(stopwords.words('english'))

# # Add custom stopwords
# custom_stopwords = {"two", "one", "results", "e", "study","also","found","used","p","results","group","using","n","b","c","h","r",""}
# stop_words.update(custom_stopwords)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'] + ' ' + X_train['abstract'])
# X_test_tfidf = tfidf_vectorizer.transform(X_test['title'] + ' ' + X_test['abstract'])

# # Step 3: Select a Machine Learning Algorithm
# # Choose an appropriate machine learning algorithm (e.g., RandomForestClassifier).
# clf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Step 4: Model Training
# clf.fit(X_train_tfidf, y_train)

# # Step 5: Model Evaluation
# y_pred = clf.predict(X_test_tfidf)
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Model Accuracy: {accuracy:.2f}')

# # Step 6: Predict Future Topics
# # Create a dataset with future years and apply the model to make predictions.
# future_years = range(2023, 2034)  # Next 10 years
# future_data = pd.DataFrame({'year': future_years, 'title': [''] * len(future_years), 'abstract': [''] * len(future_years)})
# # Add other relevant features for prediction if needed.

# # Apply TF-IDF vectorization to future data
# future_data_tfidf = tfidf_vectorizer.transform(future_data['title'] + ' ' + future_data['abstract'])

# # Make predictions for future years
# future_predictions = clf.predict(future_data_tfidf)

# # The 'future_predictions' variable now contains the predicted new topics for the next 10 years.