In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Load Data
file_path = "./TABLE1.csv"  # Replace with your file
data = pd.read_csv(file_path)

# Step 1: Data Preparation
# Creating target variable (Predict if Subject_Field will be supported next year)
data['Year_Next'] = data['Year'] + 1
data['Target'] = data.duplicated(subset=['Country', 'Agency_Name', 'Subject_Field', 'Year_Next'], keep=False).astype(int)

# Dropping unnecessary columns
features = ['Country', 'Agency_Name', 'Subject_Field', 'Year']
X = data[features]
y = data['Target']

# Encoding categorical variables
X_encoded = pd.get_dummies(X, columns=['Country', 'Agency_Name', 'Subject_Field'], drop_first=True)

# Step 2: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [12]:
X_train

Unnamed: 0,Year,Country_Algeria,Country_Argentina,Country_Armenia,Country_Australia,Country_Austria,Country_Azerbaijan,Country_Bangladesh,Country_Barbados,Country_Belarus,...,Subject_Field_Mathematics,Subject_Field_Medicine,Subject_Field_Multidisciplinary,Subject_Field_Neuroscience,Subject_Field_Nursing,"Subject_Field_Pharmacology, Toxicology and Pharmaceutics",Subject_Field_Physics and Astronomy,Subject_Field_Psychology,Subject_Field_Social Sciences,Subject_Field_Veterinary
38390,2019,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
72857,2022,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
83327,2022,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
40808,2020,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
79760,2022,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54886,2021,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
76820,2022,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
103694,2017,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
860,2018,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [13]:
X_test

Unnamed: 0,Year,Country_Algeria,Country_Argentina,Country_Armenia,Country_Australia,Country_Austria,Country_Azerbaijan,Country_Bangladesh,Country_Barbados,Country_Belarus,...,Subject_Field_Mathematics,Subject_Field_Medicine,Subject_Field_Multidisciplinary,Subject_Field_Neuroscience,Subject_Field_Nursing,"Subject_Field_Pharmacology, Toxicology and Pharmaceutics",Subject_Field_Physics and Astronomy,Subject_Field_Psychology,Subject_Field_Social Sciences,Subject_Field_Veterinary
66027,2021,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
36617,2019,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
65302,2021,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
41974,2020,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
50274,2020,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60868,2021,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
67540,2021,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
23944,2019,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
46410,2020,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
y_train

38390     1
72857     0
83327     1
40808     0
79760     0
         ..
54886     0
76820     0
103694    1
860       1
15795     1
Name: Target, Length: 85885, dtype: int64

In [15]:
y_test

66027    1
36617    1
65302    0
41974    1
50274    1
        ..
60868    1
67540    1
23944    1
46410    1
16196    0
Name: Target, Length: 21472, dtype: int64

In [16]:

# # Step 3: Random Forest Model
# rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
# rf_model.fit(X_train, y_train)

# # Prediction and Evaluation
# y_pred = rf_model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

# # Feature Importances
# feature_importances = pd.DataFrame({
#     'Feature': X_encoded.columns,
#     'Importance': rf_model.feature_importances_
# }).sort_values(by='Importance', ascending=False)

# # Step 4: Export Predictions for Tableau
# # predictions = pd.DataFrame({
# #     'Eid': data['Eid'],
# #     'Predicted_Support': rf_model.predict(X_encoded)
# # })
# #predictions.to_csv("/mnt/data/predictions.csv", index=False)

# # Step 5: Visualization in Python
# # Aggregate data for Line Chart
# trend_data = data.groupby(['Year', 'Country', 'Subject_Field']).size().reset_index(name='Support_Count')
# plt.figure(figsize=(12, 6))
# sns.lineplot(data=trend_data, x='Year', y='Support_Count', hue='Country')
# plt.title("Trend of Research Support by Country")
# plt.xlabel("Year")
# plt.ylabel("Support Count")
# plt.show()
