In [21]:
import pandas as pd
test = pd.read_csv('./TABLE1_5.csv')
test['Eid']

0        2-s2.0-85053164279
1        2-s2.0-85053164279
2        2-s2.0-85054140369
3        2-s2.0-85041527766
4        2-s2.0-85053470598
                ...        
27122    2-s2.0-85015335911
27123    2-s2.0-85014089683
27124    2-s2.0-85013374519
27125    2-s2.0-84995751226
27126    2-s2.0-84995567069
Name: Eid, Length: 27127, dtype: object

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# -----------------------------------------------------------------------
# 1. Load Data
# -----------------------------------------------------------------------
data = pd.read_csv('./TABLE1_5_2.csv')
# ข้อมูลต้องมีคอลัมน์: Eid, Title, Subject_Field, Index_Terms, Has_Funding, Funding_Agency

# แทน NaN ใน Index_Terms และ Subject_Field ด้วย ""
data['Index_Terms'] = data['Index_Terms'].fillna("").astype(str)
data['Subject_Field'] = data['Subject_Field'].fillna("").astype(str)

# สร้างฟีเจอร์ข้อความรวม Title + Index_Terms
data['text_all'] = data['Title'].fillna("") + " " + data['Index_Terms'].fillna("")

# แยก Subject Field ที่มีหลายค่าด้วยตัวคั่น เช่น ',' หรือ ';'
data['Subject_Field'] = data['Subject_Field'].apply(lambda x: [field.strip() for field in x.split(',')])

# -----------------------------------------------------------------------
# 2. แบ่งชุดข้อมูลสำหรับ Has_Funding (Binary Classification)
# -----------------------------------------------------------------------
X_text = data['text_all']
X_subject = data['Subject_Field']
y = data['Has_Funding'].astype(int)  # 0 หรือ 1

# Split Dataset
X_train_text, X_test_text, X_train_subject, X_test_subject, y_train, y_test = train_test_split(
    X_text, X_subject, y, test_size=0.2, random_state=42)

# -----------------------------------------------------------------------
# 3. Preprocess Features
# -----------------------------------------------------------------------
# Text Feature Transformation
text_transformer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_text_tfidf = text_transformer.fit_transform(X_train_text)
X_test_text_tfidf = text_transformer.transform(X_test_text)

# Subject Field Transformation with MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X_train_subject_binarized = mlb.fit_transform(X_train_subject)
X_test_subject_binarized = mlb.transform(X_test_subject)

# Combine Features
X_train_combined = hstack([X_train_text_tfidf, X_train_subject_binarized])
X_test_combined = hstack([X_test_text_tfidf, X_test_subject_binarized])

# -----------------------------------------------------------------------
# 4. Train Binary Classification Model
# -----------------------------------------------------------------------
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_combined, y_train)

# Evaluate the Model
y_pred = model.predict(X_test_combined)
print("Binary Classification (Has_Funding) Report:")
print(classification_report(y_test, y_pred))

# -----------------------------------------------------------------------
# 5. Analyze Results
# -----------------------------------------------------------------------
print("Top 10 Subject Fields by Frequency:")
subject_counts = data['Subject_Field'].explode().value_counts()
print(subject_counts.head(10))


Binary Classification (Has_Funding) Report:
              precision    recall  f1-score   support

           0       0.69      0.66      0.67      1742
           1       0.58      0.61      0.60      1342

    accuracy                           0.64      3084
   macro avg       0.63      0.64      0.64      3084
weighted avg       0.64      0.64      0.64      3084

Top 10 Subject Fields by Frequency:
Subject_Field
Medicine                                2561
Biochemistry                            1192
Multidisciplinary                        893
Genetics and Molecular Biology           676
Physics and Astronomy                    669
Toxicology and Pharmaceutics             492
Agricultural and Biological Sciences     472
Engineering                              342
Pharmacology                             319
Veterinary                               272
Name: count, dtype: int64




In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# -----------------------------------------------------------------------
# 1. Load Data
# -----------------------------------------------------------------------
data = pd.read_csv('TABLE1_5.csv')  # Adjust file path

# Filter for projects with funding
funded_data = data[data['Has_Funding'] == 1].copy()

# Fill missing values
funded_data['Agency_Name'] = funded_data['Agency_Name'].fillna("")
funded_data['Subject_Field'] = funded_data['Subject_Field'].fillna("")
funded_data['Title'] = funded_data['Title'].fillna("")
funded_data['Index_Terms'] = funded_data['Index_Terms'].fillna("")

# Combine Title and Index_Terms
funded_data['combined_text'] = funded_data['Title'] + " " + funded_data['Index_Terms']

# -----------------------------------------------------------------------
# 2. Prepare Target (Agency_Name)
# -----------------------------------------------------------------------
# Convert Agency_Name to a list
funded_data['Agency_List'] = funded_data['Agency_Name'].apply(lambda x: [a.strip() for a in x.split(' ') if a.strip() != ""])

# Use MultiLabelBinarizer to encode targets
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(funded_data['Agency_List'])

# -----------------------------------------------------------------------
# 3. Prepare Features (X)
# -----------------------------------------------------------------------
# Use TfidfVectorizer for text data
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_text_tfidf = tfidf.fit_transform(funded_data['combined_text'])

# Use OneHotEncoder for categorical data (Subject_Field)
encoder = OneHotEncoder(handle_unknown='ignore')
X_subject_onehot = encoder.fit_transform(funded_data[['Subject_Field']])

# Combine features
from scipy.sparse import hstack
X_combined = hstack([X_text_tfidf, X_subject_onehot])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# -----------------------------------------------------------------------
# 4. Train Multi-Label Model
# -----------------------------------------------------------------------
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
model.fit(X_train, y_train)

# -----------------------------------------------------------------------
# 5. Evaluate Model
# -----------------------------------------------------------------------
y_pred = model.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print("Multi-Label Classification Report:")
print(report)




Multi-Label Classification Report:
                                                   precision    recall  f1-score   support

                                                &       1.00      0.02      0.05        41
                                               &D       0.00      0.00      0.00        22
                                             (863       0.00      0.00      0.00         0
                                     (Lithuania),       0.00      0.00      0.00         1
                                          (MOHE),       0.00      0.00      0.00         0
                                          (Spain)       0.00      0.00      0.00         0
                                          (Taipei       0.00      0.00      0.00         0
                                         (Taiwan)       0.00      0.00      0.00         3
                                            (UK),       0.00      0.00      0.00         1
                                            (USA)     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
# Check label distribution
label_counts = funded_data['Agency_List'].explode().value_counts()
print(label_counts)


Agency_List
University       14678
National         10205
Chulalongkorn     9902
Thailand          8711
Organization      7875
                 ...  
SØR-ØST              1
Breast               1
VentureWise          1
Connell              1
Rajasthan            1
Name: count, Length: 3756, dtype: int64


In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# -----------------------------------------------------------------------
# 1. Load Data
# -----------------------------------------------------------------------
data = pd.read_csv('TABLE1_5.csv')  # Adjust file path

# Filter for projects with funding
funded_data = data[data['Has_Funding'] == 1].copy()

# Fill missing values
funded_data['Agency_Name'] = funded_data['Agency_Name'].fillna("")
funded_data['Subject_Field'] = funded_data['Subject_Field'].fillna("")
funded_data['Title'] = funded_data['Title'].fillna("")
funded_data['Index_Terms'] = funded_data['Index_Terms'].fillna("")

# Combine Title and Index_Terms
funded_data['combined_text'] = funded_data['Title'] + " " + funded_data['Index_Terms']

# Split Agency_Name into lists
funded_data['Agency_List'] = funded_data['Agency_Name'].apply(lambda x: [a.strip() for a in x.split(' ') if a.strip() != ""])

# -----------------------------------------------------------------------
# 2. Remove Rare Labels
# -----------------------------------------------------------------------
# Calculate label frequencies
label_counts = funded_data['Agency_List'].explode().value_counts()

# Define minimum frequency for labels
min_frequency = 10
valid_labels = label_counts[label_counts >= min_frequency].index

# Remove rare labels
funded_data['Agency_List'] = funded_data['Agency_List'].apply(
    lambda x: [agency for agency in x if agency in valid_labels]
)

# Remove rows with no valid labels
funded_data = funded_data[funded_data['Agency_List'].apply(len) > 0]

# Encode target labels
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(funded_data['Agency_List'])

# -----------------------------------------------------------------------
# 3. Prepare Features (X)
# -----------------------------------------------------------------------
# Use TfidfVectorizer for text data
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_text_tfidf = tfidf.fit_transform(funded_data['combined_text'])

# Use OneHotEncoder for categorical data (Subject_Field)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
X_subject_onehot = encoder.fit_transform(funded_data[['Subject_Field']])

# Combine features
X_combined = hstack([X_text_tfidf, X_subject_onehot])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# -----------------------------------------------------------------------
# 4. Train Multi-Label Model
# -----------------------------------------------------------------------
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
model.fit(X_train, y_train)

# -----------------------------------------------------------------------
# 5. Evaluate Model
# -----------------------------------------------------------------------
y_pred = model.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print("Multi-Label Classification Report:")
print(report)

# -----------------------------------------------------------------------
# 6. Analyze Results
# -----------------------------------------------------------------------
# Print top 10 labels by frequency
print("Top 10 Labels by Frequency:")
print(label_counts.head(10))

# Print labels removed due to low frequency
removed_labels = label_counts[label_counts < min_frequency]
print("\nRemoved Labels (Rare):")
print(removed_labels)


Multi-Label Classification Report:
                        precision    recall  f1-score   support

                     &       0.00      0.00      0.00        47
                    &D       0.00      0.00      0.00        23
                     ,       0.00      0.00      0.00        50
                     -       0.73      0.32      0.44        25
                 100th       0.00      0.00      0.00         4
      100thAnniversary       0.00      0.00      0.00         2
                  2010       0.00      0.00      0.00         1
                  2020       0.00      0.00      0.00         5
                A*STAR       0.00      0.00      0.00        20
                    A.       0.50      0.45      0.47        20
                  A.G.       0.36      0.22      0.28        18
                  AIDS       0.00      0.00      0.00         5
                  AMED       0.00      0.00      0.00         7
            ANID-Chile       0.00      0.00      0.00         5
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack

# -----------------------------------------------------------------------
# 1. Load Data
# -----------------------------------------------------------------------
data = pd.read_csv('TABLE1_5.csv')  # Adjust file path

# Filter for projects with funding
funded_data = data[data['Has_Funding'] == 1].copy()

# Fill missing values
funded_data['Agency_Name'] = funded_data['Agency_Name'].fillna("")
funded_data['Subject_Field'] = funded_data['Subject_Field'].fillna("")
funded_data['Title'] = funded_data['Title'].fillna("")
funded_data['Index_Terms'] = funded_data['Index_Terms'].fillna("")

# Combine Title and Index_Terms
funded_data['combined_text'] = funded_data['Title'] + " " + funded_data['Index_Terms']

# Split Agency_Name into lists
funded_data['Agency_List'] = funded_data['Agency_Name'].apply(lambda x: [a.strip() for a in x.split(' ') if a.strip() != ""])

# -----------------------------------------------------------------------
# 2. Remove Rare Labels
# -----------------------------------------------------------------------
# Calculate label frequencies
label_counts = funded_data['Agency_List'].explode().value_counts()

# Define minimum frequency for labels
min_frequency = 10
valid_labels = label_counts[label_counts >= min_frequency].index

# Remove rare labels
funded_data['Agency_List'] = funded_data['Agency_List'].apply(
    lambda x: [agency for agency in x if agency in valid_labels]
)

# Remove rows with no valid labels
funded_data = funded_data[funded_data['Agency_List'].apply(len) > 0]

# Encode target labels
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(funded_data['Agency_List'])

# -----------------------------------------------------------------------
# 3. Prepare Features (X)
# -----------------------------------------------------------------------
# Use TfidfVectorizer for text data
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_text_tfidf = tfidf.fit_transform(funded_data['combined_text'])

# Use OneHotEncoder for categorical data (Subject_Field)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
X_subject_onehot = encoder.fit_transform(funded_data[['Subject_Field']])

# Combine features
X_combined = hstack([X_text_tfidf, X_subject_onehot])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# -----------------------------------------------------------------------
# 4. Apply SMOTE for Each Label
# -----------------------------------------------------------------------
# Resample each label independently
smote = SMOTE(random_state=42)
X_train_resampled = X_train
y_train_resampled = y_train.copy()

for i in range(y_train.shape[1]):  # Loop through each label
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train[:, i])
    if i == 0:  # Initialize combined resampled arrays
        X_train_resampled = X_resampled
        y_train_resampled = y_resampled.reshape(-1, 1)
    else:  # Append resampled labels
        y_train_resampled = np.hstack((y_train_resampled, y_resampled.reshape(-1, 1)))

# -----------------------------------------------------------------------
# 5. Train Multi-Label Model
# -----------------------------------------------------------------------
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
model.fit(X_train_resampled, y_train_resampled)

# -----------------------------------------------------------------------
# 6. Evaluate Model
# -----------------------------------------------------------------------
y_pred = model.predict(X_test)

# Generate classification report with zero_division to handle undefined metrics
report = classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0)
print("Multi-Label Classification Report:")
print(report)

# Instance-level accuracy
instance_accuracy = (y_pred == y_test).all(axis=1).mean()
print(f"Instance-Level Accuracy: {instance_accuracy:.2f}")

# -----------------------------------------------------------------------
# 7. Analyze Results
# -----------------------------------------------------------------------
# Print top 10 labels by frequency
print("\nTop 10 Labels by Frequency:")
print(label_counts.head(10))

# Print labels removed due to low frequency
removed_labels = label_counts[label_counts < min_frequency]
print("\nRemoved Labels (Rare):")
print(removed_labels)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 20958 and the array at index 1 has size 21134