In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

# Load the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv('./TABLE1_5_2.csv')


def split_subject_fields(raw):
    """Split a comma-separated subject string into a list of stripped names."""
    return [part.strip() for part in raw.split(',')]


# Normalise the text columns (missing values become "") and derive the two
# model inputs: one free-text field and one list of subject labels per row.
# Keyword order matters: `text_all` reads the already-normalised Index_Terms.
data = data.assign(
    Index_Terms=lambda frame: frame['Index_Terms'].fillna("").astype(str),
    Subject_Field=lambda frame: (
        frame['Subject_Field'].fillna("").astype(str).apply(split_subject_fields)
    ),
    text_all=lambda frame: frame['Title'].fillna("") + " " + frame['Index_Terms'],
)

X_text = data['text_all']
X_subject = data['Subject_Field']
y = data['Has_Funding'].astype(int)  # binary target: 0 or 1

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
(
    X_train_text, X_test_text,
    X_train_subject, X_test_subject,
    y_train, y_test,
) = train_test_split(X_text, X_subject, y, test_size=0.2, random_state=42)

# Text features: TF-IDF fitted on the training split only, then applied
# unchanged to the test split.
text_transformer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_text_tfidf = text_transformer.fit_transform(X_train_text)
X_test_text_tfidf = text_transformer.transform(X_test_text)

# Subject features: one indicator column per subject label seen in training.
mlb = MultiLabelBinarizer()
X_train_subject_binarized = mlb.fit_transform(X_train_subject)
X_test_subject_binarized = mlb.transform(X_test_subject)

# Final design matrices: TF-IDF columns followed by the subject indicators.
X_train_combined = hstack((X_train_text_tfidf, X_train_subject_binarized))
X_test_combined = hstack((X_test_text_tfidf, X_test_subject_binarized))

# Random-forest hyperparameters, gathered in one place for easy tuning.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params).fit(X_train_combined, y_train)

# Evaluate on the held-out split.
y_pred = rf_model.predict(X_test_combined)
print(
    "Binary Classification (Has_Funding) Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)






Binary Classification (Has_Funding) Report:
              precision    recall  f1-score   support

           0       0.72      0.65      0.68      1742
           1       0.60      0.67      0.63      1342

    accuracy                           0.66      3084
   macro avg       0.66      0.66      0.66      3084
weighted avg       0.66      0.66      0.66      3084



In [4]:
# New record(s) to score with the trained model.
new_data = pd.DataFrame({
    'Title': ["Energy efficient cooling systems for households"],
    'Index_Terms': ["Air conditioning, Energy saving, Household cooling"],
    'Subject_Field': ["Energy, Engineering"]
})

# Preprocess the new rows with the same steps used on the training table.
# Subject_Field must be normalised to a string before it is split: without
# fillna/astype a missing value would reach `.split` as a float NaN and raise
# AttributeError.
new_data['Index_Terms'] = new_data['Index_Terms'].fillna("").astype(str)
new_data['Subject_Field'] = new_data['Subject_Field'].fillna("").astype(str)
new_data['text_all'] = new_data['Title'].fillna("") + " " + new_data['Index_Terms']
new_data['Subject_Field'] = new_data['Subject_Field'].apply(
    lambda x: [field.strip() for field in x.split(',')]
)

# Transform text features with the already-fitted TF-IDF vectorizer
# (transform only, never re-fit at prediction time).
new_text_tfidf = text_transformer.transform(new_data['text_all'])

# Transform subject features with the already-fitted binarizer.
# NOTE: scikit-learn ignores labels the binarizer did not see during fitting
# and emits a UserWarning for them.
new_subject_binarized = mlb.transform(new_data['Subject_Field'])

# Combine features in the same column order used for training:
# text columns first, then subject indicators.
new_combined_features = hstack([new_text_tfidf, new_subject_binarized])

# Predict the Has_Funding class for each new row.
new_predictions = rf_model.predict(new_combined_features)

# Show the result.
print("Prediction:", new_predictions)  # Output: [0] or [1]


Prediction: [0]


In [5]:
import pickle

# Persist the fitted model together with both fitted feature encoders, so
# that inference code can rebuild the exact same feature pipeline.
# Only unpickle these files from a trusted location: loading a pickle can
# execute arbitrary code.
artifacts = {
    "rf_model.pkl": rf_model,                  # RandomForest model
    "text_transformer.pkl": text_transformer,  # TF-IDF vectorizer
    "mlb.pkl": mlb,                            # MultiLabelBinarizer
}
for filename, fitted_object in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(fitted_object, handle)
