In [1]:
import time
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from tqdm import tqdm

# Reference timestamp for the run. The durations printed later are measured
# from here, so the "training" figure also covers reading and shuffling the CSV.
start_time = time.time()

# Location of the training CSV
train_data_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\BERT - Introduction\Training\Large\large_training_data_set_bert_introduction_embeddings.csv"

# Read the file, reorder every row with a fixed seed (reproducible shuffle),
# and renumber the index from 0.
train_data = (
    pd.read_csv(train_data_path)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Target column first, then the features: every column that is left once the
# text/metadata columns and the target itself are removed.
y_train = train_data['infobox_type']
X_train = train_data.drop(columns=['title', 'introduction', 'categories', 'infobox_type'])

# Best hyperparameters found previously, hard-coded here.
# NOTE: per the scikit-learn SVC documentation, 'degree' only applies to the
# 'poly' kernel and 'coef0' only to 'poly'/'sigmoid'; with kernel='rbf' they are
# passed through but have no effect on the fitted model.
best_params = {'C': 4.594506741382034, 'coef0': 0.09541011649041131, 'degree': 4, 'gamma': 'scale', 'kernel': 'rbf'}

# The dictionary keys are exactly the SVC keyword names, so it can be unpacked
# straight into the constructor; the seed is supplied separately.
best_svm_classifier = SVC(**best_params, random_state=42)

# Train on the full (shuffled) training set
best_svm_classifier.fit(X_train, y_train)

# Report how long it took to reach this point, counted from start_time
training_end_time = time.time()
print(f"Training completed in {training_end_time - start_time:.2f} seconds")
print(f"Using parameters: {best_params}")

# Location of the testing CSV
test_data_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\BERT - Introduction\Testing\testing_data_set_for_bert_introduction.csv"

# Read the file and discard 'categories_text' when it is present;
# errors='ignore' turns the drop into a no-op if the column is absent.
test_data = pd.read_csv(test_data_path).drop(columns=['categories_text'], errors='ignore')

# Split into target and features, removing the same four non-feature columns
# that are removed from the training frame.
y_test = test_data['infobox_type']
X_test = test_data.drop(columns=['title', 'introduction', 'categories', 'infobox_type'])

# Predict in fixed-size batches so tqdm can show progress through the test set
chunk_size = 100
chunk_starts = range(0, len(test_data), chunk_size)  # first row position of each batch
total_chunks = len(chunk_starts)                     # ceil(len(test_data) / chunk_size)

results_df_list = []
for chunk_start in tqdm(chunk_starts, desc="Processing test data"):
    # A positional slice past the last row is clipped by .iloc, so the final
    # (possibly shorter) batch needs no special handling.
    rows = slice(chunk_start, chunk_start + chunk_size)

    # Predict labels for this batch of feature rows
    y_pred_chunk = best_svm_classifier.predict(X_test.iloc[rows])

    # Pair each title with its predicted and true label
    results_df_list.append(
        pd.DataFrame({
            'Title': test_data['title'].iloc[rows],
            'Predicted Label': y_pred_chunk,
            'True Label': y_test.iloc[rows],
        })
    )

# Stack the per-batch frames into one table with a fresh 0..n-1 index
results_df = pd.concat(results_df_list, ignore_index=True)

# Convenience handles on the two label columns of the results table
true_labels = results_df['True Label']
predicted_labels = results_df['Predicted Label']

# Rows where the prediction differs from the truth, counted per true label.
# These are raw error counts (not rates); the five largest are shown.
is_wrong = predicted_labels != true_labels
misclassified = results_df[is_wrong]
misclass_counts = misclassified['True Label'].value_counts()
top_misclassified = misclass_counts.head(5)
print("Top 5 Most Misclassified Labels:")
print(top_misclassified)

# Stop the clock. The "testing" duration is counted from the end of training,
# so it includes loading the test CSV and the analysis above.
end_time = time.time()
print(f"Testing completed in {end_time - training_end_time:.2f} seconds")

# Per-class precision/recall/F1: once as a dict (kept in `report` for later
# use) and once as the formatted text table that is printed.
report = classification_report(true_labels, predicted_labels, output_dict=True)
print(classification_report(true_labels, predicted_labels))

# Wall-clock time from the very start of the run to end_time
print(f"Total execution time: {end_time - start_time:.2f} seconds")

# Destination CSV for the prediction results (Title / Predicted Label / True Label)
results_csv_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\BERT - Introduction\Training\Large\SVM\prediction_results.csv"

# Make sure the target folder exists before writing: DataFrame.to_csv raises
# OSError when the directory is missing, which would discard the predictions
# produced by the long training/prediction run above.
Path(results_csv_path).parent.mkdir(parents=True, exist_ok=True)

# Write the results without the DataFrame index column
results_df.to_csv(results_csv_path, index=False)
print(f"Results saved to {results_csv_path}")


Training completed in 784.66 seconds
Using parameters: {'C': 4.594506741382034, 'coef0': 0.09541011649041131, 'degree': 4, 'gamma': 'scale', 'kernel': 'rbf'}


Processing test data: 100%|██████████████████████████████████████████████████████████| 298/298 [41:18<00:00,  8.32s/it]


Top 5 Most Misclassified Labels:
True Label
infobox person                      67
infobox event                       52
infobox gridiron football person    44
infobox building                    44
infobox company                     43
Name: count, dtype: int64
Testing completed in 2482.84 seconds
                                               precision    recall  f1-score   support

                             infobox academic       0.59      0.58      0.59       100
                        infobox afl biography       0.99      1.00      1.00       100
                       infobox aircraft begin       0.95      0.99      0.97       100
                  infobox aircraft occurrence       0.99      0.95      0.97       100
                              infobox airline       0.96      0.98      0.97       100
                              infobox airport       0.96      0.94      0.95       100
                                infobox album       0.96      0.99      0.98       100
 