In [1]:
import gensim.downloader as api

# Fetch the gensim-data catalogue and print all of it. The recorded output of
# this cell shows a dict with a top-level 'corpora' key.
info_datasets = api.info()
print(info_datasets)
# Metadata and data for the "text8" entry.
# NOTE(review): neither `dataset_info` nor `dataset` is referenced by any other
# cell visible in this notebook - confirm the text8 download is still needed.
dataset_info = api.info("text8")
dataset = api.load("text8")
# Pretrained embeddings used by the later cells (similarity lookup and document
# vectors); those cells treat the vectors as 300-dimensional.
word2vec_model = api.load('word2vec-google-news-300')

{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1, 'record_format': 'dict', 'file_size': 6344358, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py', 'license': 'All files released for the task are free for general research use', 'fields': {'2016-train': ['...'], '2016-dev': ['...'], '2017-test': ['...'], '2016-test': ['...']}, 'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.', 'checksum': '701ea67acd82e75f95e1d8e62fb0ad29', 'file_name': 'semeval-2016-2017-task3-subtaskBC.gz',

In [4]:
# Sanity-check the loaded embeddings: words most similar to 'king' with their
# similarity scores (the recorded output lists ten (word, score) pairs).
word2vec_model.similar_by_word('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413195133209229),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797566771507263),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load data
# The path is relative to the notebook's working directory. Later cells read
# the 'source', 'article', 'page_rank' and 'label' columns of this frame.
df = pd.read_csv('data/development.csv')
print(f"Dataset shape: {df.shape}")
# Per-class row counts, ordered by label value rather than by frequency.
print(f"Labels distribution:\n{df['label'].value_counts().sort_index()}")

In [None]:
# Preprocessing: combine source + article, compute article length
def preprocess_text(text):
    """Tokenize and clean text for Word2Vec lookup.

    Args:
        text: Raw text. Any non-missing value is coerced with ``str``.

    Returns:
        list[str]: Tokens made of ASCII letters only, each at least two
        characters long. Case is preserved. Missing input (``pd.isna``)
        yields an empty list.
    """
    if pd.isna(text):
        return []
    # Replace (rather than delete) every non-letter with a space so that words
    # separated only by punctuation or digits stay separate tokens:
    # "end.Start" -> "end Start" and "state-of-the-art" -> "state of the art",
    # instead of the fused "endStart" / "stateoftheart".
    # Trade-off: letters separated by punctuation (e.g. "U.S.") become
    # single-letter fragments and are removed by the length filter below.
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))
    tokens = word_tokenize(text)
    # Single-character tokens are dropped.
    return [t for t in tokens if len(t) > 1]

def document_to_vector(tokens, model, dim=300):
    """Embed a token list as the mean of its in-vocabulary word vectors.

    Args:
        tokens: Iterable of token strings.
        model: Object supporting ``word in model`` and ``model[word]``.
        dim: Length of the zero vector returned when no token is found.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model``, or ``np.zeros(dim)`` if none are present.
    """
    known = []
    for word in tokens:
        # Out-of-vocabulary tokens do not contribute to the average.
        if word in model:
            known.append(model[word])
    if not known:
        return np.zeros(dim)
    return np.mean(known, axis=0)

# Missing source/article values are treated as empty strings throughout.
source_text = df['source'].fillna('')
article_text = df['article'].fillna('')

# One text field per row: "<source> <article>"
df['combined_text'] = source_text + ' ' + article_text

# Article length in characters (0 when the article is missing)
df['article_length'] = article_text.map(len)

# Token list for every combined text
df['tokens'] = df['combined_text'].map(preprocess_text)

print(f"Sample combined text tokens: {df['tokens'].iloc[0][:10]}")
print(f"Article length stats:\n{df['article_length'].describe()}")

In [None]:
# Average the word vectors of every document (this may take a few minutes)
print("Creating Word2Vec features...")
w2v_features = np.array(
    [document_to_vector(doc_tokens, word2vec_model) for doc_tokens in df['tokens']]
)
print(f"Word2Vec features shape: {w2v_features.shape}")

# One column per embedding dimension, followed by the two extra features and
# the target. `.values` is used so rows are matched by position, not by index.
feature_cols = [f'w2v_{i}' for i in range(300)]
df_features = pd.DataFrame(w2v_features, columns=feature_cols).assign(
    page_rank=df['page_rank'].values,
    article_length=df['article_length'].values,
    label=df['label'].values,
)

print(f"\nFinal feature DataFrame shape: {df_features.shape}")
print("Features: 300 Word2Vec + page_rank + article_length = 302 features")

In [None]:
# Prepare train/test split
# Model inputs: the 300 embedding columns plus the two extra features. The
# 'label' column of df_features is deliberately left out of X.
feature_columns = [f'w2v_{i}' for i in range(300)] + ['page_rank', 'article_length']

X = df_features[feature_columns].values
y = df_features['label'].values

# Hold out 20% of the rows; stratify=y is passed so the split is stratified on
# the label, and random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nClass distribution in training set:")
# Count of training rows per class, printed with its share of the training set.
unique, counts = np.unique(y_train, return_counts=True)
for u, c in zip(unique, counts):
    print(f"  Class {u}: {c} samples ({c/len(y_train)*100:.1f}%)")

In [None]:
# Build Pipeline with StandardScaler + Classifier
from sklearn.metrics import f1_score

# Define base pipeline
# Two named steps: 'scaler' then 'classifier'. The same pipeline object is
# later passed to GridSearchCV, which addresses the steps by these names.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Quick baseline evaluation
# Fit on the training split only and score on the held-out split, giving a
# reference point before any hyperparameter tuning.
print("Baseline evaluation with default RandomForest...")
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"Baseline Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Macro averaging: the F1 of each class counts equally, regardless of its size.
print(f"Baseline Macro F1: {f1_score(y_test, y_pred, average='macro'):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")

In [None]:
# Hyperparameter tuning with GridSearchCV
# Using Macro F1 as scoring metric

# A list of grids: each dict swaps in a different estimator for the pipeline's
# 'classifier' step and tunes that estimator's own parameters through the
# 'classifier__<param>' keys.
param_grid = [
    # Random Forest
    # 2 x 3 x 2 = 12 candidates
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5],
    },
    # Logistic Regression
    # 3 x 2 = 6 candidates
    {
        'classifier': [LogisticRegression(random_state=42, max_iter=1000)],
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__solver': ['lbfgs', 'saga'],
    },
    # Gradient Boosting
    # 2 x 2 x 2 = 8 candidates
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.05, 0.1],
        'classifier__max_depth': [3, 5],
    },
]

# Cross-validation setup
# Five shuffled, stratified folds with a fixed seed; with the 26 candidates
# above this amounts to 130 cross-validation fits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Starting GridSearchCV with Macro F1 scoring...")
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='f1_macro',  # Macro F1 score
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# The search only sees the training split; X_test / y_test stay untouched
# until the final evaluation cell.
grid_search.fit(X_train, y_train)

In [None]:
# Display GridSearchCV results
print("=" * 60)
print("HYPERPARAMETER TUNING RESULTS (Macro F1)")
print("=" * 60)

# Winning configuration, one parameter per line (includes the chosen
# 'classifier' estimator itself, since it is part of the grid).
print(f"\nBest Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest Cross-Validation Macro F1: {grid_search.best_score_:.4f}")

# Results DataFrame
# One row per candidate, ordered by rank_test_score so the best come first.
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values('rank_test_score')
print(f"\nTop 5 Configurations:")
print(results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head())

In [None]:
# Final evaluation on test set with best model
from sklearn.metrics import f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("=" * 60)
print("FINAL EVALUATION ON TEST SET")
print("=" * 60)

# Best pipeline found by the grid search, scored on the held-out split that
# the search never saw.
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test)

print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred_final):.4f}")
print(f"Test Macro F1: {f1_score(y_test, y_pred_final, average='macro'):.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_final))

# Label mapping for reference
label_names = {
    0: 'International News',
    1: 'Business',
    2: 'Technology',
    3: 'Entertainment',
    4: 'Sports',
    5: 'General News',
    6: 'Health'
}
print("\nLabel Reference:")
for k, v in label_names.items():
    print(f"  {k}: {v}")

# Confusion Matrix
# Pin the row/column order to the label ids of `label_names`. Without
# `labels=`, confusion_matrix only covers classes that occur in y_test or in
# the predictions, so a missing class would shrink the matrix and shift the
# tick names below onto the wrong rows/columns.
class_ids = list(label_names)
class_names = [label_names[class_id] for class_id in class_ids]
cm = confusion_matrix(y_test, y_pred_final, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Save the best model for later use
import os

import joblib

# joblib.dump does not create missing directories; without this the first
# dump fails on a fresh checkout where 'model/' does not exist yet.
os.makedirs('model', exist_ok=True)

# Fitted pipeline (scaler + tuned classifier)
joblib.dump(best_model, 'model/best_classifier.joblib')
# NOTE(review): this writes a second copy of the pretrained embeddings that
# were obtained through gensim's downloader - confirm a separate copy is needed.
joblib.dump(word2vec_model, 'model/word2vec_model.joblib')

print("Models saved to 'model/' directory")
print("  - best_classifier.joblib")
print("  - word2vec_model.joblib")