In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline


import warnings

# Turn off all warnings
warnings.filterwarnings("ignore")



AttributeError: module 'numpy' has no attribute 'VisibleDeprecationWarning'

In [3]:
# Function to create submission CSV
def create_submission_csv(y_pred, test_id, model_name):
    """
    Creates a CSV file for submission.
    
    Parameters:
    y_pred (array-like): Predicted labels.
    test_id (array-like): IDs corresponding to the test set.
    model_name (str): Name of the model used for predictions.
    
    Returns:
    str: The name of the created CSV file.
    """
    if len(y_pred) != len(test_id):
        raise ValueError("Length of y_pred and test_id must be the same.")
    
    submission = pd.DataFrame({
        'id': test_id,
        'label': y_pred
    })
    
    submission_file = f'{model_name}.csv'
    submission.to_csv(submission_file, index=False)
    print(f"Submission file created: {submission_file}")
    return submission_file


In [4]:
def estimate_optimal_n_components(data, method="PCA", target_variance=0.95):
    """
    Estimate the optimal number of components to retain a given amount of variance.
    
    Parameters:
    - data (pd.DataFrame or np.array): The input dataset.
    - method (str): The dimensionality reduction method, "PCA" or "SVD".
    - target_variance (float): The target amount of variance to retain (e.g., 0.95 for 95%).
    
    Returns:
    - optimal_components (int): The optimal number of components to retain the target variance.
    - cumulative_variance (list): The cumulative variance explained by the components.
    """
    if method == "PCA":
        model = PCA()
        model.fit(data)
        explained_variance = np.cumsum(model.explained_variance_ratio_)
    elif method == "SVD":
        model = TruncatedSVD()
        model.fit(data)
        explained_variance = np.cumsum(model.explained_variance_ratio_)
    else:
        raise ValueError("Method must be either 'PCA' or 'SVD'")
    
    # Find the number of components that meet the target variance
    optimal_components = np.argmax(explained_variance >= target_variance) + 1
    
    # Plot the cumulative explained variance
    plt.figure(figsize=(8, 6))
    plt.plot(explained_variance, marker='o', linestyle='--', color='b')
    plt.axhline(y=target_variance, color='r', linestyle='--')
    plt.title(f'Cumulative Explained Variance ({method})')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid(True)
    plt.show()
    
    print(f"Optimal number of components to retain {target_variance*100}% variance: {optimal_components}")
    
    return optimal_components, explained_variance

In [5]:
# Dimensionality Reduction using PCA or SVD
def apply_dimensionality_reduction(data, n_components=1000, method="PCA"):
    if method == "PCA":
        model = PCA(n_components=n_components)
    elif method == "SVD":
        model = TruncatedSVD(n_components=n_components)
    else:
        raise ValueError("Method must be either 'PCA' or 'SVD'")
    
    # Apply dimensionality reduction
    transformed_data = model.fit_transform(data)
    
    # Drop zero-variance components
    explained_variance = model.explained_variance_ratio_
    transformed_data = transformed_data[:, explained_variance > 0]
    
    return pd.DataFrame(transformed_data)

In [6]:
# Function to build classifiers
def build_classifiers_with_params():
    classifiers = [
        ("KNN", KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        ("Linear SVM", SVC(kernel="linear", random_state=42), {'C': [0.01, 0.1, 1]}),
        ("RBF SVM", SVC(gamma="scale", random_state=42), {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}),
        ("Decision Tree", DecisionTreeClassifier(random_state=42), {'max_depth': [3, 5, 7]}),
        ("Random Forest", RandomForestClassifier(random_state=42), {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 7]}),
        ("AdaBoost", AdaBoostClassifier(random_state=42), {'n_estimators': [50, 100], 'learning_rate': [0.1, 1]}),
        ("Naive Bayes", GaussianNB(), {}),
        ("QDA", QuadraticDiscriminantAnalysis(), {})
    ]
    return classifiers

In [7]:
# Pipeline to apply PCA/SVD, Grid Search, and evaluate classifiers
def pipeline_with_grid_search(train_features, train_label, n_components=100, method="PCA"):
    # Reduce dimensions
    reduced_data = apply_dimensionality_reduction(train_features, n_components=n_components, method=method)
    
    # Split the train set into train (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(reduced_data, train_label, test_size=0.2, random_state=42)
    
    classifiers = build_classifiers_with_params()
    results = {}

    # Apply each classifier with GridSearchCV
    for name, clf, param_grid in classifiers:
        print(f"\nRunning Grid Search for {name}...")
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        
        # Best parameters and model evaluation
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_val)
        
        # Evaluate
        accuracy = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average="weighted")
        results[name] = {"accuracy": accuracy, "f1_score": f1, "best_params": grid_search.best_params_}
        
        print(f"Best Params for {name}: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
    
    return results

In [8]:
# Suppress ConvergenceWarning for readability
# warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load the datasets
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
train_tfidf = pd.read_csv('data/train_tfidf_features.csv')
test_tfidf = pd.read_csv('data/test_tfidf_features.csv')

# Extract IDs from the datasets
train_ids = train_data.iloc[:, 0]  # Assuming the ID is the first column
test_ids = test_data["id"]

# Remove the first two columns from the TF-IDF features
train_features = train_tfidf.iloc[:, 2:]
test_features = test_tfidf.iloc[:, 2:]

# Ensure the columns in test_features are in the same order and have the same names as in train_features
common_columns = [col for col in train_features.columns if col in test_features.columns]
train_features = train_features[common_columns]
test_features = test_features[common_columns]

# Define labels
train_label = train_data["label"]

# # Split the train set into train (80%) and validation (20%) sets
# X_train, X_val, y_train, y_val = train_test_split(train_features, train_label, test_size=0.2, random_state=42)
# optimal_components, cumulative_variance = estimate_optimal_n_components(train_features, method="PCA", target_variance=0.95)
results = pipeline_with_grid_search(train_features, train_label, method="PCA", n_components=120)



Running Grid Search for KNN...
Fitting 5 folds for each of 3 candidates, totalling 15 fits



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 170, in <module>
    exitcode = process_obj._bootstrap()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/pytho

Best Params for KNN: {'n_neighbors': 5}
Accuracy: 0.6602
F1 Score: 0.6518

Running Grid Search for Linear SVM...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Params for Linear SVM: {'C': 1}
Accuracy: 0.6861
F1 Score: 0.6588

Running Grid Search for RBF SVM...
Fitting 5 folds for each of 9 candidates, totalling 45 fits



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 170, in <module>
    exitcode = process_obj._bootstrap()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/pytho

Best Params for RBF SVM: {'C': 10, 'gamma': 1}
Accuracy: 0.7056
F1 Score: 0.6928

Running Grid Search for Decision Tree...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Params for Decision Tree: {'max_depth': 7}
Accuracy: 0.6500
F1 Score: 0.6371

Running Grid Search for Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Params for Random Forest: {'max_depth': 7, 'n_estimators': 10}
Accuracy: 0.6701
F1 Score: 0.6319

Running Grid Search for AdaBoost...
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Params for AdaBoost: {'learning_rate': 1, 'n_estimators': 100}
Accuracy: 0.6817
F1 Score: 0.6742

Running Grid Search for Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params for Naive Bayes: {}
Accuracy: 0.6561
F1 Score: 0.6500

Running Grid Search for QDA...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params for QDA: {}
Accuracy: 0.6520
F1 Score: 0.6546
