In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import re
print("Libraries loaded successfully!")

Libraries loaded successfully!


In [3]:
train = pd.read_csv("/kaggle/input/nlp-for-se-2025-programming-3-1st-competition/train.csv")
test = pd.read_csv("/kaggle/input/nlp-for-se-2025-programming-3-1st-competition/test_nolabel.csv")
val = pd.read_csv("/kaggle/input/nlp-for-se-2025-programming-3-1st-competition/val.csv")

In [4]:
print("Train dataset shape:", train.shape)
print("Validation dataset shape:", val.shape)
print("Test dataset shape:", test.shape)
print("\nTrain columns:", train.columns.tolist())
print("Sample of train data:\n", train.head())

Train dataset shape: (8298, 3)
Validation dataset shape: (1778, 3)
Test dataset shape: (1779, 2)

Train columns: ['id', 'label', 'text']
Sample of train data:
      id  label                                               text
0  8901      5  Bennett 's naturalistic performance speaks vol...
1  2506      5  Shot in rich , shadowy black-and-white , Devil...
2  2381      5  More than their unique residences , Home Movie...
3  1262      3  The movie should be credited with remembering ...
4  2542      4  Audiences are advised to sit near the back and...


In [6]:
def preprocess_text(text):
    """Simple text preprocessing"""
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove special characters and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join(text.split())
    
    return text

In [7]:
# Preprocess text data
X_train = train['text'].apply(preprocess_text)
X_val = val['text'].apply(preprocess_text)
X_test = test['text'].apply(preprocess_text)

# Get labels
y_train = train['label']
y_val = val['label']

# Initialize TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)

# Transform text data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize scaler
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_tfidf)
X_val_scaled = scaler.transform(X_val_tfidf)
X_test_scaled = scaler.transform(X_test_tfidf)

In [8]:
# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (200, 100)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['adaptive'],
    'max_iter': [1000]
}

# Initialize base model
base_model = MLPClassifier(random_state=42)

# Perform grid search
grid_search = GridSearchCV(
    base_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the model
print("Training model...")
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Training model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best parameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'adaptive', 'max_iter': 1000}
Best cross-validation score: 0.3444178885524013


In [9]:
val_predictions = grid_search.predict(X_val_scaled)

# Print validation results
print("\nValidation Results:")
print("Accuracy:", accuracy_score(y_val, val_predictions))
print("\nClassification Report:")
print(classification_report(y_val, val_predictions))


Validation Results:
Accuracy: 0.344206974128234

Classification Report:
              precision    recall  f1-score   support

           1       0.29      0.21      0.25       252
           2       0.39      0.45      0.42       458
           3       0.23      0.26      0.24       309
           4       0.36      0.35      0.36       489
           5       0.40      0.38      0.39       270

    accuracy                           0.34      1778
   macro avg       0.34      0.33      0.33      1778
weighted avg       0.34      0.34      0.34      1778



In [10]:
test_predictions = grid_search.predict(X_test_scaled)

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    'label': test_predictions
})

# Save predictions
submission.to_csv('submission.csv', index=False)
print("\nPredictions saved to submission.csv")


Predictions saved to submission.csv


In [11]:
print("\nSample of predictions:")
print(submission.head())


Sample of predictions:
      id  label
0   1005      4
1  10561      4
2  11048      2
3   7846      4
4  10615      3
