In [33]:
import warnings
warnings.filterwarnings('ignore')

# Creating the best TF-IDF Vectorizer

In [34]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [35]:
# Load the train dataset

train_df = pd.read_csv("../No trans data/train_processed.csv")
train_df.head()

Unnamed: 0,id,text,label
0,3268,indha ariya kandupidippin moolam neenga solla ...,0
1,6239,@vijayakumarp7959 unmai therincha nee pesu,0
2,5859,inga erukka yella dev... boys vadakkan vadakka...,1
3,3519,பீகாரி பிரசாந்த் கிஷோரிடம் கொடுத்த 350 கோடியை ...,1
4,5136,mumbai bangalore la 80% percentage outsiders,1


In [36]:
# Define parameter grid for TF-IDF

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Unigrams, Bigrams, Trigrams
    'tfidf__max_df': [0.5, 0.75, 1.0],  # Lower values remove frequent words
    'tfidf__min_df': [1, 2, 5],  # Higher values remove rare words
    'tfidf__stop_words': ['english', None],  # With and without stopword removal
    'tfidf__smooth_idf': [True, False],  # Apply IDF smoothing
    'tfidf__sublinear_tf': [True, False],  # Use log(tf) scaling
    'tfidf__use_idf': [True, False]  # With and without IDF weighting
}

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=12)
grid_search.fit(train_df['text'], train_df['label'])
best_tfidf = grid_search.best_estimator_.named_steps['tfidf']

print("Best TF-IDF Parameters:", grid_search.best_params_)

Best TF-IDF Parameters: {'tfidf__max_df': 0.5, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1), 'tfidf__smooth_idf': True, 'tfidf__stop_words': 'english', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}


In [37]:
# Save the best TF-IDF model

joblib.dump(best_tfidf, "../No trans data/tfidf_vectorizer_2.pkl")
print("TF-IDF model saved successfully.")

TF-IDF model saved successfully.
