In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Read only the first 200 rows of the preprocessed dataset, keeping the
# 'text' and 'label1' columns; 'label1' is exposed to later cells as 'label'.
data = (
    pd.read_csv(
        '../Dataset/preprocessed_dataset.csv',
        usecols=['text', 'label1'],
        nrows=200,
    )
    .rename(columns={'label1': 'label'})
)

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
texts = data["text"]
labels = data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# TF-IDF features over unigrams and bigrams, with the vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
# Learn the vocabulary and IDF weights from the training texts only...
X_train_features = vectorizer.fit_transform(X_train)
# ...then project the test texts with that already-fitted vectorizer.
X_test_features = vectorizer.transform(X_test)

In [7]:
# Logistic-regression estimator using the L-BFGS solver; C is the inverse
# regularization strength (smaller values regularize more strongly).
classifier = LogisticRegression(
    C=1.0,
    solver="lbfgs",
)

In [8]:
# Search over the inverse regularization strength C using 5-fold cross-validation
# on the training features.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train_features, y_train)

# Keep the estimator that GridSearchCV selected as best, then report its setting.
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")



Best parameters: {'C': 100}


In [9]:
# Score the selected model on the held-out test features.
y_pred = best_model.predict(X_test_features)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy with best parameters: {accuracy:.4f}")

Accuracy with best parameters: 0.7250
