Comment classifier using New York Times comments dataset https://www.kaggle.com/datasets/aashita/nyt-comments

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [8]:
# Load the dataset
file_path = '/content/CommentsApril2017.csv'

# Initialize an empty DataFrame to store the data
data = pd.DataFrame()

# Read the CSV file in chunks
chunksize = 1000
for chunk in pd.read_csv(file_path, chunksize=chunksize, on_bad_lines='skip'):
    data = pd.concat([data, chunk], ignore_index=True)

In [9]:
# Preprocessing
cleaned_data = data.dropna(subset=['commentBody', 'typeOfMaterial'])

# Extracting relevant columns (assuming 'commentBody' as feature and 'typeOfMaterial' as target)
X = cleaned_data['commentBody']
y = cleaned_data['typeOfMaterial']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Text Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_vectorized = vectorizer.fit_transform(X)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

In [14]:
# Train the model
model = make_pipeline(MaxAbsScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [15]:
# Evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

# Print the classification report
print(report)

                 precision    recall  f1-score   support

           Blog       1.00      0.00      0.01       221
          Brief       0.00      0.00      0.00        26
      Editorial       0.61      0.09      0.16      4325
         Letter       0.00      0.00      0.00         4
           News       0.63      0.85      0.72     26177
  News Analysis       0.28      0.01      0.02       730
Obituary (Obit)       0.50      0.02      0.03       133
          Op-Ed       0.59      0.45      0.51     16491
       Question       0.00      0.00      0.00        11
         Review       0.66      0.10      0.18       567
       briefing       0.00      0.00      0.00        82

       accuracy                           0.62     48767
      macro avg       0.39      0.14      0.15     48767
   weighted avg       0.61      0.62      0.57     48767



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
