In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the competition training set from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/train.csv')
df.head()  # evaluated but not rendered: only a cell's last expression is displayed

# Target is the 'label' column; features are everything except 'post_id' and 'label'.
y = df['label']
X = df.drop(columns=['post_id', 'label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: print the combined row count of the two partitions.
print(len(X_train) + len(X_val))

In [None]:
# Parse the timestamp column once, store it back on df, and count rows per month name.
created = pd.to_datetime(df['created_date'])
df['created_date'] = created
month = created.dt.month_name()
month.value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['race', 'religion', 'gender', 'disability']

# One-hot encode the categorical columns; categories unseen at fit time are
# encoded as all zeros instead of raising. Every other column is passed through.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pipeline = ColumnTransformer(
    transformers=[('enc', encoder, categorical_features)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
# Ask the transformer to return a DataFrame rather than a NumPy array.
pipeline.set_output(transform='pandas')

final = pipeline.fit_transform(X_train)
final.shape


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts for the comment text.
cv = CountVectorizer()

# CountVectorizer cannot handle NaN, so missing comments become empty strings.
X_train['comment'] = X_train['comment'].fillna('')
X_val['comment'] = X_val['comment'].fillna('')

# Learn the vocabulary from the training comments only and reuse it for the
# validation comments. Calling fit_transform on the validation split would
# re-fit the vectorizer and give the two matrices incompatible columns.
# The results get their own names so the label Series `y` is not overwritten.
train_counts = cv.fit_transform(X_train['comment'])
val_counts = cv.transform(X_val['comment'])

# Sum of all term counts in the second training comment (row index 1).
total_count = train_counts[1].sum()
total_count

In [None]:
# Cast the 'disability' column to int in both partitions so it can be summed.
for frame in (X_train, X_val):
    frame['disability'] = frame['disability'].astype(int)

# Per-partition totals, then the combined total across train and validation.
xsum = X_train['disability'].sum()
ysum = X_val['disability'].sum()
print(xsum + ysum)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numerical_feature = [
    'upvote', 'downvote',
    'emoticon_1', 'emoticon_2', 'emoticon_3',
    'if_1', 'if_2',
]

# Despite its name, `impute` performs no imputation: it standardises the listed
# numeric columns with StandardScaler and passes every other column through.
impute = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_feature)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# fit() returns the transformer itself, so `scale` is the same fitted object as `impute`.
scale = impute.fit(X_train)

# Number of input columns seen by the fitted StandardScaler step.
fitted_scaler = scale.named_transformers_['num']
n_features = fitted_scaler.n_features_in_
print(n_features)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Start again from the raw CSV so this section does not depend on earlier cells.
df = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/train.csv')

# Missing values: the three text-valued categorical columns get an explicit
# 'none' level; missing comments become empty strings.
for col in ('race', 'religion', 'gender'):
    df[col] = df[col].fillna('none')
df['comment'] = df['comment'].fillna('')

# Split the timestamp into numeric day / month / year columns.
created = pd.to_datetime(df['created_date'])
df['created_date'] = created
df['day'] = created.dt.day
df['month'] = created.dt.month
df['year'] = created.dt.year

# Features exclude 'post_id', the target, and the raw timestamp (replaced above).
y = df['label']
X = df.drop(columns=['post_id', 'label', 'created_date'])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Flatten the targets to 1-D NumPy arrays.
y_train, y_val = np.ravel(y_train), np.ravel(y_val)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, with English stop words removed and terms
# that appear in fewer than 5 documents discarded.
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, ngram_range=(1, 2))

train_text = X_train['comment'].fillna('')
val_text = X_val['comment'].fillna('')

# Fit the vocabulary on the training text only; the validation text reuses it.
X_train_vctr = vectorizer.fit_transform(train_text)
X_val_vctr = vectorizer.transform(val_text)

# The raw text is now represented by the TF-IDF matrices, so drop the column.
# NOTE: this rebinds X_train / X_val, so re-running this cell without re-running
# the split cell raises KeyError on 'comment'.
X_train = X_train.drop(columns=['comment'])
X_val = X_val.drop(columns=['comment'])

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['race', 'religion', 'gender', 'disability']

# One-hot encode the categorical columns only. remainder='drop' is deliberate:
# the non-categorical columns stay in X_train / X_val (see the drop below) and
# are stacked separately via `X_train.values`, so passing them through here as
# well would feed every numeric feature into the model twice.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
pipeline = ColumnTransformer(
    [('enc', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)],
    remainder='drop',
    verbose_feature_names_out=False,
)
pipeline.set_output(transform='pandas')

# Fit on the training split only; the validation split reuses the fitted encoder.
X_train_cat = pipeline.fit_transform(X_train)
X_val_cat = pipeline.transform(X_val)

# The categorical columns now live in X_*_cat; keep only the remaining columns.
X_train = X_train.drop(columns=categorical_features)
X_val = X_val.drop(columns=categorical_features)

In [None]:
from scipy.sparse import hstack

# Column-wise concatenation of three feature groups per split: the remaining
# columns of X_train / X_val, the ColumnTransformer output, and the TF-IDF matrix.
train_blocks = [X_train.values, X_train_cat, X_train_vctr]
val_blocks = [X_val.values, X_val_cat, X_val_vctr]

X_train_final = hstack(train_blocks)
X_val_final = hstack(val_blocks)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default hyperparameters, trained on the stacked
# feature matrix. NOTE(review): MultinomialNB expects non-negative feature
# values — confirm the numeric columns satisfy this.
model = MultinomialNB()
model.fit(X=X_train_final, y=y_train)

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the training split (unweighted mean of per-class F1).
# The result is named for what it actually holds: average='macro', not 'micro'.
train_pred = model.predict(X_train_final)
train_macro_f1 = f1_score(y_train, train_pred, average='macro')
train_macro_f1

In [None]:
from sklearn.metrics import f1_score

# Predict on the held-out split with the fitted model.
y_val_pred = model.predict(X_val_final)

# Macro-averaged F1: every class contributes equally regardless of its frequency.
val_macro_f1 = f1_score(y_true=y_val, y_pred=y_val_pred, average='macro')
print("Validation Macro F1:", val_macro_f1)


Another approach: reload the data, impute missing values, then derive date features

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Reload the raw training data so this section starts from a clean frame.
df = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/train.csv')

# Target is 'label'; features are everything except 'post_id' and 'label'.
y = df['label']
X = df.drop(columns=['post_id', 'label'])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)





In [None]:
from sklearn.impute import SimpleImputer

# Partition the columns by dtype: float64/int64 are treated as numeric,
# object columns as categorical.
# NOTE(review): the object columns presumably include 'comment' and
# 'created_date' — confirm that most-frequent imputation is intended for them.
num_col = X_train.select_dtypes(include=['float64', 'int64']).columns
cat_col = X_train.select_dtypes(include=['object']).columns

num_imp = SimpleImputer(strategy='median')
cat_imp = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training split only and then
# applies them to both splits.
for imputer, cols in ((num_imp, num_col), (cat_imp, cat_col)):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_val[cols] = imputer.transform(X_val[cols])


In [None]:
# Replace the numeric columns with their absolute values in each split.
# Each split is transformed from its own data: assigning the training frame to
# X_val would align on the row index and overwrite the validation values
# (with NaN wherever the indices do not match).
X_train[num_col] = X_train[num_col].abs()
X_val[num_col] = X_val[num_col].abs()

In [None]:
# Apply the same date handling to both splits: parse the timestamp, derive
# numeric day / month / year columns, then drop the raw timestamp in place.
for frame in (X_train, X_val):
    frame['created_date'] = pd.to_datetime(frame['created_date'])
    parsed = frame['created_date']
    frame['day'] = parsed.dt.day
    frame['month'] = parsed.dt.month
    frame['year'] = parsed.dt.year
    frame.drop(columns=['created_date'], inplace=True)