In [1]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re
import string
import nltk

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.model_selection import train_test_split,StratifiedKFold, ParameterGrid

In [5]:
from sklearn.metrics import classification_report,ConfusionMatrixDisplay, accuracy_score,f1_score

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
from xgboost import XGBClassifier

In [53]:
from sklearn.svm import LinearSVC

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [7]:
import psycopg2

In [8]:
from scipy.sparse import hstack,csr_matrix,issparse

In [9]:
from textblob import TextBlob
import textstat

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nandita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nandita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords

In [14]:
load_dotenv()

True

In [15]:
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

In [16]:
DATABASE_URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [17]:
engine = create_engine(DATABASE_URL)

In [18]:
train_df = pd.read_sql("SELECT * FROM train_data", engine)

In [19]:
def clean_texts(col): 
    col = col.lower() 
    col = re.sub(r"\d+", " num ", col)  
    col = re.sub(r"[^\w\s!?]", "", col)  
    col = " ".join([word for word in col.split() if word not in stopwords.words('english')])  
    return col

In [20]:
def convert_to_tfidf_vectorizer(df, text_col, max_features=5000, ngram_range=(1,2)):
    df = df.copy()
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    X_tfidf = vectorizer.fit_transform(df[text_col])
    
    return X_tfidf, vectorizer

In [21]:
def get_train_test(X, y, test_size=0.2, random_state=101):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y   
    )
    return X_train, X_test, y_train, y_test

In [22]:
def combine_features(X_tfidf, engineered_features):
    if not issparse(engineered_features):
        engineered_features = engineered_features.values
    
    return hstack([X_tfidf, engineered_features])

In [23]:
def get_all_feature_names(tfidf_vectorizer, engineered_features):
    tfidf_features = tfidf_vectorizer.get_feature_names_out()
    engineered_features_cols = engineered_features.columns
    return np.concatenate([tfidf_features, engineered_features_cols])

In [24]:
def textblob_features(text):
    blob = TextBlob(text)
    return pd.Series([blob.sentiment.polarity,blob.sentiment.subjectivity])

In [25]:
def get_readability(text):
    flesch = textstat.flesch_reading_ease(text)
    gunning = textstat.gunning_fog(text)
    return pd.Series([flesch,gunning])

In [26]:
def stratified_kfold_eval(X, y, model, n_splits=5):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies, f1_scores = [], []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)

        accuracies.append(accuracy_score(y_val_fold, y_pred))
        f1_scores.append(f1_score(y_val_fold, y_pred, average="macro"))

    results = {
        "mean_accuracy": np.mean(accuracies),
        "std_accuracy": np.std(accuracies),
        "mean_f1": np.mean(f1_scores),
        "std_f1": np.std(f1_scores)
    }

    return pd.DataFrame([results])


In [27]:
df1 = train_df.copy()

In [28]:
df1.columns

Index(['id', 'subject', 'body', 'text', 'category', 'category_id'], dtype='object')

In [29]:
df1['combined_text_original'] = df1['subject'] + " " + df1['body']+ " " +df1['text']

In [30]:
df1['combined_text_normalized'] = df1['combined_text_original'].astype(str).apply(clean_texts)

In [31]:
df1.head()

Unnamed: 0,id,subject,body,text,category,category_id,combined_text_original,combined_text_normalized
0,promotions_582,Anniversary Special: Buy one get one free,"As our loyal customer, get exclusive $60 off $...",Anniversary Special: Buy one get one free As o...,promotions,1,Anniversary Special: Buy one get one free As o...,anniversary special buy one get one free loyal...
1,spam_1629,Your Amazon was used on new device,Your $5000 refund is processed. Claim: bit.ly/...,Your Amazon was used on new device Your $5000 ...,spam,3,Your Amazon was used on new device Your $5000 ...,amazon used new device num refund processed cl...
2,spam_322,Re: Your Google inquiry,"Hi, following up about your Google application...","Re: Your Google inquiry Hi, following up about...",spam,3,"Re: Your Google inquiry Hi, following up about...",google inquiry hi following google application...
3,social_media_80,Digital Ritual Experience Creation,Cross-cultural ceremony design. Join: virtualr...,Digital Ritual Experience Creation Cross-cultu...,social_media,2,Digital Ritual Experience Creation Cross-cultu...,digital ritual experience creation crosscultur...
4,forum_1351,"Your post was moved to ""Programming Help""","Trending: ""cooking"" (258 comments). View: supp...","Your post was moved to ""Programming Help"" Tren...",forum,0,"Your post was moved to ""Programming Help"" Tren...",post moved programming help trending cooking n...


In [32]:
df1 = df1[['category_id','combined_text_original','combined_text_normalized']]

In [33]:
df1[['polarity','subjectivity']] = df1['combined_text_original'].apply(textblob_features)

In [34]:
df1[['flesch', 'gunning_fog']] = df1['combined_text_original'].apply(get_readability)

In [35]:
df1.head()

Unnamed: 0,category_id,combined_text_original,combined_text_normalized,polarity,subjectivity,flesch,gunning_fog
0,1,Anniversary Special: Buy one get one free As o...,anniversary special buy one get one free loyal...,0.363492,0.734921,44.405,10.0
1,3,Your Amazon was used on new device Your $5000 ...,amazon used new device num refund processed cl...,0.118182,0.427273,68.819265,8.105882
2,3,"Re: Your Google inquiry Hi, following up about...",google inquiry hi following google application...,0.0,0.1,74.3525,7.654545
3,2,Digital Ritual Experience Creation Cross-cultu...,digital ritual experience creation crosscultur...,0.0,0.0,-91.002045,31.290909
4,0,"Your post was moved to ""Programming Help"" Tren...",post moved programming help trending cooking n...,0.0,0.0,50.470769,11.353846


In [36]:
X = df1.drop('category_id',axis=1)

In [37]:
y = df1['category_id']

In [38]:
X_train, X_test, y_train, y_test = get_train_test(X, y, test_size=0.2, random_state=101)

In [39]:
X_tfidf_train,vector = convert_to_tfidf_vectorizer(X_train,'combined_text_normalized',max_features=10000, ngram_range=(2,3))

In [40]:
X_tfidf_test = vector.transform(X_test['combined_text_normalized'])

In [41]:
X_train.drop(['combined_text_original','combined_text_normalized'],axis=1,inplace=True)

In [42]:
X_test.drop(['combined_text_original','combined_text_normalized'],axis=1,inplace=True)

In [43]:
X_input_train = combine_features(X_tfidf_train,X_train)

In [44]:
X_input_test = combine_features(X_tfidf_test,X_test)

In [45]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [46]:
results = stratified_kfold_eval(X_tfidf_train,y_train,rf, n_splits=5)

In [47]:
results

Unnamed: 0,mean_accuracy,std_accuracy,mean_f1,std_f1
0,0.96417,0.001528,0.964546,0.001471


In [48]:
xgb = XGBClassifier(
    use_label_encoder=False,  # avoids warning
    eval_metric="logloss",    # needed for classification
    n_jobs=-1,                # use all CPU cores
    random_state=42
)


In [51]:
results_1 = stratified_kfold_eval(X_tfidf_train,y_train,xgb,n_splits=5)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [52]:
results_1

Unnamed: 0,mean_accuracy,std_accuracy,mean_f1,std_f1
0,0.94956,0.002667,0.950556,0.002625


In [54]:
svc = LinearSVC(random_state=42)

In [55]:
results_2 = stratified_kfold_eval(X_tfidf_train,y_train,svc,n_splits=5)

In [56]:
results_2

Unnamed: 0,mean_accuracy,std_accuracy,mean_f1,std_f1
0,0.970083,0.001745,0.970203,0.001719


In [58]:
mnb = MultinomialNB()

In [59]:
result_3 = stratified_kfold_eval(X_tfidf_train,y_train,mnb,n_splits=5)

In [60]:
result_3

Unnamed: 0,mean_accuracy,std_accuracy,mean_f1,std_f1
0,0.971591,0.001272,0.971655,0.001249
