In [1]:
import pandas as pd
import numpy as np

In [2]:
import re
def pre_process(text):
  url_pattern="https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F\][0-9a-fA-F]))+"
  non_ascii_pattern="[^\x00-\x7F]+"
  url_matches=re.findall(url_pattern,text)
  non_ascii_matches=re.findall(non_ascii_pattern,text)
  text=text.lower()
  text=text.replace("\n"," ")
  text=text.strip()
  if url_matches:
    for url in url_matches:
      text=text.replace(url,"")
  if non_ascii_matches:
    for non_ascii in non_ascii_matches:
      text=text.replace(non_ascii,"")
  text=text.strip()
  if len(text.split())<=2:
    return np.nan
  return text

  url_pattern="https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F\][0-9a-fA-F]))+"


In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
lemm=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/keertan.patro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keertan.patro/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
df=pd.read_csv("https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv")
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [7]:
stopwords=[word for word in stopwords.words('english') if word not in ('not','but','however','no','yet')]

In [8]:
def remove_stopwords(text):
  new_text=""
  for word in text.split():
    if word not in stopwords:
      word=lemm.lemmatize(word)
      new_text+=word+" "
  return new_text.strip()

In [9]:
df['new_clean_comment']=df['clean_comment'].apply(pre_process)
df.dropna(inplace=True)
df['processed_comment']=df['new_clean_comment'].apply(remove_stopwords)

In [10]:
def get_max_length(df):
  max_length=-float('inf')
  for text in df['processed_comment']:
    text_len=len(text.split())
    if text_len>max_length:
      max_length=text_len
  return max_length
max_length=get_max_length(df)

In [11]:
max_length

893

In [12]:
df['new_category']=df['category'].apply(lambda x: x if x in (1,0) else 2 )

In [13]:
df['new_category'].value_counts()

Unnamed: 0_level_0,count
new_category,Unnamed: 1_level_1
1,15446
0,11005
2,8066


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import dataset,DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
tokenizer=Tokenizer(num_words=2000,lower=True)
tokenizer.fit_on_texts(df['processed_comment'])

In [16]:
vocab=tokenizer.word_index
len(vocab)
max_len=600
X=tokenizer.texts_to_sequences(df['processed_comment'])
X=pad_sequences(X,maxlen=max_len)
y=np.array(df['new_category'])

In [17]:
device="cuda" if torch.cuda.is_available else "cpu"

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True,stratify=y)
X_train=torch.tensor(X_train,dtype=torch.long).to(device)
X_test=torch.tensor(X_test,dtype=torch.long).to(device)
y_train=torch.tensor(y_train,dtype=torch.long).to(device)
y_test=torch.tensor(y_test,dtype=torch.long).to(device)
train_dataset=TensorDataset(X_train,y_train)
test_dataset=TensorDataset(X_test,y_test)

In [19]:
batch_size=4
output_dim=3
train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=True)

In [20]:
class Lstm_model(nn.Module):
  def __init__(self,vocab_size,embedding_dim,hidden_dim):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.lstm=nn.LSTM(embedding_dim,hidden_dim,batch_first=True)
    self.linear=nn.Linear(hidden_dim,output_dim)

  def forward(self,x):
    x=self.embedding(x)
    output,hidden=self.lstm(x)
    x=hidden[-1]
    y=self.linear(x)
    return y

In [21]:
print(torch.version.cuda)

12.6


In [22]:
model=Lstm_model(len(vocab),128,64).to(device)

In [23]:
model

Lstm_model(
  (embedding): Embedding(47894, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=3, bias=True)
)

In [24]:
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=0.0001)

In [25]:
from tqdm import tqdm

In [29]:
model.train()
for epoch in range(25):
  total_loss=0
  for x,y in tqdm(train_dataloader):
    y_pred=model(x)
    y_pred=y_pred.squeeze()
    batch_size=y.shape[0]
    y_pred=y_pred.view(batch_size,output_dim)
    loss=criterion(y_pred,y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss+=loss.item()
  print(f"Loss for epoch-{epoch} is", total_loss)

100%|██████████| 6904/6904 [00:29<00:00, 230.74it/s]


Loss for epoch-0 is 6083.170913055539


100%|██████████| 6904/6904 [00:29<00:00, 235.48it/s]


Loss for epoch-1 is 4942.615423664451


100%|██████████| 6904/6904 [00:29<00:00, 235.18it/s]


Loss for epoch-2 is 4267.136095248163


100%|██████████| 6904/6904 [00:29<00:00, 237.66it/s]


Loss for epoch-3 is 3753.1732354890555


100%|██████████| 6904/6904 [00:29<00:00, 234.80it/s]


Loss for epoch-4 is 3341.2905222661793


100%|██████████| 6904/6904 [00:29<00:00, 236.60it/s]


Loss for epoch-5 is 3027.709769929177


100%|██████████| 6904/6904 [00:29<00:00, 234.65it/s]


Loss for epoch-6 is 2748.206438484136


100%|██████████| 6904/6904 [00:29<00:00, 237.05it/s]


Loss for epoch-7 is 2525.3235207990947


100%|██████████| 6904/6904 [00:29<00:00, 235.26it/s]


Loss for epoch-8 is 2322.5999262540718


100%|██████████| 6904/6904 [00:29<00:00, 235.25it/s]


Loss for epoch-9 is 2122.8548651530728


100%|██████████| 6904/6904 [00:29<00:00, 237.32it/s]


Loss for epoch-10 is 1960.972931058961


100%|██████████| 6904/6904 [00:29<00:00, 235.02it/s]


Loss for epoch-11 is 1793.53442625577


100%|██████████| 6904/6904 [00:29<00:00, 237.10it/s]


Loss for epoch-12 is 1640.833538791434


100%|██████████| 6904/6904 [00:29<00:00, 234.74it/s]


Loss for epoch-13 is 1510.4112255680502


100%|██████████| 6904/6904 [00:29<00:00, 237.01it/s]


Loss for epoch-14 is 1378.6962181113058


100%|██████████| 6904/6904 [00:29<00:00, 234.41it/s]


Loss for epoch-15 is 1252.1832484414044


100%|██████████| 6904/6904 [00:29<00:00, 236.60it/s]


Loss for epoch-16 is 1141.0222020679387


100%|██████████| 6904/6904 [00:29<00:00, 237.47it/s]


Loss for epoch-17 is 1059.8301172467282


100%|██████████| 6904/6904 [00:29<00:00, 234.78it/s]


Loss for epoch-18 is 969.9372412134246


100%|██████████| 6904/6904 [00:29<00:00, 237.04it/s]


Loss for epoch-19 is 904.4752987639486


100%|██████████| 6904/6904 [00:29<00:00, 233.90it/s]


Loss for epoch-20 is 866.3140132331654


100%|██████████| 6904/6904 [00:29<00:00, 234.99it/s]


Loss for epoch-21 is 755.9516757172339


100%|██████████| 6904/6904 [00:29<00:00, 234.52it/s]


Loss for epoch-22 is 716.3786943701031


100%|██████████| 6904/6904 [00:29<00:00, 236.98it/s]


Loss for epoch-23 is 662.1217200150322


100%|██████████| 6904/6904 [00:29<00:00, 236.53it/s]

Loss for epoch-24 is 603.8883051154452





In [30]:
def evaluate_model(model,test_dataloader):
  model.eval()
  y_pred_all=[]
  y_true_all=[]
  for x,y in test_dataloader:
    y_pred=model(x)
    y_pred=torch.max(y_pred,-1).indices.view(y.shape[0])
    if y_pred.device.type=="cuda":
      y_pred=y_pred.to("cpu").tolist()
      y=y.to("cpu").tolist()
      y_pred_all.extend(y_pred)
      y_true_all.extend(y)
    elif y_pred.device.type=="cpu":
      y_pred=y_pred.tolist()[0]
      y_pred_all.extend(y_pred)
      y=y.tolist()
      y_true_all.extend(y)
  report=classification_report(y_true_all,y_pred_all,output_dict=True)
  return report,y_true_all,y_pred_all

In [25]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.40.13-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.41.0,>=1.40.13 (from boto3)
  Downloading botocore-1.40.13-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.40.13-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.1/140.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.40.13-py3-none-any.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m126.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m8.7 MB/s[0m eta [36m0

In [26]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.3.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.0 (from mlflow)
  Downloading mlflow_skinny-3.3.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.0 (from mlflow)
  Downloading mlflow_tracing-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.3.0->mlflow)
  Downloading databricks_sdk-0.64.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.3.0->mlflow)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl

In [27]:
! pip install awscli

Collecting awscli
  Downloading awscli-1.42.13-py3-none-any.whl.metadata (11 kB)
Collecting docutils<=0.19,>=0.18.1 (from awscli)
  Downloading docutils-0.19-py3-none-any.whl.metadata (2.7 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli)
  Downloading rsa-4.7.2-py3-none-any.whl.metadata (3.6 kB)
Downloading awscli-1.42.13-py3-none-any.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading docutils-0.19-py3-none-any.whl (570 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rsa-4.7.2-py3-none-any.whl (34 kB)
Installing collected packages: rsa, docutils, colorama, awscli
  Attempting uninstall: rsa
    Found existing installation: rsa 4.9.1


In [31]:
from sklearn.metrics import accuracy_score,recall_score,classification_report
import mlflow
import boto3
mlflow.set_tracking_uri("http://ec2-54-211-106-118.compute-1.amazonaws.com:5000/")

In [32]:
def store_best_model_lstm(model,test_dataloader):
  with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "LSTM")
    mlflow.set_tag("experiment_type", "algorithm_comparison")
    class_report,y_true_all,y_pred_all=evaluate_model(model,test_dataloader)
    print("class report:",class_report)
    for metric in class_report:
      mlflow.log_param("Model","LSTM")
      for metric in class_report:
        if type(class_report[metric])==dict:
          for key in class_report[metric]:
            if key!='support':
              mlflow.log_metric(f"{metric}_{key}",class_report[metric][key])
        else:
          mlflow.log_metric(metric,class_report[metric])
    mlflow.pytorch.log_model(model, "Lstm model")

In [33]:
store_best_model_lstm(model,test_dataloader)

class report: {'0': {'precision': 0.8048128342245989, 'recall': 0.8205361199454794, 'f1-score': 0.8125984251968504, 'support': 2201.0}, '1': {'precision': 0.7687981053878035, 'recall': 0.8404530744336569, 'f1-score': 0.803030303030303, 'support': 3090.0}, '2': {'precision': 0.7238689547581904, 'recall': 0.5753254804711717, 'f1-score': 0.6411053540587219, 'support': 1613.0}, 'accuracy': 0.7721610660486674, 'macro avg': {'precision': 0.7658266314568642, 'recall': 0.7454382249501026, 'f1-score': 0.7522446940952917, 'support': 6904.0}, 'weighted avg': {'precision': 0.7697827082563174, 'recall': 0.7721610660486674, 'f1-score': 0.7682496677750034, 'support': 6904.0}}




🏃 View run LSTM at: http://ec2-54-211-106-118.compute-1.amazonaws.com:5000/#/experiments/0/runs/17165c9c2fa74fe09e99c03a53d415fe
🧪 View experiment at: http://ec2-54-211-106-118.compute-1.amazonaws.com:5000/#/experiments/0
