Installations

In [1]:
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install emoji
!pip install tensorflow



Imports

In [2]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

Preprocessing

In [3]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize NLTK components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Clean and preprocess text functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.strip()  # Remove leading/trailing whitespace
    return text

def correct_text(text):
    corrections = {
        "aren't": "are not", "can't": "cannot", "couldn't": "could not",
        "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'll": "he will", "he's": "he is",
        "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have",
        "isn't": "is not", "it's": "it is", "let's": "let us",
        "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
        "she'd": "she would", "she'll": "she will", "she's": "she is",
        "shouldn't": "should not", "that's": "that is", "there's": "there is",
        "they'd": "they would", "they'll": "they will", "they're": "they are",
        "they've": "they have", "we'd": "we would", "we're": "we are",
        "weren't": "were not", "we've": "we have", "what'll": "what will",
        "what're": "what are", "what's": "what is", "what've": "what have",
        "where's": "where is", "who'd": "who would", "who'll": "who will",
        "who're": "who are", "who's": "who is", "who've": "who have",
        "won't": "will not", "wouldn't": "would not", "you'd": "you would",
        "you'll": "you will", "you're": "you are", "you've": "you have"
    }
    for word, replacement in corrections.items():
        text = text.replace(word, replacement)
    return text

def preprocess_text(text):
    if pd.isnull(text):
        return ''
    text = clean_text(text)
    text = correct_text(text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

def handle_emojis(text):
    return emoji.demojize(text)

def preprocess_dataframe(df, text_column):
    if df[text_column].isnull().sum() > 0:
        print(f"Found {df[text_column].isnull().sum()} NaN values in the {text_column} column. Replacing with empty strings.")
        df[text_column] = df[text_column].fillna('')

    df[text_column] = df[text_column].apply(handle_emojis)
    df[text_column] = df[text_column].apply(preprocess_text)
    return df

def preprocess_datasets(train_path, test_path):
    train_df = pd.read_csv('/content/train.csv')
    test_df = pd.read_csv('/content/test.csv')

    train_df = preprocess_dataframe(train_df, 'text')
    test_df = preprocess_dataframe(test_df, 'text')

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_df['text']).toarray()
    X_test = vectorizer.transform(test_df['text']).toarray()

    train_df.to_csv('preprocessed_train.csv', index=False)    #Change index to True if u don't want the file to be downloaded
    test_df.to_csv('preprocessed_test.csv', index=False)    #Change index to True if u don't want the file to be downloaded

    return X_train, X_test, train_df, test_df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preparing for the models

In [4]:
train_path = '/content/train.csv'  # Update with actual path to your train.csv
test_path = '/content/test.csv'    # Update with actual path to your test.csv

X_train, X_test, train_df, test_df = preprocess_datasets(train_path, test_path)

print(f"The shape of the TF-IDF matrix for training data is: {X_train.shape}")
print(f"The shape of the TF-IDF matrix for test data is: {X_test.shape}")

# Train-test split for evaluation
X_train_eval, X_val_eval, y_train_eval, y_val_eval = train_test_split(X_train, train_df['target'], test_size=0.2, random_state=42)

The shape of the TF-IDF matrix for training data is: (7613, 5000)
The shape of the TF-IDF matrix for test data is: (3263, 5000)


Model 1: Logistic Regression

In [5]:
# Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_eval, y_train_eval)
y_pred_log_reg = log_reg.predict(X_val_eval)
print("Logistic Regression Performance:")
print(classification_report(y_val_eval, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_val_eval, y_pred_log_reg))

# Make predictions on the test data
test_predictions_log_reg = log_reg.predict(X_test)

# Prepare submission file
submission_df_log_reg = pd.DataFrame({'id': test_df['id'], 'target': test_predictions_log_reg})
submission_df_log_reg.to_csv('Submission_LogReg.csv', index=False)    #Change index to True if u don't want the file to be downloaded


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.83      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Accuracy: 0.8010505581089954


Model 2: Random Forest

In [6]:
# Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_eval, y_train_eval)
y_pred_rf = rf.predict(X_val_eval)
print("Random Forest Performance:")
print(classification_report(y_val_eval, y_pred_rf))
print("Accuracy:", accuracy_score(y_val_eval, y_pred_rf))

# Prepare test data and make predictions
test_predictions_rf = rf.predict(X_test)

# Prepare submission file
submission_df_rf = pd.DataFrame({'id': test_df['id'], 'target': test_predictions_rf})
submission_df_rf.to_csv('Submission_RF.csv', index=False)     #Change index to True if u don't want the file to be downloaded


Random Forest Performance:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       874
           1       0.74      0.69      0.71       649

    accuracy                           0.77      1523
   macro avg       0.76      0.76      0.76      1523
weighted avg       0.77      0.77      0.76      1523

Accuracy: 0.7662508207485227


Model 3: LSTM

In [7]:
# LSTM model
# Tokenizer and sequence padding for LSTM
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['text'])
X_seq = tokenizer.texts_to_sequences(train_df['text'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Train-test split for LSTM
X_train_pad, X_val_pad, y_train_pad, y_val_pad = train_test_split(X_pad, train_df['target'], test_size=0.2, random_state=42)

# Define LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=100, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

# Compile and train LSTM model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train_pad, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val_pad), verbose=2)

# Evaluate LSTM model
y_pred_lstm = (lstm_model.predict(X_val_pad) > 0.5).astype("int32")
print("LSTM Model Performance:")
print(classification_report(y_val_pad, y_pred_lstm))
print("Accuracy:", accuracy_score(y_val_pad, y_pred_lstm))

# Prepare test data and make predictions
test_seq = tokenizer.texts_to_sequences(test_df['text'])
test_pad = pad_sequences(test_seq, maxlen=100)
test_predictions = (lstm_model.predict(test_pad) > 0.5).astype("int32")

# Prepare submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'target': test_predictions.ravel()})
submission_df.to_csv('Submission_LSTM.csv', index=False)      #Change index to True if u don't want the file to be downloaded

Epoch 1/5
96/96 - 39s - loss: 0.5923 - accuracy: 0.6808 - val_loss: 0.4586 - val_accuracy: 0.7991 - 39s/epoch - 401ms/step
Epoch 2/5
96/96 - 30s - loss: 0.3787 - accuracy: 0.8376 - val_loss: 0.4688 - val_accuracy: 0.7879 - 30s/epoch - 316ms/step
Epoch 3/5
96/96 - 33s - loss: 0.3008 - accuracy: 0.8762 - val_loss: 0.5136 - val_accuracy: 0.7682 - 33s/epoch - 345ms/step
Epoch 4/5
96/96 - 32s - loss: 0.2483 - accuracy: 0.9021 - val_loss: 0.6009 - val_accuracy: 0.7656 - 32s/epoch - 338ms/step
Epoch 5/5
96/96 - 45s - loss: 0.2104 - accuracy: 0.9181 - val_loss: 0.6126 - val_accuracy: 0.7597 - 45s/epoch - 473ms/step
LSTM Model Performance:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       874
           1       0.72      0.70      0.71       649

    accuracy                           0.76      1523
   macro avg       0.75      0.75      0.75      1523
weighted avg       0.76      0.76      0.76      1523

Accuracy: 0.7596848325673013
