In [None]:
!python --version

Python 3.11.12


In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import string
import re
from ast import literal_eval
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ft_model = fasttext.load_model("cc.en.300.bin")


In [None]:
def to_lower(text):
    """Converts text to lowercase."""
    return text.lower()

In [None]:
exclude = string.punctuation
def removePunctuation(text):
    return text.translate(str.maketrans('','',exclude))

In [None]:


import urllib.request
url = 'https://github.com/shad-datascience/ML_Projects/blob/main/stop_hinglish.txt'
file_Path = 'hinglish_text'
urllib.request.urlretrieve(url, file_Path)


# Download NLTK stop words (if not already downloaded)
nltk.download("stopwords")

# Load Hinglish stop words from file
def load_stop_words(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        stop_words = set(word.strip().lower() for word in file.readlines())  # Normalize words
    return stop_words

# Load Hinglish stop words
# hinglish_stop_words = load_stop_words("/content/drive/MyDrive/Dataset/stop_hinglish.txt")

hinglish_stop_words = load_stop_words("hinglish_text")

# Load English stop words from NLTK
english_stop_words = set(stopwords.words("english"))

# Combine both stop words lists
all_stop_words = hinglish_stop_words.union(english_stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Function to remove stop words (Hinglish + English)
def remove_stop_words(text):
    if isinstance(text, str):  # Ensure input is a string
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in all_stop_words]
        return " ".join(filtered_words)
    return text  # Return original if not a string (handles NaN values)


In [None]:
slang_dict = {
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "afaik": "as far as I know",
    "bcoz": "because",
    "frnd": "friend",
    "yaar": "friend",
    "mast": "awesome",
    "jhakaas": "superb",
    "sahi": "great",
    "bhai": "brother",
    "bro": "brother",
    "dost": "friend",
    "paka": "sure",
    "nai": "no",
    "koi nahi": "no one",
    "jldi": "jaldi",
    "aalsi": "lazy",
    "pakka": "sure",
    "biryani": "amazing",
    "scene hai": "there is a situation",
    "tight": "intoxicated",
    "lag gaye": "we are in trouble",
    "fix hai": "it is certain",
    "chill maar": "relax",
    "rapchik": "cool",
    "fadu": "amazing",
    "senti": "emotional",
    "jhakkas": "amazing",
    "kadak": "strong",
    "bindaas": "carefree",
    "haanikarak": "dangerous",
    "kaand": "big trouble",
    "faltu": "useless",
    "bhasad": "mess",
    "mamu": "dude",
    "tera kya scene hai?": "what's your plan?",
    "lafda": "problem",
    "locha": "issue",
    "jumla": "false promise",
    "khopdi tod": "mind-blowing",
    "chep": "clingy person",
    "lukkha": "useless guy",
    "matlab": "meaning",
    "chalu": "smart",
    "bawaal": "chaotic",
    "att": "attitude",
    "op": "overpowered",
    "hatt": "move away",
    "sahi hai": "it's good",
    "lit": "amazing",
    "supari": "contract killing",
    "ragra": "beaten badly",
    "maal": "attractive person",
    "item": "hot girl",
    "pataka": "attractive girl",
    "set hai": "everything is fine",
    "chindi": "cheap",
    "beedu": "close friend",
    "kat gaya": "got tricked",
    "tatti": "bad",
    "bakwaas": "nonsense",
    "scene on hai": "things are happening",
    "scene off hai": "not happening",
    "fix hai": "certain",
    "trip maar": "enjoy",
    "chhapri": "wannabe",
    "bhaiya": "elder brother",
}


In [None]:
def expand_slang(text):
    words = text.split()
    expanded_words = [slang_dict.get(word.lower(), word) for word in words]  # Replace slang
    return " ".join(expanded_words)

In [None]:
def preprocessing(text):
    lower = to_lower(text)
    rem_punct = removePunctuation(lower)
    rem_stop = remove_stop_words(rem_punct)
    text = expand_slang(rem_stop)
    return text

In [None]:
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlmr_model = AutoModel.from_pretrained("xlm-roberta-base")

def get_xlmr_embedding(text):
    tokens = xlmr_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = xlmr_model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()


In [None]:
#sentiment pipeline
sentiment_analyzer = VS()

def sentiments(tweet):
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    features = [sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound']]
    #features = pandas.DataFrame(features)
    return features


In [None]:
def expand_ndarray_series(series):
    """Expands a pandas Series containing ndarray values into a DataFrame with separate columns."""
    array_data = np.vstack(series.values)
    expanded_columns = [f"feature_{i}" for i in range(array_data.shape[1])]
    return pd.DataFrame(array_data, columns=expanded_columns)

In [None]:
import re
import nltk
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class PreprocessingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.Series(X)
        return X.apply(preprocessing)

class XLMREmbeddingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(get_xlmr_embedding)

class ExpandNDArrayTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return expand_ndarray_series(X)

pipeline1 = Pipeline([
    ('preprocessing', PreprocessingTransformer()),
    ('embedding', XLMREmbeddingTransformer()),
    ('expand', ExpandNDArrayTransformer())
])

In [None]:
import re
import nltk
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class PreprocessingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.Series(X)
        return X.apply(preprocessing)

class SentimentTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(sentiments)

class ExpandNDArrayTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return expand_ndarray_series(X)

pipeline2 = Pipeline([
    ('preprocessing', PreprocessingTransformer()),
    ('sentiments',SentimentTransformer()),
    ('expand', ExpandNDArrayTransformer())
])


In [None]:
train = pd.read_csv("/content/drive/MyDrive/Hackathon/train.csv")

y_train = train['Label']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(['YES','NO'])
y_train = le.transform(y_train)
y_train


array([0, 1, 1, ..., 1, 1, 1])

In [None]:
valid = pd.read_csv("/content/drive/MyDrive/Hackathon/test.csv")

y_valid = valid['Label']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(['YES','NO'])
y_valid = le.transform(y_valid)
y_valid



array([1, 1, 1, ..., 1, 1, 0])

In [None]:
from sklearn.pipeline import Pipeline,FeatureUnion

# Concatenation of both pipelines
class DataFrameConcatenator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.concat(X, axis=1)

# Defining the feature processing pipeline
feature_pipeline = FeatureUnion([
    ("pipeline1", pipeline1),
    ("pipeline2", pipeline2)
])

# from sklearn.neural_network import MLPClassifier

# mlp_sklearn = MLPClassifier(
#     hidden_layer_sizes=(512, 256, 128, 64),  # Equivalent to your Keras hidden layers
#     activation='relu',   # ReLU activation in all hidden layers
#     solver='adam',       # Adam optimizer
#     alpha=0.0001,        # L2 regularization (to help generalization, as dropout is missing)
#     learning_rate='adaptive',  # Adjusts learning rate dynamically
#     max_iter=500,        # Number of iterations (epochs)
#     random_state=42
# )


In [None]:
import cloudpickle
# Save the model
with open("/content/drive/MyDrive/Hackathon/Hackarena/feature_pipeline.pkl", "wb") as f:
    cloudpickle.dump(feature_pipeline, f)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train = feature_pipeline.transform(train['Tweet'])
X_valid = feature_pipeline.transform(valid['Tweet'])

# Define the MLP model
def create_mlp():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer
        tf.keras.layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(256, activation='relu'),  # Second hidden layer
        tf.keras.layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(128, activation='relu'),  # Third hidden layer
        tf.keras.layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(64, activation='relu'),  # fourth hidden layer
        tf.keras.layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(1, activation='sigmoid')  # Output layer (binary classification)
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Create model
mlp_model = create_mlp()

# Define Early Stopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=5,  # Stop training if no improvement for 5 epochs
    restore_best_weights=True,  # Restore the best weights when stopping
    verbose=1
)


# Train the model with Early Stopping
mlp_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],  # Apply Early Stopping
    verbose=1
)

# Evaluate the model
test_loss, test_acc = mlp_model.evaluate(X_valid, y_valid, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 42ms/step - accuracy: 0.8752 - loss: 0.3451 - val_accuracy: 0.9625 - val_loss: 0.2008
Epoch 2/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9588 - loss: 0.1188 - val_accuracy: 0.9559 - val_loss: 0.0996
Epoch 3/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9628 - loss: 0.0966 - val_accuracy: 0.9782 - val_loss: 0.0623
Epoch 4/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9686 - loss: 0.0773 - val_accuracy: 0.9597 - val_loss: 0.1319
Epoch 5/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9692 - loss: 0.0762 - val_accuracy: 0.9777 - val_loss: 0.0506
Epoch 6/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9697 - loss: 0.0785 - val_accuracy: 0.9787 - val_loss: 0.0523
Epoch 7/100
[1m154/

In [None]:
mlp_model.save('/content/drive/MyDrive/Hackathon/Hackarena/mlp_model.keras')

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import pandas as pd
import numpy as np
import string
import re
from ast import literal_eval
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
import re
import nltk
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
from tensorflow import keras
import torch

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
!pip install VaderSentiment

Collecting VaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: VaderSentiment
Successfully installed VaderSentiment-3.3.2


In [6]:
import joblib
from tensorflow import keras

model = keras.models.load_model("/content/drive/MyDrive/Hackathon/Hackarena/mlp_model.keras")
pipeline = joblib.load("/content/drive/MyDrive/Hackathon/Hackarena/feature_pipeline.pkl")

text_input= "beta tum to bade heavy driver ho!!"
processed_input = pipeline.transform(text_input)

model.predict(processed_input)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 604ms/step


array([[0.9944623]], dtype=float32)

'1.4.2'