Step 1:- Mount Google Drive to access files stored in your Drive (e.g., datasets)

In [1]:
from google.colab import drive

# Colab path under which the Drive contents are mounted.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Step 2:- Import nltk library and download 'wordnet' corpus needed for lemmatization

In [41]:
import nltk

# Corpus required by NLTK's WordNetLemmatizer, which app.py uses.
WORDNET_PACKAGE = 'wordnet'
nltk.download(WORDNET_PACKAGE)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Step 3:- Install required Python packages using pip

In [42]:
!pip install streamlit
!pip install pyngrok
!pip install streamlit pandas scikit-learn nltk openpyxl
!pip install fuzzywuzzy python-Levenshtein



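Step 4:- Fetch and print this runtime's public IPv4 address (a quick network check; localtunnel's landing page may also ask for this IP as the tunnel password when you open the public URL from Step 6)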

In [44]:
# Print this runtime's public IPv4 address: -q silences wget's progress output
# and "-O -" writes the response body to stdout instead of a file.
!wget -q -O - ipv4.icanhazip.com

34.148.57.207


Step 5:- Write the Streamlit application to app.py (the cell only creates the file; the app is launched in Step 6)

In [43]:
%%writefile app.py
import re
import streamlit as st
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

# Streamlit re-executes this script on every user interaction, so download the
# WordNet corpus only when it is not already available locally instead of
# contacting the NLTK server (and logging) on each rerun.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Load dataset
@st.cache_data
def load_dataset(filepath):
    """Read the Excel workbook at ``filepath`` and return it as a DataFrame.

    The result is cached by Streamlit (``st.cache_data``), keyed on ``filepath``.
    """
    return pd.read_excel(filepath)

# Preprocess dataset
def preprocess_data(df):
    """Extract the model inputs and labels from the dataset.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the 'Slang/Idiom' column and ``y`` is
        the 'Category' column, both cast to ``str``.
    """
    phrases, categories = (
        df[column].astype(str) for column in ('Slang/Idiom', 'Category')
    )
    return phrases, categories

# Train classification model
# A fitted estimator is a shared resource, not plain data: st.cache_resource
# hands back the same objects on every rerun instead of pickling the model and
# unpickling a fresh copy each time, as st.cache_data does.
@st.cache_resource(show_spinner=False)
def train_model(X, y):
    """Fit a TF-IDF + random-forest classifier on the slang/idiom phrases.

    Args:
        X: Phrases to learn from.
        y: Category label for each phrase.

    Returns:
        tuple: ``(model, vectorizer, X_test_tfidf, y_test)`` - the fitted
        classifier, the fitted TF-IDF vectorizer, and the vectorized held-out
        20% split with its labels (used by ``evaluate_model``).

    Note:
        The returned objects are shared between reruns; treat them as
        read-only.
    """
    # Fixed random_state keeps the split (and the reported metrics) reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
    # Fit the vocabulary on the training split only, then reuse it for the test split.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    model = RandomForestClassifier(random_state=42, class_weight="balanced")
    model.fit(X_train_tfidf, y_train)
    return model, vectorizer, X_test_tfidf, y_test

# General Marathi stopwords/filler words to ignore
# Consulted by is_non_slang_phrase(): a phrase made up solely of these words is
# not sent to the classifier.
stopwords = {
    "मग", "अरे", "हो",
    "का", "आहे", "नाही",
    "मी", "तू", "तो", "ती",
    "आणि", "पण", "तर",
    "ही", "तोच", "किंवा",
    "कुठे", "कधी", "कशाला",
    "कसा", "कशी",
}

# Function to ignore non-slang/idiom phrases
def is_non_slang_phrase(phrase):
    """Return True when ``phrase`` should be skipped by the detector.

    A phrase is skipped when any of the following holds:

    * it has two or fewer whitespace-separated words;
    * every one of its words is in the module-level ``stopwords`` set;
    * its first word is one of the question words or interjections below.
    """
    tokens = phrase.split()
    if len(tokens) <= 2:
        return True

    if not any(token not in stopwords for token in tokens):
        return True

    question_words = {"का", "कधी", "कुठे", "कशाला", "कसा", "कशी"}
    interjections = {"अरे", "हे", "ओ", "अहो"}
    return tokens[0] in (question_words | interjections)

# Classify slangs and idioms
def classify_slangs_and_idioms(text, model, vectorizer, slang_df, confidence_threshold=0.6,
                               manual_slang_idioms=None):
    """Detect known slangs/idioms in ``text`` and return them with their meanings.

    ``text`` is split on punctuation into fragments. Each fragment that is not
    rejected by ``is_non_slang_phrase`` is lemmatized, vectorized and
    classified; fragments whose top class probability is below
    ``confidence_threshold`` are dropped. Surviving fragments are matched
    against the dataset (substring match on lemmatized text) and against the
    manual dictionary (fuzzy partial ratio above 80).

    Args:
        text: Raw user input.
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        slang_df: DataFrame with 'Slang/Idiom' and 'Meaning' columns.
        confidence_threshold: Minimum top-class probability for a fragment to
            be considered.
        manual_slang_idioms: Optional mapping of phrase to a dict with 'type'
            and 'meaning' keys. When omitted, a module-level
            ``manual_slang_idioms`` is used if one exists; otherwise no manual
            entries are checked.

    Returns:
        list[dict]: One dict per distinct phrase found, with keys 'type',
        'slang_idiom' and 'meaning', in order of first detection.
    """
    if manual_slang_idioms is None:
        # Look the dictionary up defensively: referencing a module-level name
        # that does not exist would raise NameError in the middle of a request.
        manual_slang_idioms = globals().get('manual_slang_idioms') or {}

    lemmatizer = WordNetLemmatizer()

    def lemmatize_phrase(phrase):
        # Lemmatize word by word and re-join with single spaces.
        return ' '.join(lemmatizer.lemmatize(word) for word in phrase.split())

    # (original phrase, lemmatized phrase, meaning) triples. Built on first
    # use and then reused for every fragment, since it does not depend on the
    # fragment being examined.
    dataset_entries = None
    all_detected = {}

    for sentence in re.split(r'[.?!,;"“”]+', text):
        sentence = sentence.strip()
        if not sentence or is_non_slang_phrase(sentence):
            continue

        lemmatized_sentence = lemmatize_phrase(sentence)
        sentence_tfidf = vectorizer.transform([lemmatized_sentence])

        proba = model.predict_proba(sentence_tfidf)[0]
        if max(proba) < confidence_threshold:
            continue
        prediction = model.classes_[proba.argmax()]

        if dataset_entries is None:
            dataset_entries = []
            for slang_idiom, meaning in zip(slang_df['Slang/Idiom'], slang_df['Meaning']):
                # Blank spreadsheet cells load as NaN (a float): nothing to match.
                if not isinstance(slang_idiom, str):
                    continue
                lemmatized_slang = lemmatize_phrase(slang_idiom)
                # An empty pattern is a substring of every sentence; skip it.
                if lemmatized_slang:
                    dataset_entries.append((slang_idiom, lemmatized_slang, meaning))

        # Check against dataset slangs/idioms
        for slang_idiom, lemmatized_slang, meaning in dataset_entries:
            if lemmatized_slang in lemmatized_sentence and slang_idiom not in all_detected:
                all_detected[slang_idiom] = {
                    'type': prediction,
                    'slang_idiom': slang_idiom,
                    'meaning': meaning
                }

        # Check manual dictionary with fuzzy matching (>80% partial ratio)
        for manual_key, manual_value in manual_slang_idioms.items():
            if fuzz.partial_ratio(manual_key, sentence) > 80 and manual_key not in all_detected:
                all_detected[manual_key] = {
                    'type': manual_value['type'],
                    'slang_idiom': manual_key,
                    'meaning': manual_value['meaning']
                }

    return list(all_detected.values())

# Evaluate model
def evaluate_model(model, X_test_tfidf, y_test):
    """Score ``model`` on the held-out split.

    Returns:
        tuple: ``(accuracy, precision, recall, conf_matrix)``. Precision and
        recall are weighted averages across classes, with undefined values
        reported as 0 (``zero_division=0``).
    """
    predicted = model.predict(X_test_tfidf)
    weighted = {'average': 'weighted', 'zero_division': 0}
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted, **weighted),
        recall_score(y_test, predicted, **weighted),
        confusion_matrix(y_test, predicted),
    )

def main():
    """Render the Streamlit page: styling, sidebar, input form and results.

    Loads the dataset and trains (or fetches the cached) model on every run,
    then reacts to the two buttons: "Classify" lists the slangs/idioms found
    in the entered text, "Evaluate Model" shows metrics and a confusion-matrix
    heatmap for the held-out split.
    """
    st.set_page_config(page_title="Marathi Slang/Idiom Detector", layout="wide", page_icon="🔍")

    # Custom CSS Styling
    st.markdown("""
        <style>
            body {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                background-color: #f5f7fa;
            }
            .stButton>button {
                background-color: #00adb5;
                color: white;
                font-weight: bold;
                border-radius: 8px;
                height: 3em;
                width: 100%;
                font-size: 16px;
                margin-top: 10px;
            }
            .stTextArea>div>textarea {
                border-radius: 10px;
                border: 1px solid #ccc;
                font-size: 16px;
                padding: 12px;
            }
            .block-container {
                padding: 2rem 3rem;
            }
            h1, h2, h3 {
                color: #007bff;
            }
            footer {visibility: hidden;}
        </style>
    """, unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.markdown("<h2 style='color: #007bff;'>Marathi Slang & Idiom Detection App</h2>", unsafe_allow_html=True)
        st.write("🚀 Detects Marathi slangs and idioms and provides their meanings.")
        st.markdown("---")
        st.header("Instructions")
        st.write(
            "1. Enter a Marathi paragraph.\n"
            "2. Click **Classify** to detect slangs/idioms.\n"
            "3. Click **Evaluate Model** to see model performance."
        )

    # Main Title and Description
    st.markdown("<h1 style='text-align: center;'>Marathi Slang/Idiom Detection 🔍</h1>", unsafe_allow_html=True)
    st.write("This app identifies Marathi slangs and idioms in your text and provides their meanings if available.")
    st.markdown("---")

    # Load dataset and train model once (both calls are cached by Streamlit)
    filepath = '/content/drive/MyDrive/NLP Mini Project Dataset/Marathi Slang And Idiom.xlsx'
    try:
        slang_df = load_dataset(filepath)
    except FileNotFoundError:
        # Most likely Google Drive is not mounted or the file was moved; show a
        # readable message instead of a traceback and end this script run.
        st.error(f"⚠️ Dataset not found at `{filepath}`. Mount Google Drive and check the path.")
        st.stop()
    X, y = preprocess_data(slang_df)
    model, vectorizer, X_test_tfidf, y_test = train_model(X, y)

    # Input and buttons in columns for neat layout
    col1, col2 = st.columns([3, 1])

    with col1:
        st.markdown("<h3>Enter a Marathi Paragraph:</h3>", unsafe_allow_html=True)
        user_input = st.text_area("Paste your Marathi text here:")

    with col2:
        st.markdown("<h3>Actions</h3>", unsafe_allow_html=True)
        classify_clicked = st.button("🔍 Classify")
        evaluate_clicked = st.button("📈 Evaluate Model")

    # Handle classification
    if classify_clicked:
        if user_input and user_input.strip():
            detected_items = classify_slangs_and_idioms(user_input, model, vectorizer, slang_df)
            if detected_items:
                st.markdown("### Recognized Slangs/Idioms:")
                for item in detected_items:
                    st.success(f"✅ **{item['type'].capitalize()}** → *{item['slang_idiom']}* ➡️ {item['meaning']}")
            else:
                st.warning("⚠️ No slangs or idioms identified.")
        else:
            st.error("⚠️ Please enter valid text.")

    # Handle evaluation
    if evaluate_clicked:
        accuracy, precision, recall, conf_matrix = evaluate_model(model, X_test_tfidf, y_test)
        st.markdown("### Model Performance Metrics 📊")
        st.metric(label="Accuracy", value=f"{accuracy * 100:.2f}%")
        st.metric(label="Precision", value=f"{precision:.2f}")
        st.metric(label="Recall", value=f"{recall:.2f}")

        # Plot confusion matrix heatmap
        st.markdown("### Confusion Matrix")
        fig, ax = plt.subplots(figsize=(4, 3))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # pyplot keeps every figure it creates registered until it is closed;
        # without this, each click would leave another figure in memory.
        plt.close(fig)

# Entry point: build the page when this file is executed as a script
# (as `streamlit run app.py` does).
if __name__ == "__main__":
    main()

Overwriting app.py


Step 6:- Run Streamlit app in background and expose it publicly using localtunnel on port 8501

In [None]:
# `&` puts the Streamlit server in the background so localtunnel can run in the
# same shell. The tunnel's --port must match the port Streamlit listens on
# (8501 in the log output of this cell).
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.148.57.207:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://cute-knives-show.loca.lt
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
