In [None]:
!pip install fastapi uvicorn nest-asyncio pyngrok spacy transformers
!python -m spacy download en_core_web_sm
!pip install SpeechRecognition
!pip install pydub
!pip install python-dotenv
!pip install python-multipart



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nest_asyncio
import random
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, UploadFile
from pydantic import BaseModel
import spacy
import logging
import speech_recognition as sr
from typing import List
import re
from difflib import SequenceMatcher
from pydub import AudioSegment
import tempfile

# Load spaCy model for NLP tasks.
# The pipeline is loaded once here; analyze_text_spacy() calls this shared object
# for every text it processes.
nlp = spacy.load("en_core_web_sm")

# Define patterns for low-confidence language.
# Each entry is a (phrase, advice) pair. analyze_text_spacy() looks for every phrase
# as a case-insensitive literal substring of each sentence and deducts one point per
# matching entry.
# NOTE(review): entries overlap — a sentence containing "totally unsure" also matches
# "unsure", so it is penalised twice. Confirm this is intended.
LOW_CONFIDENCE_PATTERNS = [
    ("I just think", "Consider removing 'just' to make it more assertive."),
    ("I'm sorry", "Avoid excessive apologizing unless necessary."),
    ("maybe", "Try using more definitive words."),
    ("I could be wrong, but", "Consider removing the disclaimer to sound more confident."),
    ("I think", "Consider removing 'I think' to sound more confident."),
    ("perhaps", "Replace 'perhaps' with a more definitive word."),
    ("I can't do it", "Consider using positive language."),
    ("I'm not sure", "Avoid showing uncertainty. Use a confident statement instead."),
    ("unsure", "Replace with a more assertive phrase."),
    ("totally unsure", "Replace with a confident alternative."),
    ("we should consider", "Use 'we must' or 'we will' for assertiveness.")
]

# Passive voice detection based on dependency labels
def detect_passive_voice(doc):
    """Report every passive construction found in a parsed document.

    A token counts as a passive marker when its dependency label is
    ``auxpass`` and its head is tagged ``VERB``. The message names the head
    verb and, if the verb has a child labelled ``agent``, the text of the
    first such child.

    Args:
        doc: An iterable of tokens exposing ``dep_``, ``head`` and, on the
            head, ``pos_``, ``text`` and ``children``.

    Returns:
        A list of message strings, one per passive marker, in document order.
    """
    findings = []
    for tok in doc:
        # Guard clause: skip anything that is not a passive auxiliary of a verb.
        if tok.dep_ != "auxpass" or tok.head.pos_ != "VERB":
            continue
        verb = tok.head
        agents = [kid for kid in verb.children if kid.dep_ == "agent"]
        message = f"Passive voice detected: '{verb.text}'"
        if agents:
            message += f" with agent '{agents[0].text}'"
        findings.append(message)
    return findings

# Rule-based confidence analysis built on the spaCy parse
def analyze_text_spacy(text):
    """Score a text for low-confidence phrasing and passive voice.

    The text is parsed with the module-level ``nlp`` pipeline. Every sentence
    starts at 5 points and loses one point for each entry of
    ``LOW_CONFIDENCE_PATTERNS`` whose phrase occurs in it (case-insensitive
    literal match). The overall score is 5 minus the total number of matches
    across all sentences, floored at 1; per-sentence scores are not floored.

    Args:
        text: The text to analyse.

    Returns:
        A dict with keys ``overall_confidence_score``, ``section_scores``
        (one dict per sentence with ``text``, ``confidence_score`` and
        ``suggestions``), ``passive_voice_suggestions``, ``highlighted_text``
        (string form of each entity in ``doc.ents``) and ``suggestions``
        (all matched ``(phrase, advice)`` pairs).
    """
    doc = nlp(text)
    passive_voice_suggestions = detect_passive_voice(doc)
    starting_score = 5  # Full confidence to start with

    per_sentence = []
    all_hits = []
    for sentence in doc.sents:
        hits = [
            (phrase, advice)
            for phrase, advice in LOW_CONFIDENCE_PATTERNS
            if re.search(re.escape(phrase), sentence.text, re.IGNORECASE)
        ]
        per_sentence.append({
            "text": sentence.text,
            # One point off per matched pattern.
            "confidence_score": starting_score - len(hits),
            "suggestions": hits
        })
        all_hits.extend(hits)

    entity_texts = [str(ent) for ent in doc.ents]
    return {
        "overall_confidence_score": max(1, starting_score - len(all_hits)),
        "section_scores": per_sentence,
        "passive_voice_suggestions": passive_voice_suggestions,
        "highlighted_text": entity_texts,
        "suggestions": all_hits
    }

# Generate synthetic data for ML
def generate_synthetic_data(seed=None):
    """Build a labelled toy corpus by appending random suffixes to base phrases.

    Each base phrase is emitted 50 times, each time followed by a space and a
    suffix drawn at random from a fixed list of four.

    Args:
        seed: Optional seed for a private random generator. When given, the
            output is fully reproducible and the global ``random`` state is
            left untouched. When ``None`` (the default) the global ``random``
            module is used, exactly as before.

    Returns:
        A list of ``(text, label)`` tuples where ``label`` is either
        ``"Underconfident"`` or ``"Neutral"``.
    """
    base_phrases = [
        ("I just think", "Underconfident"),
        ("I'm sorry", "Underconfident"),
        ("maybe", "Underconfident"),
        ("I could be wrong, but", "Underconfident"),
        ("I think", "Underconfident"),
        ("perhaps", "Underconfident"),
        ("I'm not sure", "Underconfident"),
        ("unsure", "Underconfident"),
        ("totally unsure", "Underconfident"),
        ("I can't do it", "Underconfident"),
        ("This is a good idea.", "Neutral"),
        ("I am the best", "Neutral"),
        ("I can do it", "Neutral"),
        ("Let's move forward with this strategy.", "Neutral"),
        ("It is possible", "Neutral"),
        ("Responses may delay", "Neutral"),
        ("Data steps for ML analysis", "Neutral")
    ]
    suffixes = ['!', 'maybe?', 'surely.', 'variation']

    # The `random` module and a `random.Random` instance both expose `.choice`,
    # so the unseeded path keeps drawing from the global generator.
    rng = random if seed is None else random.Random(seed)

    return [
        (f"{phrase} {rng.choice(suffixes)}", label)
        for phrase, label in base_phrases
        for _ in range(50)
    ]

# Initial dataset: synthetic phrases loaded into a DataFrame, with the string labels
# replaced by their integer codes from label_map (so df["Label"] holds 0/1 afterwards).
# NOTE(review): generate_synthetic_data() draws from the unseeded `random` module, so the
# rows (and the metrics computed from them) differ between runs.
data = generate_synthetic_data()
df = pd.DataFrame(data, columns=["Text", "Label"])
label_map = {"Underconfident": 0, "Neutral": 1}
df["Label"] = df["Label"].map(label_map)

# Feature extraction and training ML model
# NOTE(review): the vectorizer is fitted on all rows before the train/test split below,
# so the held-out rows also shape the TF-IDF vocabulary and weights — confirm this is acceptable.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Text"])
y = df["Label"]
# 80/20 split with a fixed random_state; X_test / y_test are used later for the classification report.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Classify a single text with the trained TF-IDF + logistic regression model
def classify_text_ml(input_text):
    """Predict the confidence label of one text.

    Uses the module-level ``vectorizer`` and ``model``; label names are
    recovered by inverting ``label_map``.

    Args:
        input_text: The text to classify.

    Returns:
        A dict with ``label`` (name of the highest-probability class) and
        ``confidence_scores`` (label name -> predicted probability).
    """
    probabilities = model.predict_proba(vectorizer.transform([input_text]))

    # Invert label_map so integer positions can be turned back into names.
    index_to_label = {index: name for name, index in label_map.items()}

    per_label = {}
    for position, probability in enumerate(probabilities[0]):
        per_label[index_to_label[position]] = probability

    top_label = index_to_label[np.argmax(probabilities)]
    return {
        "label": top_label,
        "confidence_scores": per_label
    }

# Persist one user submission for later retraining
def save_user_submission(text, label):
    """Append a ``text,label`` row to ``user_submissions.csv``.

    The row is written without a header or index. Any failure while writing
    is logged and swallowed, so the caller is never interrupted.

    Args:
        text: The submitted text.
        label: The label to store alongside it.
    """
    row = pd.DataFrame({"Text": [text], "Label": [label]}, columns=["Text", "Label"])
    try:
        row.to_csv(
            "user_submissions.csv",
            mode="a",       # append to the existing file
            header=False,
            index=False,
        )
    except Exception as e:
        logging.error(f"Error saving user submission: {str(e)}")

# Retrain the model
def retrain_model():
    """
    Retrains the ML model on the original dataset plus the user submissions
    stored in `user_submissions.csv`.

    The string labels in the CSV are converted to integer codes with
    `label_map` before being combined with `df`, whose labels are already
    integer-encoded. A fresh vectorizer and model are fitted and only
    published to the module-level `vectorizer` / `model` once both have been
    fitted, so a failed retrain leaves the serving pair untouched.

    Raises:
        ValueError: If the submissions file is missing, contains a label that
            is not in `label_map`, or any other step of retraining fails.
    """
    global model, vectorizer
    try:
        # Load new user submissions. Everything is read as text so that inputs
        # such as "NA" or "123" are not converted to NaN / numbers.
        new_data = pd.read_csv(
            "user_submissions.csv",
            names=["Text", "Label"],
            dtype=str,
            keep_default_na=False,
        )

        # Only the CSV rows carry string labels; mapping the combined frame
        # would turn the already-encoded labels of `df` into NaN.
        new_data["Label"] = new_data["Label"].map(label_map)
        if new_data["Label"].isnull().any():
            raise ValueError("Invalid labels found in user submissions.")

        # Combine the original and new data
        updated_data = pd.concat([df, new_data], ignore_index=True)

        # Fit a fresh vectorizer rather than refitting the live one in place
        new_vectorizer = TfidfVectorizer()
        X = new_vectorizer.fit_transform(updated_data["Text"])
        y = updated_data["Label"].astype(int)

        new_model = LogisticRegression()
        new_model.fit(X, y)

        # Publish the matching vectorizer/model pair together
        vectorizer, model = new_vectorizer, new_model

        logging.info("Model retrained successfully.")
    except FileNotFoundError as e:
        logging.error("User submissions file not found. Retraining skipped.")
        raise ValueError("No user submissions available for retraining.") from e
    except Exception as e:
        logging.error(f"Error retraining model: {str(e)}")
        raise ValueError(f"Retraining failed: {str(e)}") from e


# Convert uploaded audio to a temporary WAV file
def convert_audio_to_wav(file):
    """Decode an audio file and re-export it as WAV in a new temp directory.

    Args:
        file: A path or file-like object accepted by ``AudioSegment.from_file``.

    Returns:
        The path of the exported ``converted.wav``. It lives in a directory
        created with ``tempfile.mkdtemp()``; the caller is responsible for
        deleting it when done.

    Raises:
        ValueError: If the audio cannot be read or exported. The temporary
            directory is removed before the error is raised.
    """
    import os
    import shutil

    temp_dir = tempfile.mkdtemp()
    temp_wav_path = os.path.join(temp_dir, "converted.wav")
    try:
        audio = AudioSegment.from_file(file)
        audio.export(temp_wav_path, format="wav")
        return temp_wav_path
    except Exception as e:
        # Nothing usable was produced, so do not leave the directory behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
        logging.error(f"Error converting audio: {str(e)}")
        raise ValueError("Error converting audio to WAV format.") from e

def transcribe_speech(file):
    """Transcribe an audio file to text with Google's speech recognition.

    The audio is first converted to WAV via ``convert_audio_to_wav``; the
    converted file and its (then empty) temporary directory are removed once
    transcription has finished, whether it succeeded or not.

    Args:
        file: A path or file-like object holding the audio.

    Returns:
        The recognised text on success. Failures are reported as strings
        rather than raised: ``"Unable to recognize speech. Please try again."``,
        ``"Speech Recognition API unavailable: ..."`` or ``"Error: ..."``.
    """
    import contextlib
    import os

    recognizer = sr.Recognizer()
    wav_path = None
    try:
        wav_path = convert_audio_to_wav(file)
        with sr.AudioFile(wav_path) as source:
            audio = recognizer.record(source)
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return "Unable to recognize speech. Please try again."
    except sr.RequestError as e:
        return f"Speech Recognition API unavailable: {e}"
    except Exception as e:
        logging.error(f"Error transcribing audio: {str(e)}")
        return f"Error: {str(e)}"
    finally:
        if wav_path is not None:
            # Remove the file, then its directory; rmdir only succeeds when
            # the directory is empty, so unrelated files are never deleted.
            with contextlib.suppress(OSError):
                os.remove(wav_path)
            with contextlib.suppress(OSError):
                os.rmdir(os.path.dirname(wav_path))

# FastAPI app definition
app = FastAPI()
# In-memory history of submitted texts, keyed by user_id (filled by the /submit/ endpoint).
# It lives only in this process and is not written to disk.
user_revisions = {}

# Request body accepted by the /classify/ endpoint.
class UserInput(BaseModel):
    # The text to analyse and classify.
    text: str

@app.post("/classify/")
def classify_text_api(input: UserInput):
    # Run both analysers on the submitted text: the ML classifier first,
    # then the rule-based spaCy analysis, and echo the text back with both results.
    submitted = input.text
    classification = classify_text_ml(submitted)
    analysis = analyze_text_spacy(submitted)

    response = {"text": submitted}
    response["classification"] = classification
    response["spacy_analysis"] = analysis
    return response

@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile):
    # Transcribe an uploaded audio file, then analyse and classify the transcript.
    # Returns {"text", "classification", "spacy_analysis"} on success,
    # or {"error": ...} when transcription or analysis fails.

    # transcribe_speech() reports failures as ordinary strings. These are the
    # prefixes of every failure message it can return; matching on the prefix
    # avoids rejecting a genuine transcript that merely contains "Error".
    failure_prefixes = (
        "Error: ",
        "Unable to recognize speech",
        "Speech Recognition API unavailable",
    )
    try:
        text = transcribe_speech(file.file)
        if text.startswith(failure_prefixes):
            return {"error": text}
        spacy_result = analyze_text_spacy(text)
        ml_result = classify_text_ml(text)
        return {
            "text": text,
            "classification": ml_result,
            "spacy_analysis": spacy_result
        }
    except Exception as e:
        logging.error(f"Error in /transcribe/ endpoint: {str(e)}")
        return {"error": f"Failed to process the audio file: {str(e)}"}

@app.post("/submit/")
def submit_text(user_id: str, text: str):
    # Record a new revision for the user: compare it with their previous
    # submission, label it with the ML model, persist it, and return the history.
    history = user_revisions.setdefault(user_id, [])

    # An empty string stands in for "no previous submission".
    last_text = history[-1] if history else ""
    ratio = SequenceMatcher(None, last_text, text).ratio()

    predicted = classify_text_ml(text)
    save_user_submission(text, predicted["label"])
    history.append(text)

    return {
        "message": "Text submitted successfully",
        "history": history,
        "similarity_with_previous": f"{ratio:.2f}"
    }

@app.post("/retrain/")
def retrain():
    """
    Trigger a retrain of the classifier from the saved user submissions.

    Returns:
        A JSON object with a ``message`` key on success, or an ``error`` key
        carrying the text of the ValueError raised by retrain_model().
    """
    try:
        retrain_model()
    except ValueError as failure:
        return {"error": str(failure)}
    else:
        return {"message": "Model retrained successfully with new user data."}





In [None]:
# Read the ngrok authtoken from the environment instead of embedding it in the notebook.
# A token that has been saved in a shared notebook should be treated as compromised:
# revoke it in the ngrok dashboard and set NGROK_AUTHTOKEN to the replacement.
import os

ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Evaluate model performance on the held-out split (X_test / y_test)
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Allow nested asyncio loops (patches asyncio so uvicorn.run can be called from this notebook cell)
nest_asyncio.apply()

# Set up ngrok: open a public tunnel to local port 8000, the port the server listens on below
# NOTE(review): none of the endpoints check credentials, so anything reachable through this
# tunnel (including /retrain/) is open to whoever has the URL — confirm this is acceptable.
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url}")

# Start the server; this call blocks the cell until the server is stopped
uvicorn.run(app, host="0.0.0.0", port=8000)


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97        95
           1       0.99      0.93      0.96        75

    accuracy                           0.96       170
   macro avg       0.97      0.96      0.96       170
weighted avg       0.97      0.96      0.96       170

Public URL: NgrokTunnel: "https://952e-34-48-147-134.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [16315]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     2003:fe:470b:153a:40a5:1220:45a3:200d:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2003:fe:470b:153a:40a5:1220:45a3:200d:0 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     2003:fe:470b:153a:40a5:1220:45a3:200d:0 - "POST /classify/ HTTP/1.1" 200 OK
INFO:     2003:fe:470b:153a:da1:4c7d:19e2:979d:0 - "POST /classify/ HTTP/1.1" 200 OK
INFO:     2003:fe:470b:153a:da1:4c7d:19e2:979d:0 - "POST /transcribe/ HTTP/1.1" 200 OK
