In [None]:
import pandas as pd
import random

# Closed set of topic labels a call snippet can carry. A snippet may carry
# several at once (the generation loop samples between one and three per row).
labels = ["Objection", "Pricing Discussion", "Security", "Competition", "Positive"]

# Template phrases per label. Every key must match an entry in `labels`,
# because the generation loop indexes this dict by label name.
label_phrases = {
    "Objection": [
        "The customer raised an objection about",
        "They expressed concerns regarding",
        "There was pushback on",
        "They were hesitant about",
    ],
    "Pricing Discussion": [
        "They asked about the pricing model",
        "The discussion revolved around the cost",
        "They inquired about discounts",
        "The budget was a key concern",
    ],
    "Security": [
        "They asked about SOC2 certification",
        "Data handling was a major concern",
        "They wanted to know about compliance",
        "Security protocols were discussed",
    ],
    "Competition": [
        "They mentioned CompetitorX as an alternative",
        "CompetitorY was brought up during the call",
        "They compared the product to CompetitorZ",
        "CompetitorX was highlighted as a cheaper option",
    ],
    "Positive": [
        "They appreciated the analytics feature",
        "The AI engine was praised",
        "They were impressed with the data pipeline",
        "They loved the product's ease of use",
    ],
}

# Generate synthetic data.
# A dedicated, seeded generator makes the CSV identical on every
# Restart & Run All, so the split and the reported metrics are reproducible,
# and it leaves the global `random` state untouched.
rng = random.Random(42)

data = []
for i in range(1, 101):  # 100 rows, ids 1..100
    # Each snippet carries between one and three distinct labels.
    selected_labels = rng.sample(labels, k=rng.randint(1, 3))

    # One randomly chosen template phrase per selected label, space-separated.
    text_snippet = " ".join(
        rng.choice(label_phrases[label]) for label in selected_labels
    )

    # Labels are stored as a single ", "-separated string; later cells split
    # on the same separator.
    data.append({
        "id": i,
        "text_snippet": text_snippet,
        "labels": ", ".join(selected_labels),
    })

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("calls_dataset.csv", index=False)
print("Synthetic dataset generated and saved as 'calls_dataset.csv'.")

Synthetic dataset generated and saved as 'calls_dataset.csv'.


In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK corpora used below: the stop-word list and WordNet
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

# Read back the synthetic calls written by the generation cell
df = pd.read_csv("calls_dataset.csv")

# Shared resources consumed by preprocess_text
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalize a raw snippet into the token string fed to the vectorizer.

    Steps, in order: lowercase, drop every character that is not an ASCII
    letter or whitespace, remove English stop words, then lemmatize each
    remaining token with the module-level `lemmatizer`.

    Args:
        text: Raw snippet. Non-string values (for example the NaN that
            pandas produces for a blank CSV cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing survives.
    """
    # A float NaN has no .lower(); return an empty result instead of raising
    # AttributeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ""

    # Remove special characters and numbers from the lowercased text
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())

    # Filter stop words and lemmatize in a single pass over the tokens; this
    # yields the same tokens as filtering, re-joining and re-splitting.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]

    return " ".join(tokens)

# Derive the model-ready text column from the raw snippets
raw_snippets = df["text_snippet"]
df["cleaned_text"] = raw_snippets.map(preprocess_text)

# Persist the frame, now including `cleaned_text`, for the following cells
cleaned_path = "cleaned_calls_dataset.csv"
df.to_csv(cleaned_path, index=False)
print("Text preprocessing complete. Cleaned dataset saved as 'cleaned_calls_dataset.csv'.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Text preprocessing complete. Cleaned dataset saved as 'cleaned_calls_dataset.csv'.


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Write both partitions to disk so later cells can reload them
for split_frame, split_path in (
    (train_df, "train_dataset.csv"),
    (test_df, "test_dataset.csv"),
):
    split_frame.to_csv(split_path, index=False)
print("Dataset split into training and test sets.")

Dataset split into training and test sets.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Read the training partition back from disk
train_df = pd.read_csv("train_dataset.csv")

# Turn each ", "-separated label string into a multi-hot row
label_lists = train_df["labels"].str.split(", ")
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label_lists)

# TF-IDF features over the preprocessed text, capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_df["cleaned_text"])

# One logistic regression per label, wrapped for multi-output prediction
base_estimator = LogisticRegression()
model = MultiOutputClassifier(base_estimator)
model.fit(X_train, y_train)

print("Model training complete.")

Model training complete.


In [7]:
from sklearn.metrics import classification_report

# Read the held-out partition back from disk
test_df = pd.read_csv("test_dataset.csv")

# Reuse the fitted vectorizer and binarizer; neither is re-fitted on test data
X_test = vectorizer.transform(test_df["cleaned_text"])
y_test = mlb.transform(test_df["labels"].str.split(", "))

# Score the held-out rows
y_pred = model.predict(X_test)

# NOTE: every snippet is assembled from a small fixed pool of template phrases,
# so test rows reuse text seen during training; the scores below say little
# about how the model would do on real call transcripts.
report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(report)

                    precision    recall  f1-score   support

       Competition       1.00      1.00      1.00         6
         Objection       1.00      1.00      1.00         6
          Positive       1.00      1.00      1.00         8
Pricing Discussion       1.00      1.00      1.00         9
          Security       1.00      1.00      1.00        11

         micro avg       1.00      1.00      1.00        40
         macro avg       1.00      1.00      1.00        40
      weighted avg       1.00      1.00      1.00        40
       samples avg       1.00      1.00      1.00        40



In [8]:
import json

# Domain vocabulary, grouped by the category each term is reported under
competitor_names = ["CompetitorX", "CompetitorY", "CompetitorZ"]
product_features = ["analytics", "AI engine", "data pipeline"]
pricing_terms = ["discount", "renewal cost", "budget", "pricing model"]

domain_knowledge = {
    "competitors": competitor_names,
    "features": product_features,
    "pricing_keywords": pricing_terms,
}

# Write the vocabulary to disk so other cells can load it independently
with open("domain_knowledge.json", "w") as knowledge_file:
    json.dump(domain_knowledge, knowledge_file, indent=4)

print("Domain knowledge saved as 'domain_knowledge.json'.")

Domain knowledge saved as 'domain_knowledge.json'.


In [9]:
import json
import pandas as pd

# Read the domain vocabulary back from its JSON file
with open("domain_knowledge.json", "r") as knowledge_file:
    domain_knowledge = json.loads(knowledge_file.read())

# Start again from the raw synthetic calls (no `cleaned_text` column here)
df = pd.read_csv("calls_dataset.csv")

# Function to perform dictionary lookup
def dictionary_lookup(text, knowledge=None):
    """Find known domain terms in a snippet by case-insensitive substring match.

    Matching is plain substring containment, so a term such as "discount"
    also matches inside "discounts".

    Args:
        text: Snippet to scan. Non-string values (for example a NaN from a
            blank CSV cell) are treated as empty text and match nothing.
        knowledge: Optional mapping with the keys "competitors", "features"
            and "pricing_keywords", each holding a list of terms. Defaults to
            the module-level `domain_knowledge`.

    Returns:
        A dict with the keys "competitors", "features" and "pricing_keywords";
        each value lists the matched terms in their original casing and in
        the order they appear in `knowledge`.

    Raises:
        KeyError: If `knowledge` lacks one of the three categories.
    """
    if knowledge is None:
        knowledge = domain_knowledge

    # Lowercase the snippet once instead of once per candidate term.
    haystack = text.lower() if isinstance(text, str) else ""

    return {
        category: [
            term for term in knowledge[category] if term.lower() in haystack
        ]
        for category in ("competitors", "features", "pricing_keywords")
    }

# Run the keyword matcher over every raw snippet
raw_snippets = df["text_snippet"]
df["extracted_entities"] = raw_snippets.map(dictionary_lookup)

# Persist the frame with the new `extracted_entities` column
entities_path = "calls_dataset_with_entities.csv"
df.to_csv(entities_path, index=False)
print("Dictionary lookup complete. Results saved as 'calls_dataset_with_entities.csv'.")

Dictionary lookup complete. Results saved as 'calls_dataset_with_entities.csv'.


In [10]:
import spacy

# Load the spaCy model
# `nlp` is the pipeline object that perform_ner calls on each snippet; the
# `en_core_web_sm` package is expected to be available in the environment.
nlp = spacy.load("en_core_web_sm")

# Function to perform NER
def perform_ner(text):
    """Run the spaCy pipeline on `text` and group selected entities by kind.

    Only ORG, DATE and GPE entities are kept; all other entity labels are
    ignored.

    Args:
        text: Snippet passed to the module-level `nlp` pipeline.

    Returns:
        A dict with the keys "organizations", "dates" and "locations", each
        holding the matching entity texts in document order.
    """
    # spaCy entity label -> key of the result bucket it is collected under
    bucket_for_label = {
        "ORG": "organizations",
        "DATE": "dates",
        "GPE": "locations",
    }
    entities = {"organizations": [], "dates": [], "locations": []}

    for ent in nlp(text).ents:
        bucket = bucket_for_label.get(ent.label_)
        if bucket is not None:
            entities[bucket].append(ent.text)

    return entities

# Run spaCy NER over every raw snippet
raw_snippets = df["text_snippet"]
df["ner_entities"] = raw_snippets.map(perform_ner)

# Persist the frame with the new `ner_entities` column
ner_path = "calls_dataset_with_ner.csv"
df.to_csv(ner_path, index=False)
print("NER complete. Results saved as 'calls_dataset_with_ner.csv'.")

NER complete. Results saved as 'calls_dataset_with_ner.csv'.


In [11]:
# Merge the two extraction results into one nested dict per row
def _merge_entity_sources(row):
    """Bundle a row's keyword-lookup and NER results under fixed keys."""
    return {
        "dictionary_lookup": row["extracted_entities"],
        "ner_entities": row["ner_entities"],
    }


df["final_entities"] = df.apply(_merge_entity_sources, axis=1)

# Persist the frame with the combined `final_entities` column
final_path = "calls_dataset_with_final_entities.csv"
df.to_csv(final_path, index=False)
print("Entity extraction complete. Final results saved as 'calls_dataset_with_final_entities.csv'.")

Entity extraction complete. Final results saved as 'calls_dataset_with_final_entities.csv'.


In [13]:
pip install "fastapi[standard]"

Collecting fastapi[standard]
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi[standard])
  Downloading starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Collecting fastapi-cli>=0.0.5 (from fastapi-cli[standard]>=0.0.5; extra == "standard"->fastapi[standard])
  Downloading fastapi_cli-0.0.7-py3-none-any.whl.metadata (6.2 kB)
Collecting python-multipart>=0.0.18 (from fastapi[standard])
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting email-validator>=2.0.0 (from fastapi[standard])
  Downloading email_validator-2.2.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.12.0 (from uvicorn[standard]>=0.12.0; extra == "standard"->fastapi[standard])
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting dnspython>=2.0.0 (from email-validator>=2.0.0->fastapi[standard])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting rich-toolkit>=0.11.1 (from

In [14]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import json
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Domain vocabulary (competitors, features, pricing keywords) for keyword matching
with open("domain_knowledge.json", "r") as knowledge_file:
    domain_knowledge = json.load(knowledge_file)

# spaCy pipeline used for named-entity recognition
nlp = spacy.load("en_core_web_sm")

# No serialized model is loaded here: the classifier is re-fitted from the saved
# training split each time this code runs. In a real deployment, load a
# pre-trained model instead.
train_df = pd.read_csv("train_dataset.csv")

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df["labels"].str.split(", "))

vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_df["cleaned_text"])

model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train, y_train)

# FastAPI application object that the route decorator below attaches to
app = FastAPI()

# Define input model
# Request body accepted by the /predict endpoint: a single string field.
class Snippet(BaseModel):
    # Raw snippet text that the endpoint classifies and scans for entities.
    text: str

# Function for dictionary lookup
def dictionary_lookup(text, knowledge=None):
    """Find known domain terms in a snippet by case-insensitive substring match.

    Matching is plain substring containment, so a term such as "discount"
    also matches inside "discounts".

    Args:
        text: Snippet to scan. Non-string values are treated as empty text
            and match nothing.
        knowledge: Optional mapping with the keys "competitors", "features"
            and "pricing_keywords", each holding a list of terms. Defaults to
            the module-level `domain_knowledge`.

    Returns:
        A dict with the keys "competitors", "features" and "pricing_keywords";
        each value lists the matched terms in their original casing and in
        the order they appear in `knowledge`.

    Raises:
        KeyError: If `knowledge` lacks one of the three categories.
    """
    if knowledge is None:
        knowledge = domain_knowledge

    # Lowercase the snippet once instead of once per candidate term.
    haystack = text.lower() if isinstance(text, str) else ""

    return {
        category: [
            term for term in knowledge[category] if term.lower() in haystack
        ]
        for category in ("competitors", "features", "pricing_keywords")
    }

# Function for NER
def perform_ner(text):
    """Extract organizations, dates and locations from `text` with spaCy.

    Entities labelled ORG, DATE or GPE are collected; every other label is
    skipped.

    Args:
        text: Snippet passed to the module-level `nlp` pipeline.

    Returns:
        A dict with the keys "organizations", "dates" and "locations", each
        holding the matching entity texts in document order.
    """
    entities = {"organizations": [], "dates": [], "locations": []}
    # spaCy entity label -> key of the result list it belongs to
    target_key = {"ORG": "organizations", "DATE": "dates", "GPE": "locations"}

    for ent in nlp(text).ents:
        if ent.label_ in target_key:
            entities[target_key[ent.label_]].append(ent.text)

    return entities

# Function for summarization (dummy implementation)
def summarize_text(text):
    """Return a fixed placeholder summary; `text` is currently ignored.

    A real implementation would call a summarization model or algorithm here.
    """
    placeholder_summary = "This is a summary of the text snippet."
    return placeholder_summary

# API endpoint
@app.post("/predict")
async def predict(snippet: Snippet):
    """Classify a snippet, extract its entities, and return a summary.

    Args:
        snippet: Request body carrying the raw text to analyse.

    Returns:
        A dict with "predicted_labels" (list of label names),
        "extracted_entities" (keyword-lookup and NER results) and "summary".
    """
    text = snippet.text

    # Perform multi-label classification.
    # The classifier was fitted on the `cleaned_text` column, i.e. text that
    # had already gone through preprocess_text (character filtering, stop-word
    # removal, lemmatization). Apply the same transform at inference time;
    # vectorizing the raw text would produce tokens that were normalized away
    # during training and therefore fall outside the fitted vocabulary.
    # preprocess_text is defined in the preprocessing cell of this notebook and
    # must travel with this code if it is moved into a standalone module.
    X = vectorizer.transform([preprocess_text(text)])
    # inverse_transform yields one tuple of label names per input row; expose
    # the single row as a plain list.
    predicted_labels = list(mlb.inverse_transform(model.predict(X))[0])

    # Perform entity extraction on the raw text, so the original casing and
    # punctuation remain available to spaCy and to the keyword matcher.
    extracted_entities = dictionary_lookup(text)
    ner_entities = perform_ner(text)

    # Generate summary
    summary = summarize_text(text)

    return {
        "predicted_labels": predicted_labels,
        "extracted_entities": {
            "dictionary_lookup": extracted_entities,
            "ner_entities": ner_entities
        },
        "summary": summary
    }