In [None]:
!pip install -U transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import string
import nltk
import re
import spacy
import torch

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline, set_seed

# Download necessary NLTK resources
nltk.download('stopwords')

# Load NLP tools
stop_words = set(stopwords.words('english'))
# Only token lemmas are read from the spaCy Doc in clean_text, so the entity
# recognizer is disabled to avoid running it on every complaint.
nlp = spacy.load("en_core_web_md", disable=["ner"])  # Medium-sized model with word vectors
stemmer = PorterStemmer()

# Text cleaning function
def clean_text(text):
    """Normalize a complaint into a space-joined string of lemmas.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    split on whitespace, drop NLTK English stopwords, Porter-stem each
    remaining word, then pass the stemmed text through the spaCy pipeline
    and keep every token's ``lemma_``.

    Args:
        text: Raw complaint text. A value that is not a ``str`` (for example
            the ``None``/``NaN`` pandas yields for a missing cell) is treated
            as empty instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned text, or ``""`` when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in text.split() if word not in stop_words]
    if not words:
        # Nothing left to stem or lemmatize, so skip the spaCy call entirely.
        return ""

    stemmed_words = [stemmer.stem(word) for word in words]
    doc = nlp(" ".join(stemmed_words))
    return " ".join(token.lemma_ for token in doc)

# Read the line-delimited JSON complaints and add a cleaned-text column
df = pd.read_json(
    "synthetic_customer_complaints.json",
    lines=True,
)
df['Cleaned_Complaint'] = df['Complaint'].apply(clean_text)

# Labels, then dense TF-IDF features capped at 5000 terms
y = df['Category']
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Cleaned_Complaint']).toarray()

# Fit the Random Forest on every row (fit() returns the estimator itself)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X, y)

# Predict category function
def predict_category(new_text):
    """Return the category the Random Forest predicts for one complaint.

    The text is cleaned with ``clean_text``, converted to TF-IDF features
    with the already-fitted ``vectorizer``, and classified by ``rf_model``.
    """
    features = vectorizer.transform([clean_text(new_text)])
    # predict() yields one label per input row; a single row was passed in.
    return rf_model.predict(features)[0]

# Build the text2text pipeline used to draft resolutions
print("Loading Transformer model (FLAN-T5)...")
# Device index 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device,
)
set_seed(42)

# Generate solution function (improved prompt)
def generate_solution(complaint, category):
    """Ask the text2text ``generator`` pipeline for a resolution.

    Args:
        complaint: The customer's complaint text, inserted into the prompt.
        category: The issue category, inserted into the prompt.

    Returns:
        The ``generated_text`` of the first returned sequence. If anything
        raises while generating, a string starting with
        ``"Error generating solution"`` that also carries the exception
        message, so a failure can be diagnosed rather than silently hidden.
    """
    prompt = f"Customer complaint: {complaint}\nIssue category: {category}\nWhat should be the resolution?"
    try:
        result = generator(prompt, max_length=100, num_return_sequences=1)
        return result[0]['generated_text']
    except Exception as e:
        # Best-effort: keep the notebook running, but report what went wrong.
        return f"Error generating solution: {e}"

# === Step 1: Generate sample predictions from dataset ===
# The model in this cell was fitted on every row of df, so these ten rows were
# seen during training: this is a smoke test, not an evaluation.
sample_df = df.head(10).copy()
sample_df['Predicted_Category'] = sample_df['Complaint'].apply(predict_category)
sample_df['Predicted_Solution'] = sample_df.apply(
    lambda row: generate_solution(row['Complaint'], row['Predicted_Category']), axis=1
)

# Show sample results, including the generated resolutions
# (they were previously computed but never displayed)
print("\n=== Sample Dataset Output ===")
print(sample_df[['Complaint', 'Predicted_Category', 'Predicted_Solution']])

# === Step 2: Allow user input for dynamic prediction ===
# input() blocks until a complaint is typed, so this cell needs an interactive session.
print("\n=== User Input ===")
user_input = input("Enter a new customer complaint: ")

user_category = predict_category(user_input)
user_solution = generate_solution(user_input, user_category)

print("\nPredicted Category:", user_category)
print("Suggested Resolution:", user_solution)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Transformer model (FLAN-T5)...


Device set to use cuda:0



=== Sample Dataset Output ===
                                           Complaint    Predicted_Category
0  Flight cancellation. Report reduce red kid soc...   Flight cancellation
1  Appointment delay. Western much whose compare ...     Appointment delay
2  Seat not assigned. Big try site generation voi...     Seat not assigned
3  Account locked. Treatment together past box pu...        Account locked
4  Flight cancellation. Feel threat skill profess...   Flight cancellation
5  Wrong billing. None somebody region then time ...         Wrong billing
6  Claim rejection. Loss pass name whatever respo...       Claim rejection
7  Damaged product. Well start Mr then place forg...       Damaged product
8  Claim rejection. Little commercial life perhap...       Claim rejection
9  Refund not processed. Scientist like door conf...  Refund not processed

=== User Input ===
Enter a new customer complaint: I have cancelled my bus tickets  due to time

Predicted Category: Flight cancellation
Sugges

In [None]:
import pandas as pd
import string
import nltk
import re
import spacy
import torch

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import pipeline, set_seed

# Download NLTK stopwords
nltk.download('stopwords')

# Load NLP tools
stop_words = set(stopwords.words('english'))
# clean_text reads only token lemmas from the spaCy Doc, so the entity
# recognizer is switched off rather than run on every complaint.
nlp = spacy.load("en_core_web_md", disable=["ner"])  # Medium model with word vectors
stemmer = PorterStemmer()

# Text cleaning function
def clean_text(text):
    """Turn raw complaint text into a space-joined string of lemmas.

    Pipeline: lowercase -> drop digits -> drop ASCII punctuation -> split on
    whitespace -> remove NLTK English stopwords -> Porter stemming -> spaCy
    lemmatization of the stemmed text.

    Args:
        text: Raw complaint text. Non-string values (such as ``None`` or the
            ``NaN`` pandas uses for a missing cell) are treated as empty
            rather than raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned text, or ``""`` if no token survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    if not words:
        # Nothing left to process; avoid a spaCy call on an empty string.
        return ""

    stemmed_words = [stemmer.stem(word) for word in words]  # Stemming
    doc = nlp(" ".join(stemmed_words))
    return " ".join(token.lemma_ for token in doc)  # Lemmatization

# Load dataset
df = pd.read_json("synthetic_customer_complaints.json", lines=True)

# Clean complaints
df['Cleaned_Complaint'] = df['Complaint'].apply(clean_text)
y = df['Category']

# Split the cleaned text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from training rows only (fitting on all rows leaks
# information from the test set into the features).
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Complaint'], y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization (fit on the training split, then applied to the test split)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()
# Full feature matrix, kept under its original name in case later cells reference it.
X = vectorizer.transform(df['Cleaned_Complaint']).toarray()

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
# NOTE(review): the sample output shows each complaint starting with its own
# category name, which would make the task trivial and may explain
# near-perfect scores - confirm against the dataset.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n=== Model Accuracy on Test Set: {accuracy * 100:.2f}% ===")
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# Category prediction
def predict_category(new_text):
    """Classify a single complaint and return its predicted category label.

    Cleans the text via ``clean_text``, vectorizes it with the fitted
    ``vectorizer``, and returns ``rf_model``'s prediction for that one row.
    """
    single_row = [clean_text(new_text)]
    labels = rf_model.predict(vectorizer.transform(single_row))
    return labels[0]

# Build the FLAN-T5-Large text2text pipeline used to draft resolutions
print("\nLoading Transformer model (FLAN-T5 Large)... This may take time.")
# Device index 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=device,
)
set_seed(42)

# Resolution generator
def generate_solution(complaint, category):
    """Draft a customer-service resolution with the ``generator`` pipeline.

    Builds a four-line prompt from the complaint and its category, samples
    one sequence (temperature 0.7, max_length 150), and returns its
    ``generated_text``. Any exception raised while generating is converted
    into an ``"Error generating solution: ..."`` string instead of
    propagating.
    """
    prompt = "".join((
        f"You are a customer service assistant.\n",
        f"Customer complaint: \"{complaint}\"\n",
        f"Issue category: {category}\n",
        f"Provide a polite and helpful resolution:",
    ))
    sampling_options = dict(
        max_length=150,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
    )
    try:
        outputs = generator(prompt, **sampling_options)
        first_sequence = outputs[0]
        return first_sequence['generated_text']
    except Exception as err:
        return f"Error generating solution: {err}"

# Sample prediction: classify the first ten complaints and draft a resolution for each
sample_df = df.head(10).copy()
sample_df['Predicted_Category'] = sample_df['Complaint'].apply(predict_category)
sample_df['Predicted_Solution'] = sample_df.apply(
    lambda row: generate_solution(row['Complaint'], row['Predicted_Category']), axis=1
)

# Include the generated resolutions in the printout
# (they were previously computed but never displayed)
print("\n=== Sample Dataset Output ===")
print(sample_df[['Complaint', 'Predicted_Category', 'Predicted_Solution']])

# User input
# input() blocks until a complaint is typed, so this cell needs an interactive session.
print("\n=== User Input ===")
user_input = input("Enter a new customer complaint: ")

user_category = predict_category(user_input)
user_solution = generate_solution(user_input, user_category)

print("\nPredicted Category:", user_category)
print("Suggested Resolution:", user_solution)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== Model Accuracy on Test Set: 99.70% ===

=== Classification Report ===
                          precision    recall  f1-score   support

          Account locked       1.00      1.00      1.00        63
               App crash       1.00      1.00      1.00        55
       Appointment delay       1.00      1.00      1.00        54
           Booking issue       1.00      1.00      1.00        48
         Claim rejection       1.00      1.00      1.00        61
  Coverage not explained       1.00      1.00      1.00        71
         Damaged product       1.00      1.00      1.00        63
               Data loss       1.00      1.00      1.00        56
     Flight cancellation       1.00      1.00      1.00        66
             Hidden fees       1.00      1.00      1.00        65
               High bill       1.00      0.99      0.99        73
        Insurance denial       1.00      1.00      1.00        77
           Late delivery       1.00      1.00      1.00        64


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0



=== Sample Dataset Output ===
                                           Complaint    Predicted_Category
0  Flight cancellation. Report reduce red kid soc...   Flight cancellation
1  Appointment delay. Western much whose compare ...     Appointment delay
2  Seat not assigned. Big try site generation voi...     Seat not assigned
3  Account locked. Treatment together past box pu...        Account locked
4  Flight cancellation. Feel threat skill profess...   Flight cancellation
5  Wrong billing. None somebody region then time ...         Wrong billing
6  Claim rejection. Loss pass name whatever respo...       Claim rejection
7  Damaged product. Well start Mr then place forg...       Damaged product
8  Claim rejection. Little commercial life perhap...       Claim rejection
9  Refund not processed. Scientist like door conf...  Refund not processed

=== User Input ===
Enter a new customer complaint: I didn't receive my loan disbursement on time.

Predicted Category: Loan processing delay
Su