In [1]:
import pandas as pd

df = pd.read_csv(r"D:\Jeeva\ds_course\insurance_final_project\data\cleaned_data\sentiment_cleaned_data.csv")
df.head()

Unnamed: 0,category,sentiment,clean_feedback
0,claim,negative,im extremely disappointed lengthy unsatisfacto...
1,claim,negative,inefficient communication lack transparency th...
2,claim,negative,recently terrible experience vehicle insurance...
3,claim,negative,extremely dissatisfied vehicle insurance claim...
4,claim,negative,poor communication throughout entire process u...


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# ---------------------------
# Columns
# ---------------------------
text_col = 'clean_feedback'
cat_cols = ['category']
target_col = 'sentiment'

X = df[[text_col] + cat_cols]
y = df[target_col]

# ---------------------------
# Train-test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Preprocessing
# ---------------------------
preprocessor = ColumnTransformer([
    ('text', TfidfVectorizer(max_features=5000, ngram_range=(1,2)), text_col),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# ---------------------------
# Logistic Regression Model
# ---------------------------
logreg_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=1000))
])

logreg_pipeline.fit(X_train, y_train)
y_pred_logreg = logreg_pipeline.predict(X_test)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_logreg))

# ---------------------------
# Random Forest Model
# ---------------------------
rf_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))

# ---------------------------
# Export models
# ---------------------------
joblib.dump(logreg_pipeline, "sentiment_logreg_model.pkl")
joblib.dump(rf_pipeline, "sentiment_rf_model.pkl")

print("Models exported: sentiment_logreg_model.pkl, sentiment_rf_model.pkl")


Logistic Regression Results:
              precision    recall  f1-score   support

    negative       0.99      0.98      0.99       516
     neutral       0.97      0.97      0.97       491
    positive       0.98      0.99      0.98       510

    accuracy                           0.98      1517
   macro avg       0.98      0.98      0.98      1517
weighted avg       0.98      0.98      0.98      1517

Random Forest Results:
              precision    recall  f1-score   support

    negative       0.98      0.97      0.98       516
     neutral       0.96      0.95      0.96       491
    positive       0.97      0.98      0.97       510

    accuracy                           0.97      1517
   macro avg       0.97      0.97      0.97      1517
weighted avg       0.97      0.97      0.97      1517

Models exported: sentiment_logreg_model.pkl, sentiment_rf_model.pkl
