In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Load the dataset
file_path = "sustainable_fashion_final_vertex_eval.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as exc:
    # Re-raise the same exception type with an actionable message instead of
    # the bare errno text.
    raise FileNotFoundError(
        f"Dataset '{file_path}' was not found. Place the CSV in the notebook's "
        "working directory or update `file_path` before running this cell."
    ) from exc

# Fail early, with a clear message, if the columns read further down are absent.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Basic text preprocessing
def clean_text(text):
    """Normalise raw text for vectorisation.

    Lower-cases the input and strips every ASCII punctuation character
    listed in ``string.punctuation``.

    Args:
        text: The value to clean. Anything that is not a ``str``
            (e.g. ``None`` or NaN from a missing cell) is treated as empty.

    Returns:
        The lower-cased, punctuation-free string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Use a translation table rather than a regex character class: interpolating
    # string.punctuation unescaped into "[...]" makes the regex parser read
    # "\]" as an escaped bracket, so backslashes were never removed.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return text.lower().translate(punctuation_table)

df["cleaned_text"] = df["text"].apply(clean_text)

# Drop rows with missing labels (copy so the column assignment below acts on
# an independent frame)
df = df.dropna(subset=["label"]).copy()

# Convert labels to binary (0 = Not Sustainable, 1 = Sustainable)
label_map = {"not_sustainable": 0, "sustainable": 1}
# Series.map silently turns any value outside the mapping into NaN; surface
# such values here instead of letting them reach the split/fit steps.
unknown_labels = set(df["label"].unique()) - set(label_map)
if unknown_labels:
    raise ValueError(
        f"Unexpected label value(s): {sorted(unknown_labels, key=str)}; "
        f"expected one of {sorted(label_map)}"
    )
df["label"] = df["label"].map(label_map)

# Split the dataset (80/20, stratified on the label so both splits keep the
# same class balance)
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# Turn the cleaned text into TF-IDF feature matrices.
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# Train Logistic Regression model
log_model = LogisticRegression()
log_model.fit(X_train_tfidf, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Train Support Vector Machine (SVM) model
# probability=True enables predict_proba (needed for the ROC curves); the
# calibration behind it shuffles data, so seed it like the other models to
# keep the reported probabilities reproducible across runs.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_tfidf, y_train)

# Hard-label predictions on the held-out split, one vector per fitted model
y_pred_log, y_pred_rf, y_pred_svm, y_pred_gb = (
    fitted.predict(X_test_tfidf)
    for fitted in (log_model, rf_model, svm_model, gb_model)
)

# Overall accuracy of each model
accuracy_log, accuracy_rf, accuracy_svm, accuracy_gb = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_log, y_pred_rf, y_pred_svm, y_pred_gb)
)

# Print all accuracies first, then the per-class reports, in model order
for heading, accuracy in (
    ("Logistic Regression Accuracy:", accuracy_log),
    ("Random Forest Accuracy:", accuracy_rf),
    ("SVM Accuracy:", accuracy_svm),
    ("Gradient Boosting Accuracy:", accuracy_gb),
):
    print(heading, accuracy)

for heading, predicted in (
    ("Classification Report (Logistic Regression):\n", y_pred_log),
    ("Classification Report (Random Forest):\n", y_pred_rf),
    ("Classification Report (SVM):\n", y_pred_svm),
    ("Classification Report (Gradient Boosting):\n", y_pred_gb),
):
    print(heading, classification_report(y_test, predicted))

# Confusion matrices: one heatmap per model on a 2x2 grid
models = [
    (log_model, y_pred_log, "Logistic Regression"),
    (rf_model, y_pred_rf, "Random Forest"),
    (svm_model, y_pred_svm, "SVM"),
    (gb_model, y_pred_gb, "Gradient Boosting"),
]
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, (model, y_pred, title) in zip(axes.flatten(), models):
    # Rows are the actual classes, columns the predicted ones
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='coolwarm',
        xticklabels=["Not Sustainable", "Sustainable"],
        yticklabels=["Not Sustainable", "Sustainable"],
        ax=ax,
    )
    ax.set_title(f"{title} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

# ROC curves for all four models on a single set of axes
roc_candidates = [
    (log_model, "Logistic Regression"),
    (rf_model, "Random Forest"),
    (svm_model, "SVM"),
    (gb_model, "Gradient Boosting"),
]

plt.figure(figsize=(10, 7))
for model, name in roc_candidates:
    # Column 1 of predict_proba is the probability of the positive class
    y_probs = model.predict_proba(X_test_tfidf)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    plt.plot(fpr, tpr, label=name)

# Diagonal reference line (the chance-level classifier)
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Model Comparison")
plt.legend()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'sustainable_fashion_final_vertex_eval.csv'

In [6]:
import pickle

# Persist the fitted gradient-boosting model and the TF-IDF vectorizer
# (model first, then vectorizer), each to its own pickle file
for artifact_path, artifact in (
    ("gradient_boosting_model.pkl", gb_model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)


In [7]:
# Restore the persisted classifier and vectorizer from their pickle files.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or otherwise trust.
with open("gradient_boosting_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)


In [8]:
def predict_sustainability(text):
    """Classify a piece of text with the reloaded model.

    The text is passed through ``clean_text``, vectorised with
    ``loaded_vectorizer`` and scored by ``loaded_model``.

    Args:
        text: The raw text to classify.

    Returns:
        ``"Sustainable"`` when the predicted class is 1, otherwise
        ``"Not Sustainable"``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = loaded_vectorizer.transform([clean_text(text)])
    predicted_class = loaded_model.predict(features)[0]
    if predicted_class == 1:
        return "Sustainable"
    return "Not Sustainable"

# Example usage: classify one short product description and print the label,
# which is either "Sustainable" or "Not Sustainable"
new_text = 'cotton tshirt'
predicted_label = predict_sustainability(new_text)
print(predicted_label)


Sustainable
