In [None]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Location of the Excel workbook that holds the review dataset, relative to
# the notebook's working directory.
DATA_PATH = "Book1.xlsx"

# Read the workbook into a DataFrame; the cells below refer to it as `data`.
data = pd.read_excel(DATA_PATH)

# Bare final expression so Jupyter renders the frame as the cell output.
data

In [None]:
# TF-IDF features: turn every processed review into a row of TF-IDF weights
# and append those columns to the original data.

# Step 1: Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,  # Vocabulary is capped at the 1000 most frequent terms; adjust as needed
    stop_words="english"  # Remove common English stopwords
)

# Step 2: Fit and transform the text data.
# Blank cells in the workbook are read back as NaN and purely numeric cells as
# numbers; TfidfVectorizer rejects both, so every entry is normalised to a
# string first. A missing review becomes an empty document (an all-zero row).
review_texts = data['processed_reviews'].fillna("").astype(str)
tfidf_matrix = tfidf_vectorizer.fit_transform(review_texts)

# Convert the sparse matrix to a dense matrix
tfidf_dense_matrix = tfidf_matrix.toarray()

# Create a DataFrame for the dense TF-IDF matrix: one column per vocabulary
# term. Reusing data's index guarantees that the column-wise concat below
# pairs each TF-IDF row with the review it was computed from, even if `data`
# does not carry the default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_dense_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=data.index,
)

# Concatenate the original data and the TF-IDF matrix along the columns.
# NOTE(review): a vocabulary term that equals an existing column name yields
# duplicate column labels in final_data; selecting that label later returns
# both columns.
final_data = pd.concat([data, tfidf_df], axis=1)

# Now the final_data DataFrame contains your original data plus the TF-IDF vectorized data
final_data

In [None]:

# Train a linear SVM on the TF-IDF columns and evaluate it on a held-out 20% split.

# Step 1: Split the data into training and testing sets
# NOTE(review): the TF-IDF columns in final_data come from a vectorizer fitted
# on every row, so its vocabulary and IDF weights were computed with the test
# rows included. Fitting the vectorizer on the training rows only would give a
# cleaner estimate of out-of-sample performance.
train_data, test_data = train_test_split(final_data, test_size=0.2, random_state=42)

# Step 2: Select target and features for training and testing
target_column = "review_tag"
features_columns = tfidf_vectorizer.get_feature_names_out()  # Columns from the TF-IDF matrix

X_train = train_data[features_columns]
y_train = train_data[target_column]

X_test = test_data[features_columns]
y_test = test_data[target_column]

# Step 3: Choose and initialize the SVM model
svm_model = SVC(kernel="linear")  # You can adjust the kernel and other hyperparameters

# Step 4: Train and evaluate the SVM model
svm_model.fit(X_train, y_train)
predictions = svm_model.predict(X_test)

# Evaluate the model using classification report
report = classification_report(y_test, predictions)

# Compute confusion matrix with an explicit label order (every class seen in
# the test targets or the predictions, sorted) so the heatmap ticks below are
# guaranteed to name the right rows and columns.
class_labels = sorted(set(y_test).union(predictions))
conf_matrix = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a heatmap using seaborn; rows are actual classes, columns predicted.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# Calculate accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, predictions)

# Print the results. The report goes through an f-string rather than a second
# print argument: print's separator would put a space in front of the report's
# header line and shift it out of alignment with the rows beneath it.
print("\nAccuracy:", accuracy)
print(f"\nClassification Report:\n{report}")


Interpreting the results:

•	The confusion matrix counts the test predictions by actual class (rows) and predicted class (columns); for a two-class problem its four cells are the true positives, false positives, true negatives, and false negatives.
•	Accuracy is the fraction of test instances that were predicted correctly (a value between 0 and 1; multiply by 100 for a percentage).
•	Precision is the share of the predictions for a class that are correct; recall is the share of a class's actual instances that the model finds.
•	The F1-score is the harmonic mean of precision and recall, so it is high only when both are high.
•	The classification report lists precision, recall, F1-score, and support (the number of test instances) for each class, together with overall averages.

