**Load Dataset**

In [None]:
#Load Dataset
import pandas as pd
df = pd.read_csv('AI_Project_ecommerceDataset.csv')
# Preview only the first rows, as the cell's last expression, so the notebook
# renders the frame with its rich display instead of a plain-text print.
df.head()

**Clean Dataset**

In [None]:
# Report the number of missing values per column before filling them. The
# counts are printed explicitly because a notebook only displays a cell's
# last expression automatically.
print(df.isnull().sum())
# Replace missing descriptions with empty strings.
df['description'] = df['description'].fillna('')
df.head()

**Labels Information**

In [None]:
# Distinct category values (pandas returns them in order of first appearance)
# and how many distinct non-missing values there are.
labels = df['category'].unique()
labels_count = df['category'].nunique()
print("Total Label =", labels_count)
print("\n")
print("Labels =", labels)

**Preprocessing**

In [None]:
!pip install spacy

In [None]:
import spacy
import string

# preprocess_text only reads token text, lemmas and stop-word flags, so the
# dependency parser and named-entity recognizer are switched off to make
# processing the full dataset faster.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Normalize a text into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words are dropped, every remaining token is replaced by its lower-cased
    lemma, and only purely alphabetic lemmas are kept (which also removes
    punctuation, numbers and whitespace tokens).

    Args:
        text: The raw text to clean. Empty or non-string values (for example
            a missing value) yield an empty string.

    Returns:
        str: The kept lemmas joined by single spaces.
    """
    if not isinstance(text, str) or not text:
        return ''

    # Filter and lemmatize in a single pass so that every filter is applied
    # to the tokens that actually end up in the returned string.
    lemmas = (token.lemma_.lower() for token in nlp(text) if not token.is_stop)
    return ' '.join(lemma for lemma in lemmas if lemma.isalpha())

In [None]:
# The dataset has 50425 rows and every description goes through the spaCy
# pipeline, so this cell takes a long time to run.
df['description'] = df['description'].map(preprocess_text)

**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the category -> integer mapping first, then replace the column with
# the integer codes the encoder assigns.
label_encoder = LabelEncoder().fit(df['category'])
df['category'] = label_encoder.transform(df['category'])
df['category']

**Feature Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Target vector: the category column.
Y = df['category']
# Feature matrix: TF-IDF weights fitted on, and computed for, every description.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['description'])
for part in (X, Y):
    print(part.shape)

**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratifying on Y keeps the class proportions the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# The 'saga' solver shuffles the data randomly; fixing random_state makes the
# fitted model and its accuracy reproducible between runs.
lr = LogisticRegression(solver='saga', random_state=42)
lr = lr.fit(X_train, Y_train)
lr_pred = lr.predict(X_test)
print(accuracy_score(Y_test, lr_pred))

In [None]:
# Train-Test Accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
print("\t***Train-Test Accuracy***")
print("\n")
# Scores noted per solver (presumably test accuracy from earlier runs):
# lbfgs=0.9683, newton-cg=0.9683, sag=0.9683, liblinear=0.964, saga=0.9685
# 'saga' shuffles the data randomly, so random_state is fixed; without it,
# differences in the fourth decimal place are not reproducible.
LR = LogisticRegression(solver='saga', random_state=42)
LR_Model = LR.fit(X_train, Y_train)
LR_prediction = LR_Model.predict(X_test)
print("Logistic Regression Train Accuracy :", accuracy_score(Y_train, LR_Model.predict(X_train)))
print("Logistic Regression Test Accuracy  :", accuracy_score(Y_test, LR_prediction))

In [None]:
# Train-test Classification Report
from yellowbrick.classifier import ClassificationReport
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
print("\t***Train-Test Classification Report***")
print("\n")
print(classification_report(Y_test, LR_prediction))
print("\n")
print("\t***Train-Test Classification Report Display***")
print("\n")
plt.figure(figsize=(4, 2))
# Visualize the already-fitted model that produced LR_prediction instead of
# training a second one, so the plot and the printed report describe the same
# classifier. is_fitted=True tells yellowbrick not to refit it in fit().
viz = ClassificationReport(LR_Model, cmap='Oranges', is_fitted=True)
viz.fit(X_train, Y_train)
viz.score(X_test, Y_test)
viz.show()

In [None]:
# Train-Test Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
print("\t***Train-Test Confusion Matrix***")
print("\n")
# Computed once and stored under its own name so that the LR estimator is not
# overwritten by an array.
lr_conf_matrix = confusion_matrix(Y_test, LR_prediction)
print(lr_conf_matrix)
print("\n")
# Display
print("\t***Train-Test Confusion Matrix Display***")
print("\n")
# The matrix rows/columns follow the sorted encoded labels, which is the order
# of label_encoder.classes_. `labels` comes from unique() and is in order of
# first appearance, so it can put the wrong names on the axes.
fig, ax = plot_confusion_matrix(
    conf_mat=lr_conf_matrix,
    cmap='Oranges',
    class_names=label_encoder.classes_,
    figsize=(3, 3),
)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# 10 Fold Cross Validation Accuracy
from sklearn.model_selection import cross_val_score, cross_val_predict
print("\t***Cross Validation Accuracy***")
print("\n")
# 'saga' shuffles the data randomly; a fixed random_state keeps the per-fold
# scores reproducible between runs.
LR = LogisticRegression(solver='saga', random_state=42)
scores = cross_val_score(LR, X, Y, cv=10)  # cv is the number of folds (K)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# 10 CV Classification Report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report
# Heading followed by the same blank lines as two separate print calls.
print("\t***Cross validation Classification Report***", "\n", sep="\n")
# Out-of-fold predictions from 10-fold cross-validation, scored against Y.
predicted = cross_val_predict(LR, X, Y, cv=10)
print(classification_report(Y, predicted))

In [None]:
# 10 CV Confusion Matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
import matplotlib.pyplot as plt
import seaborn as sns
print("\t***Cross validation Confusion Matrix***", "\n", sep="\n")
# Confusion matrix of the out-of-fold predictions from 10-fold cross-validation.
predicted = cross_val_predict(LR, X, Y, cv=10)
conf_matrix = confusion_matrix(Y, predicted)
print(conf_matrix, "\n", sep="\n")
# Display
print("\t***Cross validation Confusion Matrix Display***", "\n", sep="\n")
# Explicit figure/axes interface: the heatmap and its labels target `ax`.
fig, ax = plt.subplots(figsize=(4, 2))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()