In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
dataset = '/content/drive/MyDrive/ecommerceDataset.csv'

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

In [4]:
df = pd.read_csv(dataset, names = ['category','desc'])


In [5]:
df.isnull().sum()

category    0
desc        1
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df['category'] = df['category'].replace('Clothing & Accessories', 'ClothingAndAccessories')
df['combined_columns'] = df.apply(lambda row: '_'.join([str(row['category']), str(row['desc'])]), axis=1)


In [8]:
df.head()

Unnamed: 0,category,desc,combined_columns
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,Household_Paper Plane Design Framed Wall Hangi...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...","Household_SAF 'Floral' Framed Painting (Wood, ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,Household_SAF 'UV Textured Modern Art Print Fr...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",Household_SAF Flower Print Framed Painting (Sy...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,Household_Incredible Gifts India Wooden Happy ...


In [9]:
category_counts = df['category'].value_counts()
print(category_counts)

Household                 19313
Books                     11820
Electronics               10621
ClothingAndAccessories     8670
Name: category, dtype: int64


In [10]:
df1 = df[df['category'] == 'Household'].sample(n=8670)
df2 = df[df['category'] == 'Books'].sample(n=8670)
df3 = df[df['category'] == 'Electronics'].sample(n=8670)
df4 = df[df['category'] == 'ClothingAndAccessories'].sample(n=8670)

In [11]:
df = pd.concat([df1, df2, df3, df4], axis=0)
df.head()

Unnamed: 0,category,desc,combined_columns
8976,Household,EverEx™ Adjustable pan and Pot Rack Holder Dis...,Household_EverEx™ Adjustable pan and Pot Rack ...
6560,Household,Fresh From Loom 500 Tc Chenille diven Sets 8 p...,Household_Fresh From Loom 500 Tc Chenille dive...
9881,Household,KINSHIP INDIA Microwave Safe Stainless Steel S...,Household_KINSHIP INDIA Microwave Safe Stainle...
5611,Household,Mandala Décor Hand Printed Pure Cotton Jaipuri...,Household_Mandala Décor Hand Printed Pure Cott...
4864,Household,Art Street - Set of 6 Individual Black Wall Ph...,Household_Art Street - Set of 6 Individual Bla...


In [12]:
from sklearn.utils import shuffle
df = shuffle(df)
df.head()

Unnamed: 0,category,desc,combined_columns
28939,Books,Essentials of Medical Pharmacology About the A...,Books_Essentials of Medical Pharmacology About...
49643,Electronics,Captcha Compact and Lightweight Pocket Size Lo...,Electronics_Captcha Compact and Lightweight Po...
4853,Household,Sehaz Artworks 'ILoveYou' Wood Pasted Photo Al...,Household_Sehaz Artworks 'ILoveYou' Wood Paste...
1215,Household,PAffy Steel and Fabric Multi-Purpose Shoe Rack...,Household_PAffy Steel and Fabric Multi-Purpose...
25783,Books,"Man's Search for Meaning Review ""An enduring w...","Books_Man's Search for Meaning Review ""An endu..."


In [13]:
category_counts = df['category'].value_counts()
print(category_counts)

Books                     8670
Electronics               8670
Household                 8670
ClothingAndAccessories    8670
Name: category, dtype: int64


In [14]:
df['category'].unique()

array(['Books', 'Electronics', 'Household', 'ClothingAndAccessories'],
      dtype=object)

In [15]:
train, test = train_test_split(df, test_size = 0.25)

In [16]:

import re
with open('train_output.txt', 'w') as file:
    for text in train['desc']:
        # replaces all characters that are not word characters (letters and digits), spaces, or single quotes with a space.
        text = re.sub(r'[^\w\s\']',' ', text)

        # replaces multiple consecutive spaces with a single space
        text = re.sub(' +', ' ', text)

        # strip any leading or trailing whitespace from the text and convert the text to lowercase
        text = text.strip().lower()

        # splits the text into lines (separated by newline characters) and then joins them back together with a space.
        text_without_newline = ' '.join(text.splitlines())

        # words = text_without_newline.split()

        # # Remove stopwords and apply stemming (lemmatization)
        # filtered_text = ' '.join([stemmer.stem(word) for word in words if word not in STOP_WORDS])

        file.write(text_without_newline + '\n')

In this updated example, the splitlines() method is used to split the text into lines, and then join() is used to concatenate the lines back into a single string, separating them with a space. This effectively removes the newline characters (\n) within the text while preserving the line breaks at the end of each observation.

Please give this code a try, and the resulting "output.txt" file should have newline characters removed within the text while maintaining the line breaks at the end of each observation.

In [17]:
with open('train_label.txt', 'w') as file:
    for text in train['category']:
        # replaces all characters that are not word characters (letters and digits), spaces, or single quotes with a space.
        text = re.sub(r'[^\w\s\']',' ', text)

        # replaces multiple consecutive spaces with a single space
        text = re.sub(' +', ' ', text)

        # strip any leading or trailing whitespace from the text and convert the text to lowercase
        text = text.strip().lower()

        # splits the text into lines (separated by newline characters) and then joins them back together with a space.
        text_without_newline = ' '.join(text.splitlines())

        # words = text_without_newline.split()

        # # Remove stopwords and apply stemming (lemmatization)
        # filtered_text = ' '.join([stemmer.stem(word) for word in words if word not in STOP_WORDS])

        file.write(text_without_newline + '\n')

In [18]:
with open('test_output.txt', 'w') as file:
    for text in test['desc']:
        # replaces all characters that are not word characters (letters and digits), spaces, or single quotes with a space.
        text = re.sub(r'[^\w\s\']',' ', text)

        # replaces multiple consecutive spaces with a single space
        text = re.sub(' +', ' ', text)

        # strip any leading or trailing whitespace from the text and convert the text to lowercase
        text = text.strip().lower()

        # splits the text into lines (separated by newline characters) and then joins them back together with a space.
        text_without_newline = ' '.join(text.splitlines())

        # words = text_without_newline.split()

        # # Remove stopwords and apply stemming (lemmatization)
        # filtered_text = ' '.join([stemmer.stem(word) for word in words if word not in STOP_WORDS])

        file.write(text_without_newline + '\n')

In [19]:
with open('test_label.txt', 'w') as file:
    for text in test['category']:
        # replaces all characters that are not word characters (letters and digits), spaces, or single quotes with a space.
        text = re.sub(r'[^\w\s\']',' ', text)

        # replaces multiple consecutive spaces with a single space
        text = re.sub(' +', ' ', text)

        # strip any leading or trailing whitespace from the text and convert the text to lowercase
        text = text.strip().lower()

        # splits the text into lines (separated by newline characters) and then joins them back together with a space.
        text_without_newline = ' '.join(text.splitlines())

        # words = text_without_newline.split()

        # # Remove stopwords and apply stemming (lemmatization)
        # filtered_text = ' '.join([stemmer.stem(word) for word in words if word not in STOP_WORDS])

        file.write(text_without_newline + '\n')

In [20]:
df_text = df['combined_columns'].apply(lambda x: str(x))
train_text = train['combined_columns'].apply(lambda x: str(x))
test_text = test['combined_columns'].apply(lambda x: str(x))

In [21]:
df_text.to_csv('/content/drive/MyDrive/df_text.txt', index=False, header=False)
train_text.to_csv('/content/drive/MyDrive/train_text.txt', index=False, header=False)
test_text.to_csv('/content/drive/MyDrive/test_text.txt', index=False, header=False)

In [22]:
train_data = '/content/drive/MyDrive/train_text.txt'
test_data = '/content/drive/MyDrive/test_text.txt'

In [23]:
train_labelsss=train['category'].astype(str)
train_labelsss = train_labelsss.str.replace('\n', '')
train_labelsss.to_csv('train_labelsss.txt', index=False, header=False)

tain_texttt = train['desc'].astype(str)
tain_texttt = tain_texttt.str.replace('\n', '')
tain_texttt.to_csv('tain_texttt.txt', index=False, header=False)

test_labelsss=test['category'].astype(str)
test_labelsss = test_labelsss.str.replace('\n', '')
test_labelsss.to_csv('test_labelsss.txt', index=False, header=False)

test_texttt = test['desc'].astype(str)
test_texttt = test_texttt.str.replace('\n', '')
test_texttt.to_csv('test_texttt.txt', index=False, header=False)

In [24]:
train_text = '/content/train_output.txt'
test_text = '/content/test_output.txt'
train_label = '/content/train_label.txt'
test_label = '/content/test_label.txt'

In [25]:
X, y = [], []
with open(train_text, "r") as infile:
    for line in infile:
        X.append(line.split())

X= np.array(X)
print ("total examples %s" % len(X))

with open(train_label, "r") as infile:
    for line in infile:
        y.append(line)

y= np.array(y)
print ("total examples %s" % len(y))

total examples 26010
total examples 26010


  X= np.array(X)


In [26]:
X_test, y_test = [], []
with open(test_text, "r") as infile:
    for line in infile:
        X_test.append(line.split())

X_test= np.array(X_test)
print ("total examples %s" % len(X_test))

with open(test_label, "r") as infile:
    for line in infile:
        y_test.append(line)

y_test= np.array(y_test)
print ("total examples %s" % len(y_test))

total examples 8670
total examples 8670


  X_test= np.array(X_test)


In [27]:
y_tr =[]
for i in y:
  a = i[:-1]
  y_tr.append(a)

y_te =[]
for i in y_test:
  a = i[:-1]
  y_te.append(a)

In [28]:
import string
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

In [29]:
def sent_vec(sent,w2v):
    vector_size = w2v.vector_size
    wv_res = np.zeros(vector_size)
    ctr = 1
    for w in sent:
        if w in w2v:
            ctr += 1
            wv_res += w2v[w]
    wv_res = wv_res/ctr
    return wv_res

In [30]:
# import gensim.downloader as api
# wv = api.load('fasttext-wiki-news-subwords-300')

# import pickle
# # Specify the path for saving the KeyedVectors object
# save_path = '/content/drive/MyDrive/api_file.pkl'
# # Save the KeyedVectors object
# with open(save_path, 'wb') as file:
#     pickle.dump(wv, file)


import pickle
# Specify the path of the pickle file
load_path = '/content/drive/MyDrive/api_file.pkl'
# Load the KeyedVectors object
with open(load_path, 'rb') as file:
    wv = pickle.load(file)

In [31]:
train_doc_vecs = []
for doc in X:
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec(doc_words,wv))

In [32]:
test_doc_vecs = []
for doc in X_test:
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec(doc_words,wv))

In [33]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train_doc_vecs,y_tr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
from sklearn.metrics import accuracy_score
pred_train_ys = logreg.predict(train_doc_vecs)
pred_test_ys = logreg.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y_tr))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_te))

Train accuracy:  0.9269511726259131
Test accuracy:  0.9231833910034603


In [35]:
print(classification_report(pred_test_ys, y_te))


                        precision    recall  f1-score   support

                 books       0.93      0.94      0.94      2159
clothingandaccessories       0.95      0.95      0.95      2166
           electronics       0.90      0.93      0.92      2090
             household       0.91      0.87      0.89      2255

              accuracy                           0.92      8670
             macro avg       0.92      0.92      0.92      8670
          weighted avg       0.92      0.92      0.92      8670



In [37]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(train_doc_vecs, y)

In [38]:
pred_train_ys = clf.predict(train_doc_vecs)
pred_test_ys = clf.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_test))

Train accuracy:  0.9570549788542868
Test accuracy:  0.941522491349481


In [54]:
print(classification_report(pred_test_ys, y_test))


                         precision    recall  f1-score   support

                 books
       0.95      0.96      0.95      2163
clothingandaccessories
       0.97      0.96      0.96      2176
           electronics
       0.92      0.94      0.93      2112
             household
       0.93      0.90      0.92      2219

               accuracy                           0.94      8670
              macro avg       0.94      0.94      0.94      8670
           weighted avg       0.94      0.94      0.94      8670



In [40]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [41]:
# ! pip install fasttext

In [42]:
df.category='__label__'+df.category.astype(str)
df.head(5)

Unnamed: 0,category,desc,combined_columns
28939,__label__Books,Essentials of Medical Pharmacology About the A...,Books_Essentials of Medical Pharmacology About...
49643,__label__Electronics,Captcha Compact and Lightweight Pocket Size Lo...,Electronics_Captcha Compact and Lightweight Po...
4853,__label__Household,Sehaz Artworks 'ILoveYou' Wood Pasted Photo Al...,Household_Sehaz Artworks 'ILoveYou' Wood Paste...
1215,__label__Household,PAffy Steel and Fabric Multi-Purpose Shoe Rack...,Household_PAffy Steel and Fabric Multi-Purpose...
25783,__label__Books,"Man's Search for Meaning Review ""An enduring w...","Books_Man's Search for Meaning Review ""An endu..."


In [43]:
df['cat_desc']=df.category+" "+df.desc


In [45]:
df['cat_desc']=df.cat_desc.apply(lambda text: preprocess(text))


In [46]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.25)

In [47]:
train.to_csv("ecommerce.train", columns=["cat_desc"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["cat_desc"], index=False, header=False)

In [57]:
import fasttext
model=fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

(8667, 0.9621553017191646, 0.9621553017191646)