In [None]:
import pandas as pd

In [None]:
# Location of the training split (raw CSV hosted on GitHub)
TRAIN_CSV_URL = "https://raw.githubusercontent.com/insiya24/dataset/train.csv"
df = pd.read_csv(TRAIN_CSV_URL)

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame
df.shape

(100000, 6)

In [None]:
# Count the missing values in every column
df.isna().sum()

PRODUCT_ID             0
TITLE                  1
BULLET_POINTS      37188
DESCRIPTION        51570
PRODUCT_TYPE_ID        0
PRODUCT_LENGTH         0
dtype: int64

In [None]:
# DataFrame.fillna returns a new frame instead of modifying `df` in place,
# so the result must be assigned back for the placeholder to be kept.
df = df.fillna("not mentioned")

df.shape

(100000, 6)

In [None]:
import string
import nltk
# Fetch the NLTK resources used by the preprocessing below:
# tokenizer models for word_tokenize
nltk.download('punkt')
# stop-word lists for stopwords.words('english')
nltk.download('stopwords')
# WordNet corpus backing WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 


# Translation table that deletes ASCII punctuation; constant, so build it once
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the stop-word set and lemmatizer are built only once
# instead of on every one of the (hundreds of thousands of) calls
_NLTK_CACHE = {}


def _nltk_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _NLTK_CACHE:
        _NLTK_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_CACHE['stop_words'], _NLTK_CACHE['lemmatizer']


# Define a function to preprocess and tokenize the text
def preprocess_text(text, missing_placeholder="not mentioned"):
    """Normalise one text value into a space-separated string of tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize, drop English stop
    words, lemmatize each remaining token.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str``.
    missing_placeholder : str, default "not mentioned"
        Text substituted for missing values (``None`` or float NaN) before
        cleaning, so they do not turn into the literal tokens "none"/"nan".

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        text = missing_placeholder
    # Lowercase the text and remove punctuation
    text = str(text).lower().translate(_PUNCT_TABLE)
    # Tokenize the text
    tokens = word_tokenize(text)
    stop_words, lemmatizer = _nltk_resources()
    # Remove stop words, lemmatize what is left and return a single string
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Clean each text column separately, then join the three results (space-separated) into TOKENS
title_tokens = df['TITLE'].apply(preprocess_text)
bullet_tokens = df['BULLET_POINTS'].apply(preprocess_text)
description_tokens = df['DESCRIPTION'].apply(preprocess_text)
df['TOKENS'] = title_tokens + ' ' + bullet_tokens + ' ' + description_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Inspect the combined, cleaned text column
df['TOKENS']

0        artzfolio tulip flower blackout curtain door w...
1        mark spencer girl pyjama set t862561cnavy mix9...
2        priknik horn red electric air horn compressor ...
3        alishah woman cotton ankle length legging comb...
4        united empire loyalist chronicle great migrati...
                               ...                        
99995    soffe girl big core tank top surf blue small g...
99996    governance regulation international finance pr...
99997    interfaith alternative embracing spiritual div...
99998                               midnight rider nan nan
99999    muellery recycling bag recycle box bin waterpr...
Name: TOKENS, Length: 100000, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Perform TF-IDF encoding on the tokens column
# max_features=250 caps the vocabulary at 250 terms, so `x` has 250 columns.
# NOTE(review): the vectorizer is fitted on every row before the
# train/validation split, so validation text contributes to the vocabulary
# and IDF weights - confirm this is acceptable.
tfidf = TfidfVectorizer(max_features=250)
x = tfidf.fit_transform(df['TOKENS'])


# Regression target
y = df['PRODUCT_LENGTH']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


# Hold out 22.5% of the rows for validation; the fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.225, random_state=42)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping


# Stop training once the monitored metric (val_loss by default) stops improving.
# Keras reports the MAPE loss in percent, so min_delta=1 is one percentage point.
early_stopping = EarlyStopping(
    min_delta=1, # minimum amount of change to count as an improvement
    patience=5, # how many epochs to wait before stopping
    restore_best_weights=True,
)

# Width of each hidden Dense layer, in order
HIDDEN_UNITS = [2048, 2048, 2048, 1024, 1024, 1024, 1024, 1024]

# Declare the input explicitly as the first entry. An `input_shape` argument on
# a layer that is not first in a Sequential model is ignored, which left the
# model unbuilt until the first call to fit().
model_layers = [keras.Input(shape=(250,))]  # 250 = TF-IDF max_features
# Every hidden stage is BatchNormalization -> Dense(relu) -> Dropout(0.3)
for units in HIDDEN_UNITS:
    model_layers += [
        layers.BatchNormalization(),
        layers.Dense(units, activation='relu'),
        layers.Dropout(0.3),
    ]
# Regression head: one linear output unit
model_layers += [
    layers.BatchNormalization(),
    layers.Dense(1),
]

model = keras.Sequential(model_layers)

# Compile the model
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')

In [None]:
# Train the model
# Keep the returned History object: it holds the per-epoch loss/val_loss
# values, and assigning it avoids leaving a bare object repr as the cell output.
# The sparse TF-IDF matrices are densified because the network is fed dense arrays.
history = model.fit(x_train.toarray(), y_train,
                    epochs=10,
                    batch_size=256,
                    validation_data=(x_val.toarray(), y_val),
                    callbacks=[early_stopping],
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fed580fdca0>

In [None]:
# Score the held-out validation rows with the trained network
y_pred = model.predict(x_val.toarray())

# Compare the predictions with the true lengths
# (scikit-learn returns MAPE as a fraction, not a percentage)
mape = mean_absolute_percentage_error(y_true=y_val, y_pred=y_pred)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Percentage Error: 0.8918938405594382


In [None]:
# Inspect the raw validation predictions (one row per validation sample)
y_pred

array([[708.4234 ],
       [447.21732],
       [446.8015 ],
       ...,
       [576.57056],
       [536.9218 ],
       [542.58405]], dtype=float32)

In [None]:
test_df = pd.read_csv("https://raw.githubusercontent.com/insiya24/dataset/test.csv")

# Apply the preprocessing function to each column of text and store the union of tokens in a new column
test_df['TOKENS'] = test_df['TITLE'].apply(preprocess_text) + ' ' + test_df['BULLET_POINTS'].apply(preprocess_text) + ' ' + test_df['DESCRIPTION'].apply(preprocess_text)

# Encode the test text with the vectorizer that was fitted on the training data.
# Fitting a new TfidfVectorizer here would select a different vocabulary and
# IDF weights, so the 250 columns would no longer mean what the model learned.
x_test = tfidf.transform(test_df['TOKENS'])

y_test = test_df['PRODUCT_LENGTH']

y_test_pred = model.predict(x_test.toarray())

# Calculate the mean absolute percentage error on the test set
mape = mean_absolute_percentage_error(y_test, y_test_pred)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Percentage Error: 1.1383301312039371


In [None]:
# Show the model's prediction for one example row of the test set
sample_index = 26
print('The predicted Lenght ', y_test_pred[sample_index])

The predicted Lenght  [449.68082]
